# KeyLLM

A minimal method for keyword extraction with Large Language Models (LLMs).
The keyword extraction is done by simply asking the LLM to extract a number of keywords from a single piece of text.
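Under the hood, this amounts to a single instruction to the model. As a rough illustration, the prompt looks something like the sketch below; the exact template is defined by the backend chosen from `keybert.llm`, so treat this as a hypothetical example rather than the shipped default:

```python
# Hypothetical prompt in the spirit of KeyLLM; the real template comes
# from the `keybert.llm` backend you choose.
document = "The website mentions that it only takes a couple of days to deliver."
prompt = (
    "I have the following document:\n"
    f"{document}\n\n"
    "Extract the keywords that best describe this document. "
    "Return them as a single line of comma-separated keywords."
)
```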
Source code in `keybert/_llm.py`
````python
class KeyLLM:
    """
    A minimal method for keyword extraction with Large Language Models (LLMs).

    The keyword extraction is done by simply asking the LLM to extract a
    number of keywords from a single piece of text.
    """

    def __init__(self, llm):
        """KeyLLM initialization

        Arguments:
            llm: The Large Language Model to use
        """
        self.llm = llm

    def extract_keywords(
        self,
        docs: Union[str, List[str]],
        check_vocab: bool = False,
        candidate_keywords: List[List[str]] = None,
        threshold: float = None,
        embeddings=None
    ) -> Union[List[str], List[List[str]]]:
        """Extract keywords and/or keyphrases

        To get the biggest speed-up, make sure to pass multiple documents
        at once instead of iterating over a single document.

        NOTE: The resulting keywords are expected to be comma-separated, so
        any changes to the prompt must make sure that the generated
        keywords remain comma-separated.

        Arguments:
            docs: The document(s) for which to extract keywords/keyphrases
            check_vocab: Only return keywords that appear exactly in the documents
            candidate_keywords: Candidate keywords for each document
            threshold: Minimum similarity between document embeddings for
                       grouping similar documents into a single LLM call
            embeddings: Pre-computed embeddings of the documents

        Returns:
            keywords: The top keywords extracted for each document

        Usage:

        To extract keywords from a single document:

        ```python
        import openai
        from keybert.llm import OpenAI
        from keybert import KeyLLM

        # Create your LLM
        openai.api_key = "sk-..."
        llm = OpenAI()

        # Load it in KeyLLM
        kw_model = KeyLLM(llm)

        # Extract keywords
        document = "The website mentions that it only takes a couple of days to deliver but I still have not received mine."
        keywords = kw_model.extract_keywords(document)
        ```
        """
        # Check for a single, empty document
        if isinstance(docs, str):
            if docs:
                docs = [docs]
            else:
                return []

        if HAS_SBERT and threshold is not None and embeddings is not None:
            # Find groups of similar documents
            clusters = util.community_detection(embeddings, min_community_size=2, threshold=threshold)
            in_cluster = {doc_id for cluster in clusters for doc_id in cluster}
            out_cluster = set(range(len(docs))).difference(in_cluster)

            # Extract keywords for all documents not in a cluster
            if out_cluster:
                selected_docs = [docs[index] for index in out_cluster]
                if candidate_keywords is not None:
                    selected_keywords = [candidate_keywords[index] for index in out_cluster]
                else:
                    selected_keywords = None
                out_cluster_keywords = self.llm.extract_keywords(
                    selected_docs,
                    selected_keywords,
                )
                out_cluster_keywords = {index: words for words, index in zip(out_cluster_keywords, out_cluster)}

            # Extract keywords for only the first document in each cluster
            if in_cluster:
                selected_docs = [docs[cluster[0]] for cluster in clusters]
                if candidate_keywords is not None:
                    selected_keywords = [candidate_keywords[cluster[0]] for cluster in clusters]
                else:
                    selected_keywords = None
                in_cluster_keywords = self.llm.extract_keywords(
                    selected_docs,
                    selected_keywords
                )
                # Propagate each cluster's keywords to every document in that cluster
                in_cluster_keywords = {
                    doc_id: in_cluster_keywords[index]
                    for index, cluster in enumerate(clusters)
                    for doc_id in cluster
                }

            # Merge in-cluster and out-of-cluster keywords back into document order
            if out_cluster:
                if in_cluster:
                    out_cluster_keywords.update(in_cluster_keywords)
                keywords = [out_cluster_keywords[index] for index in range(len(docs))]
            else:
                keywords = [in_cluster_keywords[index] for index in range(len(docs))]
        else:
            # Extract keywords using a Large Language Model (LLM)
            keywords = self.llm.extract_keywords(docs, candidate_keywords)

        # Only keep keywords that appear in the input document
        if check_vocab:
            updated_keywords = []
            for keyword_set, document in zip(keywords, docs):
                updated_keyword_set = []
                for keyword in keyword_set:
                    if keyword in document:
                        updated_keyword_set.append(keyword)
                updated_keywords.append(updated_keyword_set)
            return updated_keywords
        return keywords
````
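The deduplication step above relies on `sentence_transformers.util.community_detection`, which groups embeddings whose cosine similarity exceeds `threshold`; KeyLLM then sends only the first document of each group to the LLM. A minimal sketch of that clustering behavior, assuming `sentence-transformers` is installed:

```python
from sentence_transformers import SentenceTransformer, util

# Illustrative documents; the last two are near-duplicates.
docs = [
    "The weather is lovely today.",
    "My package still has not arrived.",
    "My order still has not been delivered.",
]

model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(docs, convert_to_tensor=True)

# Lists of document indices whose pairwise similarity exceeds the threshold
clusters = util.community_detection(embeddings, min_community_size=2, threshold=0.75)
print(clusters)  # e.g. [[1, 2]]
```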
## `__init__(self, llm)`

KeyLLM initialization
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `llm` | | The Large Language Model to use | *required* |
Source code in `keybert/_llm.py`

```python
def __init__(self, llm):
    """KeyLLM initialization

    Arguments:
        llm: The Large Language Model to use
    """
    self.llm = llm
```
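Any wrapper from `keybert.llm` can be passed as `llm`. As a sketch, one could use a local `transformers` pipeline instead of OpenAI (this assumes the `TextGeneration` backend; `gpt2` is only a small placeholder model):

```python
from transformers import pipeline
from keybert.llm import TextGeneration
from keybert import KeyLLM

# Wrap a local text-generation pipeline as the LLM backend
generator = pipeline("text-generation", model="gpt2")
kw_model = KeyLLM(TextGeneration(generator))
```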
## `extract_keywords(self, docs, check_vocab=False, candidate_keywords=None, threshold=None, embeddings=None)`
Extract keywords and/or keyphrases.

To get the biggest speed-up, make sure to pass multiple documents at once instead of iterating over a single document.

NOTE: The resulting keywords are expected to be comma-separated, so any changes to the prompt must make sure that the generated keywords remain comma-separated.
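For example, a custom prompt for the OpenAI backend could look like the sketch below, where `[DOCUMENT]` is the placeholder KeyBERT replaces with each input document; note how the final instruction keeps the output comma-separated (this assumes the `prompt` argument of `keybert.llm.OpenAI`):

```python
from keybert.llm import OpenAI

# [DOCUMENT] is replaced with each input document
prompt = """
I have the following document:
[DOCUMENT]

Extract the keywords that best describe this document.
Return them as a single line of comma-separated keywords.
"""
llm = OpenAI(prompt=prompt)
```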
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `docs` | `Union[str, List[str]]` | The document(s) for which to extract keywords/keyphrases | *required* |
| `check_vocab` | `bool` | Only return keywords that appear exactly in the documents | `False` |
| `candidate_keywords` | `List[List[str]]` | Candidate keywords for each document | `None` |
| `threshold` | `float` | Minimum similarity between document embeddings for grouping similar documents into a single LLM call | `None` |
| `embeddings` | | Pre-computed embeddings of the documents; used together with `threshold` | `None` |
Returns:

| Type | Description |
|---|---|
| `keywords` | The top keywords extracted for each document |
Usage:

To extract keywords from a single document:

```python
import openai
from keybert.llm import OpenAI
from keybert import KeyLLM

# Create your LLM
openai.api_key = "sk-..."
llm = OpenAI()

# Load it in KeyLLM
kw_model = KeyLLM(llm)

# Extract keywords
document = "The website mentions that it only takes a couple of days to deliver but I still have not received mine."
keywords = kw_model.extract_keywords(document)
```
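To reduce the number of LLM calls on larger collections, pre-compute document embeddings and pass them together with a `threshold`; near-identical documents are then clustered and only one document per cluster is sent to the LLM. A sketch continuing the example above, assuming `sentence-transformers` is installed:

```python
from sentence_transformers import SentenceTransformer

documents = [
    "The website mentions that it only takes a couple of days to deliver but I still have not received mine.",
    "I received my package!",
    "Whereas the package was expected within days, it took weeks before it arrived.",
]

# Pre-compute embeddings once; KeyLLM reuses them to group similar documents
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(documents, convert_to_tensor=True)

keywords = kw_model.extract_keywords(documents, embeddings=embeddings, threshold=0.75)
```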
Source code in `keybert/_llm.py`
````python
def extract_keywords(
    self,
    docs: Union[str, List[str]],
    check_vocab: bool = False,
    candidate_keywords: List[List[str]] = None,
    threshold: float = None,
    embeddings=None
) -> Union[List[str], List[List[str]]]:
    """Extract keywords and/or keyphrases

    To get the biggest speed-up, make sure to pass multiple documents
    at once instead of iterating over a single document.

    NOTE: The resulting keywords are expected to be comma-separated, so
    any changes to the prompt must make sure that the generated
    keywords remain comma-separated.

    Arguments:
        docs: The document(s) for which to extract keywords/keyphrases
        check_vocab: Only return keywords that appear exactly in the documents
        candidate_keywords: Candidate keywords for each document
        threshold: Minimum similarity between document embeddings for
                   grouping similar documents into a single LLM call
        embeddings: Pre-computed embeddings of the documents

    Returns:
        keywords: The top keywords extracted for each document

    Usage:

    To extract keywords from a single document:

    ```python
    import openai
    from keybert.llm import OpenAI
    from keybert import KeyLLM

    # Create your LLM
    openai.api_key = "sk-..."
    llm = OpenAI()

    # Load it in KeyLLM
    kw_model = KeyLLM(llm)

    # Extract keywords
    document = "The website mentions that it only takes a couple of days to deliver but I still have not received mine."
    keywords = kw_model.extract_keywords(document)
    ```
    """
    # Check for a single, empty document
    if isinstance(docs, str):
        if docs:
            docs = [docs]
        else:
            return []

    if HAS_SBERT and threshold is not None and embeddings is not None:
        # Find groups of similar documents
        clusters = util.community_detection(embeddings, min_community_size=2, threshold=threshold)
        in_cluster = {doc_id for cluster in clusters for doc_id in cluster}
        out_cluster = set(range(len(docs))).difference(in_cluster)

        # Extract keywords for all documents not in a cluster
        if out_cluster:
            selected_docs = [docs[index] for index in out_cluster]
            if candidate_keywords is not None:
                selected_keywords = [candidate_keywords[index] for index in out_cluster]
            else:
                selected_keywords = None
            out_cluster_keywords = self.llm.extract_keywords(
                selected_docs,
                selected_keywords,
            )
            out_cluster_keywords = {index: words for words, index in zip(out_cluster_keywords, out_cluster)}

        # Extract keywords for only the first document in each cluster
        if in_cluster:
            selected_docs = [docs[cluster[0]] for cluster in clusters]
            if candidate_keywords is not None:
                selected_keywords = [candidate_keywords[cluster[0]] for cluster in clusters]
            else:
                selected_keywords = None
            in_cluster_keywords = self.llm.extract_keywords(
                selected_docs,
                selected_keywords
            )
            # Propagate each cluster's keywords to every document in that cluster
            in_cluster_keywords = {
                doc_id: in_cluster_keywords[index]
                for index, cluster in enumerate(clusters)
                for doc_id in cluster
            }

        # Merge in-cluster and out-of-cluster keywords back into document order
        if out_cluster:
            if in_cluster:
                out_cluster_keywords.update(in_cluster_keywords)
            keywords = [out_cluster_keywords[index] for index in range(len(docs))]
        else:
            keywords = [in_cluster_keywords[index] for index in range(len(docs))]
    else:
        # Extract keywords using a Large Language Model (LLM)
        keywords = self.llm.extract_keywords(docs, candidate_keywords)

    # Only keep keywords that appear in the input document
    if check_vocab:
        updated_keywords = []
        for keyword_set, document in zip(keywords, docs):
            updated_keyword_set = []
            for keyword in keyword_set:
                if keyword in document:
                    updated_keyword_set.append(keyword)
            updated_keywords.append(updated_keyword_set)
        return updated_keywords
    return keywords
````
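Since LLMs may generate terms that never occur in the text, `check_vocab=True` applies the substring filter shown above:

```python
# Keep only keywords that appear verbatim in the input document
keywords = kw_model.extract_keywords(document, check_vocab=True)
```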