KeyLLM

A minimal method for keyword extraction with Large Language Models (LLM).

The keyword extraction is done by simply asking the LLM to extract a number of keywords from a single piece of text.

Source code in `keybert/_llm.py`:

````python
class KeyLLM:
    """A minimal method for keyword extraction with Large Language Models (LLM).

    The keyword extraction is done by simply asking the LLM to extract a
    number of keywords from a single piece of text.
    """

    def __init__(self, llm):
        """KeyBERT initialization.

        Arguments:
            llm: The Large Language Model to use
        """
        self.llm = llm

    def extract_keywords(
        self,
        docs: Union[str, List[str]],
        check_vocab: bool = False,
        candidate_keywords: List[List[str]] = None,
        threshold: float = None,
        embeddings=None,
    ) -> Union[List[str], List[List[str]]]:
        """Extract keywords and/or keyphrases.

        To get the biggest speed-up, make sure to pass multiple documents
        at once instead of iterating over a single document.

        NOTE: The resulting keywords are expected to be comma-separated, so
        any changes to the prompt need to ensure that the output remains
        comma-separated.

        Arguments:
            docs: The document(s) for which to extract keywords/keyphrases
            check_vocab: Only return keywords that appear exactly in the documents
            candidate_keywords: Candidate keywords for each document
            threshold: Minimum similarity value between 0 and 1 used to decide how similar documents need to be to receive the same keywords.
            embeddings: The embeddings of each document.

        Returns:
            keywords: The top keywords for each input document.

        Usage:

        To extract keywords from a single document:

        ```python
        import openai
        from keybert.llm import OpenAI
        from keybert import KeyLLM

        # Create your LLM
        client = openai.OpenAI(api_key=MY_API_KEY)
        llm = OpenAI(client)

        # Load it in KeyLLM
        kw_model = KeyLLM(llm)

        # Extract keywords
        document = "The website mentions that it only takes a couple of days to deliver but I still have not received mine."
        keywords = kw_model.extract_keywords(document)
        ```
        """
        # Check for a single, empty document
        if isinstance(docs, str):
            if docs:
                docs = [docs]
            else:
                return []

        if HAS_SBERT and threshold is not None and embeddings is not None:
            # Find similar documents
            clusters = util.community_detection(embeddings, min_community_size=2, threshold=threshold)
            in_cluster = set([cluster for cluster_set in clusters for cluster in cluster_set])
            out_cluster = set(list(range(len(docs)))).difference(in_cluster)

            # Extract keywords for all documents not in a cluster
            if out_cluster:
                selected_docs = [docs[index] for index in out_cluster]
                if candidate_keywords is not None:
                    selected_keywords = [candidate_keywords[index] for index in out_cluster]
                else:
                    selected_keywords = None
                out_cluster_keywords = self.llm.extract_keywords(
                    selected_docs,
                    selected_keywords,
                )
                out_cluster_keywords = {index: words for words, index in zip(out_cluster_keywords, out_cluster)}

            # Extract keywords for only the first document in a cluster
            if in_cluster:
                selected_docs = [docs[cluster[0]] for cluster in clusters]
                if candidate_keywords is not None:
                    selected_keywords = [candidate_keywords[cluster[0]] for cluster in clusters]
                else:
                    selected_keywords = None
                in_cluster_keywords = self.llm.extract_keywords(selected_docs, selected_keywords)
                in_cluster_keywords = {
                    doc_id: in_cluster_keywords[index] for index, cluster in enumerate(clusters) for doc_id in cluster
                }

            # Update out cluster keywords with in cluster keywords
            if out_cluster:
                if in_cluster:
                    out_cluster_keywords.update(in_cluster_keywords)
                keywords = [out_cluster_keywords[index] for index in range(len(docs))]
            else:
                keywords = [in_cluster_keywords[index] for index in range(len(docs))]
        else:
            # Extract keywords using a Large Language Model (LLM)
            keywords = self.llm.extract_keywords(docs, candidate_keywords)

        # Only extract keywords that appear in the input document
        if check_vocab:
            updated_keywords = []
            for keyword_set, document in zip(keywords, docs):
                updated_keyword_set = []
                for keyword in keyword_set:
                    if keyword in document:
                        updated_keyword_set.append(keyword)
                updated_keywords.append(updated_keyword_set)
            return updated_keywords

        return keywords
````

__init__(llm)

KeyLLM initialization.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `llm` | | The Large Language Model to use | required |
Source code in `keybert/_llm.py`:

````python
def __init__(self, llm):
    """KeyBERT initialization.

    Arguments:
        llm: The Large Language Model to use
    """
    self.llm = llm
````

extract_keywords(docs, check_vocab=False, candidate_keywords=None, threshold=None, embeddings=None)

Extract keywords and/or keyphrases.

To get the biggest speed-up, make sure to pass multiple documents at once instead of iterating over a single document.

NOTE: The resulting keywords are expected to be comma-separated, so any changes to the prompt need to ensure that the output remains comma-separated.
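Because of this, a custom prompt should end by explicitly requesting comma-separated output. A minimal sketch, assuming the `prompt` parameter and `[DOCUMENT]` placeholder of `keybert.llm.OpenAI`; the prompt wording itself is illustrative:

```python
import openai
from keybert.llm import OpenAI
from keybert import KeyLLM

# Any custom prompt must still ask for comma-separated keywords;
# [DOCUMENT] is replaced with each input text
prompt = """
I have the following document:
[DOCUMENT]

Extract at most five keywords from this document.
Only return the keywords, separated by commas:
"""

client = openai.OpenAI(api_key=MY_API_KEY)
llm = OpenAI(client, prompt=prompt)
kw_model = KeyLLM(llm)
```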

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `docs` | `Union[str, List[str]]` | The document(s) for which to extract keywords/keyphrases | required |
| `check_vocab` | `bool` | Only return keywords that appear exactly in the documents | `False` |
| `candidate_keywords` | `List[List[str]]` | Candidate keywords for each document | `None` |
| `threshold` | `float` | Minimum similarity value between 0 and 1 used to decide how similar documents need to be to receive the same keywords | `None` |
| `embeddings` | | The embeddings of each document | `None` |
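For example, `candidate_keywords` can steer the model towards preferred terms, and `check_vocab=True` filters out anything the LLM returns that is not an exact substring of the document. A minimal sketch (the client setup mirrors the usage example below; `MY_API_KEY` and the candidate values are illustrative):

```python
import openai
from keybert.llm import OpenAI
from keybert import KeyLLM

client = openai.OpenAI(api_key=MY_API_KEY)
kw_model = KeyLLM(OpenAI(client))

document = "The website mentions that it only takes a couple of days to deliver but I still have not received mine."

# Candidate keywords per document guide the LLM towards preferred terms
candidates = [["delivery", "website", "shipping delay"]]
keywords = kw_model.extract_keywords([document], candidate_keywords=candidates)

# check_vocab=True drops any keyword that does not appear verbatim in the text
keywords = kw_model.extract_keywords([document], check_vocab=True)
```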

Returns:

| Name | Type | Description |
|------|------|-------------|
| `keywords` | `Union[List[str], List[List[str]]]` | The top keywords for each input document |

Usage:

To extract keywords from a single document:

```python
import openai
from keybert.llm import OpenAI
from keybert import KeyLLM

# Create your LLM
client = openai.OpenAI(api_key=MY_API_KEY)
llm = OpenAI(client)

# Load it in KeyLLM
kw_model = KeyLLM(llm)

# Extract keywords
document = "The website mentions that it only takes a couple of days to deliver but I still have not received mine."
keywords = kw_model.extract_keywords(document)
```
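When many documents are passed, near-duplicates can share keywords: with `sentence-transformers` installed, precomputed embeddings plus a `threshold` let KeyLLM cluster similar documents and query the LLM once per cluster. A minimal sketch, reusing `kw_model` from the example above; the model name and documents are illustrative:

```python
from sentence_transformers import SentenceTransformer

docs = [
    "The website mentions that it only takes a couple of days to deliver but I still have not received mine.",
    "I received my package a week late, even though the site promised delivery within days.",
    "Although the package was damaged, the customer service was incredibly helpful.",
]

# Embed the documents once up front (model choice is illustrative)
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(docs, convert_to_tensor=True)

# Documents whose similarity exceeds the threshold are grouped together;
# keywords are extracted once per group and shared by its members
keywords = kw_model.extract_keywords(docs, embeddings=embeddings, threshold=0.75)
```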
Source code in `keybert/_llm.py`:

````python
def extract_keywords(
    self,
    docs: Union[str, List[str]],
    check_vocab: bool = False,
    candidate_keywords: List[List[str]] = None,
    threshold: float = None,
    embeddings=None,
) -> Union[List[str], List[List[str]]]:
    """Extract keywords and/or keyphrases.

    To get the biggest speed-up, make sure to pass multiple documents
    at once instead of iterating over a single document.

    NOTE: The resulting keywords are expected to be comma-separated, so
    any changes to the prompt need to ensure that the output remains
    comma-separated.

    Arguments:
        docs: The document(s) for which to extract keywords/keyphrases
        check_vocab: Only return keywords that appear exactly in the documents
        candidate_keywords: Candidate keywords for each document
        threshold: Minimum similarity value between 0 and 1 used to decide how similar documents need to be to receive the same keywords.
        embeddings: The embeddings of each document.

    Returns:
        keywords: The top keywords for each input document.

    Usage:

    To extract keywords from a single document:

    ```python
    import openai
    from keybert.llm import OpenAI
    from keybert import KeyLLM

    # Create your LLM
    client = openai.OpenAI(api_key=MY_API_KEY)
    llm = OpenAI(client)

    # Load it in KeyLLM
    kw_model = KeyLLM(llm)

    # Extract keywords
    document = "The website mentions that it only takes a couple of days to deliver but I still have not received mine."
    keywords = kw_model.extract_keywords(document)
    ```
    """
    # Check for a single, empty document
    if isinstance(docs, str):
        if docs:
            docs = [docs]
        else:
            return []

    if HAS_SBERT and threshold is not None and embeddings is not None:
        # Find similar documents
        clusters = util.community_detection(embeddings, min_community_size=2, threshold=threshold)
        in_cluster = set([cluster for cluster_set in clusters for cluster in cluster_set])
        out_cluster = set(list(range(len(docs)))).difference(in_cluster)

        # Extract keywords for all documents not in a cluster
        if out_cluster:
            selected_docs = [docs[index] for index in out_cluster]
            if candidate_keywords is not None:
                selected_keywords = [candidate_keywords[index] for index in out_cluster]
            else:
                selected_keywords = None
            out_cluster_keywords = self.llm.extract_keywords(
                selected_docs,
                selected_keywords,
            )
            out_cluster_keywords = {index: words for words, index in zip(out_cluster_keywords, out_cluster)}

        # Extract keywords for only the first document in a cluster
        if in_cluster:
            selected_docs = [docs[cluster[0]] for cluster in clusters]
            if candidate_keywords is not None:
                selected_keywords = [candidate_keywords[cluster[0]] for cluster in clusters]
            else:
                selected_keywords = None
            in_cluster_keywords = self.llm.extract_keywords(selected_docs, selected_keywords)
            in_cluster_keywords = {
                doc_id: in_cluster_keywords[index] for index, cluster in enumerate(clusters) for doc_id in cluster
            }

        # Update out cluster keywords with in cluster keywords
        if out_cluster:
            if in_cluster:
                out_cluster_keywords.update(in_cluster_keywords)
            keywords = [out_cluster_keywords[index] for index in range(len(docs))]
        else:
            keywords = [in_cluster_keywords[index] for index in range(len(docs))]
    else:
        # Extract keywords using a Large Language Model (LLM)
        keywords = self.llm.extract_keywords(docs, candidate_keywords)

    # Only extract keywords that appear in the input document
    if check_vocab:
        updated_keywords = []
        for keyword_set, document in zip(keywords, docs):
            updated_keyword_set = []
            for keyword in keyword_set:
                if keyword in document:
                    updated_keyword_set.append(keyword)
            updated_keywords.append(updated_keyword_set)
        return updated_keywords

    return keywords
````