PartOfSpeech

Bases: BaseRepresentation

Extract Topic Keywords based on their Part-of-Speech

```python
DEFAULT_PATTERNS = [
    [{'POS': 'ADJ'}, {'POS': 'NOUN'}],
    [{'POS': 'NOUN'}],
    [{'POS': 'ADJ'}]
]
```

From the candidate topics, as extracted with c-TF-IDF, find documents that contain keywords from those candidate topics. These candidate documents then serve as the representative set of documents from which the spaCy model can extract a set of candidate keywords for each topic.

These candidate keywords are first filtered by whether they match the DEFAULT_PATTERNS or a user-defined pattern. The resulting keywords are then sorted by their respective c-TF-IDF values.
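Concretely, the filtering relies on spaCy's rule-based `Matcher`. Below is a minimal sketch of how the default patterns pick out spans, assuming `en_core_web_sm` is installed; the example sentence and the printed matches are illustrative:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
matcher.add("Pattern", [
    [{'POS': 'ADJ'}, {'POS': 'NOUN'}],
    [{'POS': 'NOUN'}],
    [{'POS': 'ADJ'}],
])

doc = nlp("Renewable energy reduces harmful emissions.")
for _, start, end in matcher(doc):
    print(doc[start:end].text)  # e.g. "Renewable energy", "energy", "harmful emissions"
```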

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `model` | `Union[str, Language]` | The spaCy model to use | `'en_core_web_sm'` |
| `top_n_words` | `int` | The top n words to extract | `10` |
| `pos_patterns` | `List[str]` | Patterns for spaCy to use. See https://spacy.io/usage/rule-based-matching | `None` |

Usage:

```python
from bertopic.representation import PartOfSpeech
from bertopic import BERTopic

# Create your representation model
representation_model = PartOfSpeech("en_core_web_sm")

# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model)
```
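Since `model` accepts either a model name or a spaCy `Language` object, you can also pass a pre-loaded pipeline. A minimal sketch; disabling components that POS tagging does not need is an optional speed-up, not required:

```python
import spacy

from bertopic.representation import PartOfSpeech

# Pass a pre-loaded spaCy pipeline instead of a model name.
# Disabling components unused for POS tagging can speed up processing.
nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
representation_model = PartOfSpeech(nlp)
```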

You can define custom POS patterns to be extracted:

```python
pos_patterns = [
    [{'POS': 'ADJ'}, {'POS': 'NOUN'}],
    [{'POS': 'NOUN'}],
    [{'POS': 'ADJ'}]
]
representation_model = PartOfSpeech("en_core_web_sm", pos_patterns=pos_patterns)
```
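Patterns follow spaCy's rule-based matching syntax, so token attributes can be combined with operators. A hypothetical sketch that matches runs of consecutive nouns (compound nouns) or single proper nouns:

```python
# Hypothetical patterns: one or more consecutive nouns (compound nouns),
# or a single proper noun
pos_patterns = [
    [{'POS': 'NOUN', 'OP': '+'}],
    [{'POS': 'PROPN'}]
]
representation_model = PartOfSpeech("en_core_web_sm", pos_patterns=pos_patterns)
```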
Source code in bertopic/representation/_pos.py
class PartOfSpeech(BaseRepresentation):
    """ Extract Topic Keywords based on their Part-of-Speech

    DEFAULT_PATTERNS = [
                [{'POS': 'ADJ'}, {'POS': 'NOUN'}],
                [{'POS': 'NOUN'}],
                [{'POS': 'ADJ'}]
    ]

    From candidate topics, as extracted with c-TF-IDF,
    find documents that contain keywords found in the
    candidate topics. These candidate documents then
    serve as the representative set of documents from
    which the Spacy model can extract a set of candidate
    keywords for each topic.

    These candidate keywords are first judged by whether
    they fall within the DEFAULT_PATTERNS or the user-defined
    pattern. Then, the resulting keywords are sorted by
    their respective c-TF-IDF values.

    Arguments:
        model: The Spacy model to use
        top_n_words: The top n words to extract
        pos_patterns: Patterns for Spacy to use.
                      See https://spacy.io/usage/rule-based-matching

    Usage:

    ```python
    from bertopic.representation import PartOfSpeech
    from bertopic import BERTopic

    # Create your representation model
    representation_model = PartOfSpeech("en_core_web_sm")

    # Use the representation model in BERTopic on top of the default pipeline
    topic_model = BERTopic(representation_model=representation_model)
    ```

    You can define custom POS patterns to be extracted:

    ```python
    pos_patterns = [
                [{'POS': 'ADJ'}, {'POS': 'NOUN'}],
                [{'POS': 'NOUN'}], [{'POS': 'ADJ'}]
    ]
    representation_model = PartOfSpeech("en_core_web_sm", pos_patterns=pos_patterns)
    ```
    """
    def __init__(self,
                 model: Union[str, Language] = "en_core_web_sm",
                 top_n_words: int = 10,
                 pos_patterns: List[str] = None):
        if isinstance(model, str):
            self.model = spacy.load(model)
        elif isinstance(model, Language):
            self.model = model
        else:
            raise ValueError("Make sure that the Spacy model that you"
                             "pass is either a string referring to a"
                             "Spacy model or a Spacy nlp object.")

        self.top_n_words = top_n_words

        if pos_patterns is None:
            self.pos_patterns = [
                [{'POS': 'ADJ'}, {'POS': 'NOUN'}],
                [{'POS': 'NOUN'}], [{'POS': 'ADJ'}]
            ]
        else:
            self.pos_patterns = pos_patterns

    def extract_topics(self,
                       topic_model,
                       documents: pd.DataFrame,
                       c_tf_idf: csr_matrix,
                       topics: Mapping[str, List[Tuple[str, float]]]
                       ) -> Mapping[str, List[Tuple[str, float]]]:
        """ Extract topics

        Arguments:
            topic_model: A BERTopic model
            documents: All input documents
            c_tf_idf: Not used
            topics: The candidate topics as calculated with c-TF-IDF

        Returns:
            updated_topics: Updated topic representations
        """
        matcher = Matcher(self.model.vocab)
        matcher.add("Pattern", self.pos_patterns)

        candidate_topics = {}
        for topic, values in topics.items():
            keywords = list(zip(*values))[0]

            # Extract candidate documents
            candidate_documents = []
            for keyword in keywords:
                selection = documents.loc[documents.Topic == topic, :]
                selection = selection.loc[selection.Document.str.contains(keyword), "Document"]
                if len(selection) > 0:
                    for document in selection[:2]:
                        candidate_documents.append(document)
            candidate_documents = list(set(candidate_documents))

            # Extract keywords
            docs_pipeline = self.model.pipe(candidate_documents)
            updated_keywords = []
            for doc in docs_pipeline:
                matches = matcher(doc)
                for _, start, end in matches:
                    updated_keywords.append(doc[start:end].text)
            candidate_topics[topic] = list(set(updated_keywords))

        # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
        # and will be removed in 1.2. Please use get_feature_names_out instead.
        if version.parse(sklearn_version) >= version.parse("1.0.0"):
            words = list(topic_model.vectorizer_model.get_feature_names_out())
        else:
            words = list(topic_model.vectorizer_model.get_feature_names())

        # Match updated keywords with c-TF-IDF values
        words_lookup = dict(zip(words, range(len(words))))
        updated_topics = {topic: [] for topic in topics.keys()}

        for topic, candidate_keywords in candidate_topics.items():
            word_indices = [words_lookup.get(keyword) for keyword in candidate_keywords if words_lookup.get(keyword) is not None]
            vals = topic_model.c_tf_idf_[:, np.array(word_indices)][topic + topic_model._outliers]
            indices = np.argsort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words:][::-1]
            vals = np.sort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words:][::-1]
            topic_words = [(words[word_indices[index]], val) for index, val in zip(indices, vals)]
            updated_topics[topic] = topic_words
            if len(updated_topics[topic]) < self.top_n_words:
                updated_topics[topic] += [("", 0) for _ in range(self.top_n_words-len(updated_topics[topic]))]

        return updated_topics

extract_topics(topic_model, documents, c_tf_idf, topics)

Extract topics

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `topic_model` | | A BERTopic model | required |
| `documents` | `DataFrame` | All input documents | required |
| `c_tf_idf` | `csr_matrix` | Not used | required |
| `topics` | `Mapping[str, List[Tuple[str, float]]]` | The candidate topics as calculated with c-TF-IDF | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| `updated_topics` | `Mapping[str, List[Tuple[str, float]]]` | Updated topic representations |
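For reference, `topics` and the returned `updated_topics` share the same shape: a mapping from topic id to a list of (keyword, c-TF-IDF value) pairs. An illustrative, made-up example of that mapping (note that the code indexes topics as integers, e.g. via `topic + topic_model._outliers`):

```python
# Illustrative shape of the `topics` / `updated_topics` mapping:
# topic id -> list of (keyword, c-TF-IDF value) pairs
topics = {
    0: [("energy", 0.42), ("renewable", 0.31)],
    1: [("inflation", 0.38), ("economy", 0.27)],
}
```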

Source code in bertopic/representation/_pos.py
def extract_topics(self,
                   topic_model,
                   documents: pd.DataFrame,
                   c_tf_idf: csr_matrix,
                   topics: Mapping[str, List[Tuple[str, float]]]
                   ) -> Mapping[str, List[Tuple[str, float]]]:
    """ Extract topics

    Arguments:
        topic_model: A BERTopic model
        documents: All input documents
        c_tf_idf: Not used
        topics: The candidate topics as calculated with c-TF-IDF

    Returns:
        updated_topics: Updated topic representations
    """
    matcher = Matcher(self.model.vocab)
    matcher.add("Pattern", self.pos_patterns)

    candidate_topics = {}
    for topic, values in topics.items():
        keywords = list(zip(*values))[0]

        # Extract candidate documents
        candidate_documents = []
        for keyword in keywords:
            selection = documents.loc[documents.Topic == topic, :]
            selection = selection.loc[selection.Document.str.contains(keyword), "Document"]
            if len(selection) > 0:
                for document in selection[:2]:
                    candidate_documents.append(document)
        candidate_documents = list(set(candidate_documents))

        # Extract keywords
        docs_pipeline = self.model.pipe(candidate_documents)
        updated_keywords = []
        for doc in docs_pipeline:
            matches = matcher(doc)
            for _, start, end in matches:
                updated_keywords.append(doc[start:end].text)
        candidate_topics[topic] = list(set(updated_keywords))

    # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
    # and will be removed in 1.2. Please use get_feature_names_out instead.
    if version.parse(sklearn_version) >= version.parse("1.0.0"):
        words = list(topic_model.vectorizer_model.get_feature_names_out())
    else:
        words = list(topic_model.vectorizer_model.get_feature_names())

    # Match updated keywords with c-TF-IDF values
    words_lookup = dict(zip(words, range(len(words))))
    updated_topics = {topic: [] for topic in topics.keys()}

    for topic, candidate_keywords in candidate_topics.items():
        word_indices = [words_lookup.get(keyword) for keyword in candidate_keywords if words_lookup.get(keyword) is not None]
        vals = topic_model.c_tf_idf_[:, np.array(word_indices)][topic + topic_model._outliers]
        indices = np.argsort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words:][::-1]
        vals = np.sort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words:][::-1]
        topic_words = [(words[word_indices[index]], val) for index, val in zip(indices, vals)]
        updated_topics[topic] = topic_words
        if len(updated_topics[topic]) < self.top_n_words:
            updated_topics[topic] += [("", 0) for _ in range(self.top_n_words-len(updated_topics[topic]))]

    return updated_topics