PartOfSpeech

Extract Topic Keywords based on their Part-of-Speech.

```python
DEFAULT_PATTERNS = [
    [{'POS': 'ADJ'}, {'POS': 'NOUN'}],
    [{'POS': 'NOUN'}],
    [{'POS': 'ADJ'}]
]
```

From the candidate topics, as extracted with c-TF-IDF, find documents that contain keywords from those candidate topics. These candidate documents then serve as the representative set of documents from which the spaCy model extracts a set of candidate keywords for each topic.

These candidate keywords are first filtered by whether they match the DEFAULT_PATTERNS or the user-defined patterns. The resulting keywords are then sorted by their respective c-TF-IDF values.
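To make the matching stage concrete, here is a minimal standalone sketch of how spaCy's `Matcher` applies the default patterns to a single document (the printed spans are illustrative, since they depend on the tagger's output):

```python
import spacy
from spacy.matcher import Matcher

# Build a matcher with the default POS patterns
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
matcher.add("Pattern", [
    [{'POS': 'ADJ'}, {'POS': 'NOUN'}],
    [{'POS': 'NOUN'}],
    [{'POS': 'ADJ'}],
])

# Every matching span becomes a candidate keyword
doc = nlp("Deep learning models achieve impressive results.")
for _, start, end in matcher(doc):
    print(doc[start:end].text)
# Illustrative output: Deep, Deep learning, learning, models,
# impressive, impressive results, results
```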

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `model` | `Union[str, spacy.language.Language]` | The spaCy model to use | `'en_core_web_sm'` |
| `top_n_words` | `int` | The top n words to extract | `10` |
| `pos_patterns` | `List[str]` | Patterns for spaCy to use. See https://spacy.io/usage/rule-based-matching | `None` |

Usage:

```python
from bertopic.representation import PartOfSpeech
from bertopic import BERTopic

# Create your representation model
representation_model = PartOfSpeech("en_core_web_sm")

# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model)
```
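After fitting, the POS-filtered keywords appear in the usual topic accessors; `docs` below is a hypothetical list of strings standing in for your corpus:

```python
# Fit on your own corpus (`docs` is a hypothetical list of documents)
topics, probs = topic_model.fit_transform(docs)

# Inspect the filtered keywords and their c-TF-IDF values for topic 0
print(topic_model.get_topic(0))
```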

You can define custom POS patterns to be extracted:

```python
pos_patterns = [
    [{'POS': 'ADJ'}, {'POS': 'NOUN'}],
    [{'POS': 'NOUN'}],
    [{'POS': 'ADJ'}]
]
representation_model = PartOfSpeech("en_core_web_sm", pos_patterns=pos_patterns)
```
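Since `model` also accepts an already-loaded `spacy.language.Language` object, you can pass in a pre-configured pipeline instead of a model name. A minimal sketch, assuming the excluded component names exist in your pipeline (check `nlp.pipe_names`):

```python
import spacy

from bertopic.representation import PartOfSpeech

# Reuse a pipeline loaded elsewhere; excluding components unused for
# POS matching (illustrative names) can speed up document processing
nlp = spacy.load("en_core_web_sm", exclude=["ner", "lemmatizer"])
representation_model = PartOfSpeech(nlp)
```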
Source code in `bertopic/representation/_pos.py`:

````python
class PartOfSpeech(BaseRepresentation):
    """Extract Topic Keywords based on their Part-of-Speech.

    DEFAULT_PATTERNS = [
                [{'POS': 'ADJ'}, {'POS': 'NOUN'}],
                [{'POS': 'NOUN'}],
                [{'POS': 'ADJ'}]
    ]

    From candidate topics, as extracted with c-TF-IDF,
    find documents that contain keywords found in the
    candidate topics. These candidate documents then
    serve as the representative set of documents from
    which the Spacy model can extract a set of candidate
    keywords for each topic.

    These candidate keywords are first judged by whether
    they fall within the DEFAULT_PATTERNS or the user-defined
    pattern. Then, the resulting keywords are sorted by
    their respective c-TF-IDF values.

    Arguments:
        model: The Spacy model to use
        top_n_words: The top n words to extract
        pos_patterns: Patterns for Spacy to use.
                      See https://spacy.io/usage/rule-based-matching

    Usage:

    ```python
    from bertopic.representation import PartOfSpeech
    from bertopic import BERTopic

    # Create your representation model
    representation_model = PartOfSpeech("en_core_web_sm")

    # Use the representation model in BERTopic on top of the default pipeline
    topic_model = BERTopic(representation_model=representation_model)
    ```

    You can define custom POS patterns to be extracted:

    ```python
    pos_patterns = [
                [{'POS': 'ADJ'}, {'POS': 'NOUN'}],
                [{'POS': 'NOUN'}], [{'POS': 'ADJ'}]
    ]
    representation_model = PartOfSpeech("en_core_web_sm", pos_patterns=pos_patterns)
    ```
    """

    def __init__(
        self,
        model: Union[str, Language] = "en_core_web_sm",
        top_n_words: int = 10,
        pos_patterns: List[str] = None,
    ):
        if isinstance(model, str):
            self.model = spacy.load(model)
        elif isinstance(model, Language):
            self.model = model
        else:
            raise ValueError(
                "Make sure that the Spacy model that you "
                "pass is either a string referring to a "
                "Spacy model or a Spacy nlp object."
            )

        self.top_n_words = top_n_words

        if pos_patterns is None:
            self.pos_patterns = [
                [{"POS": "ADJ"}, {"POS": "NOUN"}],
                [{"POS": "NOUN"}],
                [{"POS": "ADJ"}],
            ]
        else:
            self.pos_patterns = pos_patterns

    def extract_topics(
        self,
        topic_model,
        documents: pd.DataFrame,
        c_tf_idf: csr_matrix,
        topics: Mapping[str, List[Tuple[str, float]]],
    ) -> Mapping[str, List[Tuple[str, float]]]:
        """Extract topics.

        Arguments:
            topic_model: A BERTopic model
            documents: All input documents
            c_tf_idf: Not used
            topics: The candidate topics as calculated with c-TF-IDF

        Returns:
            updated_topics: Updated topic representations
        """
        matcher = Matcher(self.model.vocab)
        matcher.add("Pattern", self.pos_patterns)

        candidate_topics = {}
        for topic, values in topics.items():
            keywords = list(zip(*values))[0]

            # Extract candidate documents
            candidate_documents = []
            for keyword in keywords:
                selection = documents.loc[documents.Topic == topic, :]
                selection = selection.loc[selection.Document.str.contains(keyword), "Document"]
                if len(selection) > 0:
                    for document in selection[:2]:
                        candidate_documents.append(document)
            candidate_documents = list(set(candidate_documents))

            # Extract keywords
            docs_pipeline = self.model.pipe(candidate_documents)
            updated_keywords = []
            for doc in docs_pipeline:
                matches = matcher(doc)
                for _, start, end in matches:
                    updated_keywords.append(doc[start:end].text)
            candidate_topics[topic] = list(set(updated_keywords))

        # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
        # and will be removed in 1.2. Please use get_feature_names_out instead.
        if version.parse(sklearn_version) >= version.parse("1.0.0"):
            words = list(topic_model.vectorizer_model.get_feature_names_out())
        else:
            words = list(topic_model.vectorizer_model.get_feature_names())

        # Match updated keywords with c-TF-IDF values
        words_lookup = dict(zip(words, range(len(words))))
        updated_topics = {topic: [] for topic in topics.keys()}

        for topic, candidate_keywords in candidate_topics.items():
            word_indices = np.sort(
                [words_lookup.get(keyword) for keyword in candidate_keywords if keyword in words_lookup]
            )
            vals = topic_model.c_tf_idf_[:, word_indices][topic + topic_model._outliers]
            indices = np.argsort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words :][::-1]
            vals = np.sort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words :][::-1]
            topic_words = [(words[word_indices[index]], val) for index, val in zip(indices, vals)]
            updated_topics[topic] = topic_words
            if len(updated_topics[topic]) < self.top_n_words:
                updated_topics[topic] += [("", 0) for _ in range(self.top_n_words - len(updated_topics[topic]))]

        return updated_topics
````

extract_topics(self, topic_model, documents, c_tf_idf, topics)

Extract topics.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `topic_model` | | A BERTopic model | required |
| `documents` | `DataFrame` | All input documents | required |
| `c_tf_idf` | `csr_matrix` | Not used | required |
| `topics` | `Mapping[str, List[Tuple[str, float]]]` | The candidate topics as calculated with c-TF-IDF | required |

Returns:

| Name | Description |
| --- | --- |
| `updated_topics` | Updated topic representations |
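Each topic maps to exactly `top_n_words` `(word, value)` tuples, sorted by descending c-TF-IDF value; when fewer keywords match, the list is padded with `("", 0)` entries. A hypothetical example of the returned shape for `top_n_words=3` (all words and values are made up):

```python
updated_topics = {
    0: [("neural network", 0.42), ("network", 0.31), ("deep", 0.18)],
    1: [("climate", 0.39), ("warming", 0.27), ("", 0)],
}
```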

Source code in `bertopic/representation/_pos.py`:

```python
def extract_topics(
    self,
    topic_model,
    documents: pd.DataFrame,
    c_tf_idf: csr_matrix,
    topics: Mapping[str, List[Tuple[str, float]]],
) -> Mapping[str, List[Tuple[str, float]]]:
    """Extract topics.

    Arguments:
        topic_model: A BERTopic model
        documents: All input documents
        c_tf_idf: Not used
        topics: The candidate topics as calculated with c-TF-IDF

    Returns:
        updated_topics: Updated topic representations
    """
    matcher = Matcher(self.model.vocab)
    matcher.add("Pattern", self.pos_patterns)

    candidate_topics = {}
    for topic, values in topics.items():
        keywords = list(zip(*values))[0]

        # Extract candidate documents
        candidate_documents = []
        for keyword in keywords:
            selection = documents.loc[documents.Topic == topic, :]
            selection = selection.loc[selection.Document.str.contains(keyword), "Document"]
            if len(selection) > 0:
                for document in selection[:2]:
                    candidate_documents.append(document)
        candidate_documents = list(set(candidate_documents))

        # Extract keywords
        docs_pipeline = self.model.pipe(candidate_documents)
        updated_keywords = []
        for doc in docs_pipeline:
            matches = matcher(doc)
            for _, start, end in matches:
                updated_keywords.append(doc[start:end].text)
        candidate_topics[topic] = list(set(updated_keywords))

    # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
    # and will be removed in 1.2. Please use get_feature_names_out instead.
    if version.parse(sklearn_version) >= version.parse("1.0.0"):
        words = list(topic_model.vectorizer_model.get_feature_names_out())
    else:
        words = list(topic_model.vectorizer_model.get_feature_names())

    # Match updated keywords with c-TF-IDF values
    words_lookup = dict(zip(words, range(len(words))))
    updated_topics = {topic: [] for topic in topics.keys()}

    for topic, candidate_keywords in candidate_topics.items():
        word_indices = np.sort(
            [words_lookup.get(keyword) for keyword in candidate_keywords if keyword in words_lookup]
        )
        vals = topic_model.c_tf_idf_[:, word_indices][topic + topic_model._outliers]
        indices = np.argsort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words :][::-1]
        vals = np.sort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words :][::-1]
        topic_words = [(words[word_indices[index]], val) for index, val in zip(indices, vals)]
        updated_topics[topic] = topic_words
        if len(updated_topics[topic]) < self.top_n_words:
            updated_topics[topic] += [("", 0) for _ in range(self.top_n_words - len(updated_topics[topic]))]

    return updated_topics
```