Skip to content

MaximalMarginalRelevance

Calculate Maximal Marginal Relevance (MMR) between candidate keywords and the document.

MMR considers the similarity of keywords/keyphrases with the document, along with the similarity of already selected keywords and keyphrases. This results in a selection of keywords that maximizes their mutual diversity while remaining relevant to the document.

Parameters:

Name Type Description Default
diversity float

How diverse the selected keywords/keyphrases are. Values range between 0 and 1, with 0 being not diverse at all and 1 being most diverse.

0.1
top_n_words int

The number of keywords/keyphrases to return

10

Usage:

from bertopic.representation import MaximalMarginalRelevance
from bertopic import BERTopic

# Create your representation model
representation_model = MaximalMarginalRelevance(diversity=0.3)

# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model)
Source code in bertopic\representation\_mmr.py
class MaximalMarginalRelevance(BaseRepresentation):
    """ Calculate Maximal Marginal Relevance (MMR)
    between candidate keywords and the document.

    MMR considers the similarity of keywords/keyphrases with the
    document, along with the similarity of already selected
    keywords and keyphrases. This results in a selection of keywords
    that maximizes their mutual diversity while remaining relevant
    to the document.

    Arguments:
        diversity: How diverse the selected keywords/keyphrases are.
                    Values range between 0 and 1 with 0 being not diverse at all
                    and 1 being most diverse.
        top_n_words: The number of keywords/keyphrases to return

    Usage:

    ```python
    from bertopic.representation import MaximalMarginalRelevance
    from bertopic import BERTopic

    # Create your representation model
    representation_model = MaximalMarginalRelevance(diversity=0.3)

    # Use the representation model in BERTopic on top of the default pipeline
    topic_model = BERTopic(representation_model=representation_model)
    ```
    """
    def __init__(self, diversity: float = 0.1, top_n_words: int = 10):
        self.diversity = diversity
        self.top_n_words = top_n_words

    def extract_topics(self,
                       topic_model,
                       documents: pd.DataFrame,
                       c_tf_idf: csr_matrix,
                       topics: Mapping[str, List[Tuple[str, float]]]
                       ) -> Mapping[str, List[Tuple[str, float]]]:
        """ Extract topic representations

        Each topic's candidate words are passed to `mmr` together with
        `self.diversity` and `self.top_n_words`; only the words it returns
        are kept. Kept words retain their original `(word, value)` tuples
        and their original order from `topics`.

        Arguments:
            topic_model: The BERTopic model
            documents: Not used
            c_tf_idf: Not used
            topics: The candidate topics as calculated with c-TF-IDF

        Returns:
            updated_topics: Updated topic representations. If `topic_model`
                            has no embedding model, a warning is issued and
                            `topics` is returned unchanged.
        """

        if topic_model.embedding_model is None:
            # The trailing space matters: adjacent string literals are
            # concatenated without any separator.
            warnings.warn("MaximalMarginalRelevance can only be used when BERTopic was instantiated "
                          "with the `embedding_model` parameter.")
            return topics

        updated_topics = {}
        for topic, candidates in topics.items():
            words = [candidate[0] for candidate in candidates]

            # Nothing to embed or rank for a topic without candidate words.
            if not words:
                updated_topics[topic] = []
                continue

            word_embeddings = topic_model._extract_embeddings(words, method="word", verbose=False)
            # The topic itself is represented by embedding all of its
            # candidate words joined into a single string.
            topic_embedding = topic_model._extract_embeddings(
                " ".join(words), method="word", verbose=False
            ).reshape(1, -1)

            # A set gives constant-time membership checks in the filter below.
            selected_words = set(
                mmr(topic_embedding, word_embeddings, words, self.diversity, self.top_n_words)
            )
            updated_topics[topic] = [
                (word, value) for word, value in candidates if word in selected_words
            ]
        return updated_topics

extract_topics(self, topic_model, documents, c_tf_idf, topics)

Extract topic representations

Parameters:

Name Type Description Default
topic_model

The BERTopic model

required
documents DataFrame

Not used

required
c_tf_idf csr_matrix

Not used

required
topics Mapping[str, List[Tuple[str, float]]]

The candidate topics as calculated with c-TF-IDF

required

Returns:

Type Description
updated_topics

Updated topic representations

Source code in bertopic\representation\_mmr.py
def extract_topics(self,
                   topic_model,
                   documents: pd.DataFrame,
                   c_tf_idf: csr_matrix,
                   topics: Mapping[str, List[Tuple[str, float]]]
                   ) -> Mapping[str, List[Tuple[str, float]]]:
    """ Extract topic representations

    Each topic's candidate words are passed to `mmr` together with
    `self.diversity` and `self.top_n_words`; only the words it returns
    are kept. Kept words retain their original `(word, value)` tuples
    and their original order from `topics`.

    Arguments:
        topic_model: The BERTopic model
        documents: Not used
        c_tf_idf: Not used
        topics: The candidate topics as calculated with c-TF-IDF

    Returns:
        updated_topics: Updated topic representations. If `topic_model`
                        has no embedding model, a warning is issued and
                        `topics` is returned unchanged.
    """

    if topic_model.embedding_model is None:
        # The trailing space matters: adjacent string literals are
        # concatenated without any separator.
        warnings.warn("MaximalMarginalRelevance can only be used when BERTopic was instantiated "
                      "with the `embedding_model` parameter.")
        return topics

    updated_topics = {}
    for topic, candidates in topics.items():
        words = [candidate[0] for candidate in candidates]

        # Nothing to embed or rank for a topic without candidate words.
        if not words:
            updated_topics[topic] = []
            continue

        word_embeddings = topic_model._extract_embeddings(words, method="word", verbose=False)
        # The topic itself is represented by embedding all of its
        # candidate words joined into a single string.
        topic_embedding = topic_model._extract_embeddings(
            " ".join(words), method="word", verbose=False
        ).reshape(1, -1)

        # A set gives constant-time membership checks in the filter below.
        selected_words = set(
            mmr(topic_embedding, word_embeddings, words, self.diversity, self.top_n_words)
        )
        updated_topics[topic] = [
            (word, value) for word, value in candidates if word in selected_words
        ]
    return updated_topics