Skip to content

MaximalMarginalRelevance

Calculate Maximal Marginal Relevance (MMR) between candidate keywords and the document.

MMR considers the similarity of keywords/keyphrases with the document, along with the similarity of already selected keywords and keyphrases. This results in a selection of keywords that maximizes their diversity with respect to the document.

Parameters:

Name Type Description Default
diversity float

How diverse the selected keywords/keyphrases are. Values range between 0 and 1, with 0 being not diverse at all and 1 being most diverse.

0.1
top_n_words int

The number of keywords/keyphrases to return

10

Usage:

from bertopic.representation import MaximalMarginalRelevance
from bertopic import BERTopic

# Create your representation model; `diversity` ranges from 0 (not diverse
# at all) to 1 (most diverse)
representation_model = MaximalMarginalRelevance(diversity=0.3)

# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model)
Source code in bertopic\representation\_mmr.py
class MaximalMarginalRelevance(BaseRepresentation):
    """Calculate Maximal Marginal Relevance (MMR)
    between candidate keywords and the document.

    MMR considers the similarity of keywords/keyphrases with the
    document, along with the similarity of already selected
    keywords and keyphrases. This results in a selection of keywords
    that maximizes their diversity with respect to the document.

    Arguments:
        diversity: How diverse the selected keywords/keyphrases are.
                    Values range between 0 and 1 with 0 being not diverse at all
                    and 1 being most diverse.
        top_n_words: The number of keywords/keyphrases to return

    Usage:

    ```python
    from bertopic.representation import MaximalMarginalRelevance
    from bertopic import BERTopic

    # Create your representation model
    representation_model = MaximalMarginalRelevance(diversity=0.3)

    # Use the representation model in BERTopic on top of the default pipeline
    topic_model = BERTopic(representation_model=representation_model)
    ```
    """

    def __init__(self, diversity: float = 0.1, top_n_words: int = 10):
        self.diversity = diversity
        self.top_n_words = top_n_words

    def extract_topics(
        self,
        topic_model,
        documents: pd.DataFrame,
        c_tf_idf: csr_matrix,
        topics: Mapping[str, List[Tuple[str, float]]],
    ) -> Mapping[str, List[Tuple[str, float]]]:
        """Extract topic representations.

        For every topic, the candidate words are embedded individually and
        jointly (as one space-joined string), `mmr` picks a subset of them,
        and the original `(word, value)` pairs of the picked words are kept
        in their original order.

        Arguments:
            topic_model: The BERTopic model
            documents: Not used
            c_tf_idf: Not used
            topics: The candidate topics as calculated with c-TF-IDF

        Returns:
            updated_topics: Updated topic representations. If the topic model
                            has no embedding model, a warning is issued and
                            `topics` is returned unchanged.
        """
        if topic_model.embedding_model is None:
            # Adjacent string literals are concatenated verbatim, so the
            # separating space has to be part of the first literal.
            warnings.warn(
                "MaximalMarginalRelevance can only be used when BERTopic was instantiated "
                "with the `embedding_model` parameter."
            )
            return topics

        updated_topics = {}
        for topic, topic_words in topics.items():
            words = [word for word, _ in topic_words]

            # Nothing to embed or rank for a topic without candidate words.
            if not words:
                updated_topics[topic] = []
                continue

            word_embeddings = topic_model._extract_embeddings(words, method="word", verbose=False)
            topic_embedding = topic_model._extract_embeddings(" ".join(words), method="word", verbose=False).reshape(
                1, -1
            )
            selected_words = set(
                mmr(
                    topic_embedding,
                    word_embeddings,
                    words,
                    self.diversity,
                    self.top_n_words,
                )
            )

            # Filter the original pairs so both the c-TF-IDF values and the
            # original ordering of the candidates are preserved.
            updated_topics[topic] = [(word, value) for word, value in topic_words if word in selected_words]
        return updated_topics

extract_topics(self, topic_model, documents, c_tf_idf, topics)

Extract topic representations.

Parameters:

Name Type Description Default
topic_model

The BERTopic model

required
documents DataFrame

Not used

required
c_tf_idf csr_matrix

Not used

required
topics Mapping[str, List[Tuple[str, float]]]

The candidate topics as calculated with c-TF-IDF

required

Returns:

Type Description
updated_topics

Updated topic representations

Source code in bertopic\representation\_mmr.py
def extract_topics(
    self,
    topic_model,
    documents: pd.DataFrame,
    c_tf_idf: csr_matrix,
    topics: Mapping[str, List[Tuple[str, float]]],
) -> Mapping[str, List[Tuple[str, float]]]:
    """Extract topic representations.

    For every topic, the candidate words are embedded individually and
    jointly (as one space-joined string), `mmr` picks a subset of them,
    and the original `(word, value)` pairs of the picked words are kept
    in their original order.

    Arguments:
        topic_model: The BERTopic model
        documents: Not used
        c_tf_idf: Not used
        topics: The candidate topics as calculated with c-TF-IDF

    Returns:
        updated_topics: Updated topic representations. If the topic model
                        has no embedding model, a warning is issued and
                        `topics` is returned unchanged.
    """
    if topic_model.embedding_model is None:
        # Adjacent string literals are concatenated verbatim, so the
        # separating space has to be part of the first literal.
        warnings.warn(
            "MaximalMarginalRelevance can only be used when BERTopic was instantiated "
            "with the `embedding_model` parameter."
        )
        return topics

    updated_topics = {}
    for topic, topic_words in topics.items():
        words = [word for word, _ in topic_words]

        # Nothing to embed or rank for a topic without candidate words.
        if not words:
            updated_topics[topic] = []
            continue

        word_embeddings = topic_model._extract_embeddings(words, method="word", verbose=False)
        topic_embedding = topic_model._extract_embeddings(" ".join(words), method="word", verbose=False).reshape(
            1, -1
        )
        selected_words = set(
            mmr(
                topic_embedding,
                word_embeddings,
                words,
                self.diversity,
                self.top_n_words,
            )
        )

        # Filter the original pairs so both the c-TF-IDF values and the
        # original ordering of the candidates are preserved.
        updated_topics[topic] = [(word, value) for word, value in topic_words if word in selected_words]
    return updated_topics