MaximalMarginalRelevance
¶
Calculate Maximal Marginal Relevance (MMR) between candidate keywords and the document.
MMR considers the similarity of keywords/keyphrases with the document, along with the similarity of already selected keywords and keyphrases. This results in a selection of keywords that maximizes their diversity with respect to the document.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
diversity |
float |
How diverse the selected keywords/keyphrases are. Values range between 0 and 1 with 0 being not diverse at all and 1 being most diverse. |
0.1 |
top_n_words |
int |
The number of keywords/keyphrases to return |
10 |
Usage:
from bertopic.representation import MaximalMarginalRelevance
from bertopic import BERTopic
# Create your representation model
representation_model = MaximalMarginalRelevance(diversity=0.3)
# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model)
Source code in bertopic\representation\_mmr.py
class MaximalMarginalRelevance(BaseRepresentation):
    """Calculate Maximal Marginal Relevance (MMR)
    between candidate keywords and the document.

    MMR considers the similarity of keywords/keyphrases with the
    document, along with the similarity of already selected
    keywords and keyphrases. This results in a selection of keywords
    that maximizes their diversity with respect to the document.

    Arguments:
        diversity: How diverse the selected keywords/keyphrases are.
                   Values range between 0 and 1 with 0 being not diverse at all
                   and 1 being most diverse.
        top_n_words: The number of keywords/keyphrases to return

    Usage:

    ```python
    from bertopic.representation import MaximalMarginalRelevance
    from bertopic import BERTopic

    # Create your representation model
    representation_model = MaximalMarginalRelevance(diversity=0.3)

    # Use the representation model in BERTopic on top of the default pipeline
    topic_model = BERTopic(representation_model=representation_model)
    ```
    """

    def __init__(self, diversity: float = 0.1, top_n_words: int = 10):
        self.diversity = diversity
        self.top_n_words = top_n_words

    def extract_topics(
        self,
        topic_model,
        documents: pd.DataFrame,
        c_tf_idf: csr_matrix,
        topics: Mapping[str, List[Tuple[str, float]]],
    ) -> Mapping[str, List[Tuple[str, float]]]:
        """Extract topic representations.

        Arguments:
            topic_model: The BERTopic model
            documents: Not used
            c_tf_idf: Not used
            topics: The candidate topics as calculated with c-TF-IDF

        Returns:
            updated_topics: Updated topic representations. When the topic model
                            has no embedding model, a warning is issued and
                            `topics` is returned unchanged.
        """
        if topic_model.embedding_model is None:
            # Adjacent string literals are concatenated verbatim, so the
            # separating space has to be part of one of the literals.
            warnings.warn(
                "MaximalMarginalRelevance can only be used if BERTopic was instantiated "
                "with the `embedding_model` parameter."
            )
            return topics

        updated_topics = {}
        for topic, topic_words in topics.items():
            words = [word for word, _ in topic_words]

            # Embed every candidate word, and embed the space-joined words as a
            # single string to stand in for the topic itself.
            word_embeddings = topic_model._extract_embeddings(words, method="word", verbose=False)
            topic_embedding = topic_model._extract_embeddings(
                " ".join(words), method="word", verbose=False
            ).reshape(1, -1)

            # `mmr` (defined elsewhere in the package) picks the words to keep.
            selected_words = mmr(
                topic_embedding,
                word_embeddings,
                words,
                self.diversity,
                self.top_n_words,
            )

            # Keep the original (word, score) tuples, in their original order,
            # for the words that were selected.
            updated_topics[topic] = [(word, value) for word, value in topic_words if word in selected_words]
        return updated_topics
extract_topics(self, topic_model, documents, c_tf_idf, topics)
¶
Extract topic representations.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
topic_model |
The BERTopic model |
required | |
documents |
DataFrame |
Not used |
required |
c_tf_idf |
csr_matrix |
Not used |
required |
topics |
Mapping[str, List[Tuple[str, float]]] |
The candidate topics as calculated with c-TF-IDF |
required |
Returns:
Type | Description |
---|---|
updated_topics |
Updated topic representations |
Source code in bertopic\representation\_mmr.py
def extract_topics(
    self,
    topic_model,
    documents: pd.DataFrame,
    c_tf_idf: csr_matrix,
    topics: Mapping[str, List[Tuple[str, float]]],
) -> Mapping[str, List[Tuple[str, float]]]:
    """Extract topic representations.

    Arguments:
        topic_model: The BERTopic model
        documents: Not used
        c_tf_idf: Not used
        topics: The candidate topics as calculated with c-TF-IDF

    Returns:
        updated_topics: Updated topic representations. When the topic model
                        has no embedding model, a warning is issued and
                        `topics` is returned unchanged.
    """
    if topic_model.embedding_model is None:
        # Adjacent string literals are concatenated verbatim, so the
        # separating space has to be part of one of the literals.
        warnings.warn(
            "MaximalMarginalRelevance can only be used if BERTopic was instantiated "
            "with the `embedding_model` parameter."
        )
        return topics

    updated_topics = {}
    for topic, topic_words in topics.items():
        words = [word for word, _ in topic_words]

        # Embed every candidate word, and embed the space-joined words as a
        # single string to stand in for the topic itself.
        word_embeddings = topic_model._extract_embeddings(words, method="word", verbose=False)
        topic_embedding = topic_model._extract_embeddings(
            " ".join(words), method="word", verbose=False
        ).reshape(1, -1)

        # `mmr` (defined elsewhere in the package) picks the words to keep.
        selected_words = mmr(
            topic_embedding,
            word_embeddings,
            words,
            self.diversity,
            self.top_n_words,
        )

        # Keep the original (word, score) tuples, in their original order,
        # for the words that were selected.
        updated_topics[topic] = [(word, value) for word, value in topic_words if word in selected_words]
    return updated_topics