OnlineCountVectorizer
An online variant of the CountVectorizer with an updating vocabulary. At each `.partial_fit`, its vocabulary is updated based on any OOV words it might find. Then, `.update_bow` can be used to track and update the Bag-of-Words representation. These functions are separated such that the vectorizer can be used iteratively without updating the Bag-of-Words representation, which might speed up the fitting process. However, the `.update_bow` function is used in BERTopic to track changes in the topic representations and allow for decay.
This class inherits its parameters and attributes from `sklearn.feature_extraction.text.CountVectorizer`.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `decay` | `float` | A value between [0, 1] to weight the percentage of frequencies the previous bag-of-words should be decreased with. For example, a value of `.1` will decrease the frequencies in the bag-of-words matrix by 10% at each iteration. | `None` |
| `delete_min_df` | `float` | Delete words at each iteration from its vocabulary that are below a minimum frequency. This keeps the resulting bag-of-words matrix small such that it does not explode in size with an increasing vocabulary. If `decay` is None, then this equals `min_df`. | `None` |
| `**kwargs` | | Set of parameters inherited from `sklearn.feature_extraction.text.CountVectorizer`. In practice, this means you can still use parameters from the original CountVectorizer, like `stop_words` and `ngram_range`. | `{}` |
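As a quick illustration of these two parameters, the sketch below decays old frequencies by 1% and prunes words whose total frequency drops below 2 at every `.update_bow` call; the values are chosen purely for illustration:

```python
from bertopic.vectorizers import OnlineCountVectorizer

# Decay previous counts by 1% per update and drop words whose
# summed frequency falls below 2 after each .update_bow call
vectorizer = OnlineCountVectorizer(
    decay=0.01,
    delete_min_df=2,
    stop_words="english",   # inherited from CountVectorizer
)
```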
Attributes:

| Name | Type | Description |
|---|---|---|
| `X_` | `scipy.sparse.csr_matrix` | The Bag-of-Words representation |
Examples:

```python
from bertopic.vectorizers import OnlineCountVectorizer

vectorizer = OnlineCountVectorizer(stop_words="english")

batch = []
for index, doc in enumerate(my_docs):
    batch.append(doc)
    vectorizer.partial_fit([doc])

    # Update and clean the bow every 100 iterations:
    if (index + 1) % 100 == 0:
        X = vectorizer.update_bow(batch)
        batch = []
```
To use the model in BERTopic:

```python
from bertopic import BERTopic
from bertopic.vectorizers import OnlineCountVectorizer

vectorizer_model = OnlineCountVectorizer(stop_words="english")
topic_model = BERTopic(vectorizer_model=vectorizer_model)
```
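For fully online topic modeling, BERTopic's other components can also be swapped for incremental models and trained batch by batch via `partial_fit`. The sketch below follows that pattern, assuming `doc_chunks` is an iterable of document batches; the specific models and parameter values are illustrative:

```python
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import IncrementalPCA
from bertopic import BERTopic
from bertopic.vectorizers import OnlineCountVectorizer

# Incremental alternatives for dimensionality reduction and clustering
umap_model = IncrementalPCA(n_components=5)
cluster_model = MiniBatchKMeans(n_clusters=50, random_state=0)
vectorizer_model = OnlineCountVectorizer(stop_words="english", decay=0.01)

topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=cluster_model,
    vectorizer_model=vectorizer_model,
)

# Train incrementally, one batch of documents at a time
for docs in doc_chunks:
    topic_model.partial_fit(docs)
```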
References
Adapted from: https://github.com/idoshlomo/online_vectorizers
Source code in bertopic\vectorizers\_online_cv.py
````python
import numpy as np
from itertools import chain
from typing import List

from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer


class OnlineCountVectorizer(CountVectorizer):
    """An online variant of the CountVectorizer with updating vocabulary.

    At each `.partial_fit`, its vocabulary is updated based on any OOV words
    it might find. Then, `.update_bow` can be used to track and update
    the Bag-of-Words representation. These functions are separated such that
    the vectorizer can be used iteratively without updating the Bag-of-Words
    representation, which might speed up the fitting process. However, the
    `.update_bow` function is used in BERTopic to track changes in the
    topic representations and allow for decay.

    This class inherits its parameters and attributes from:
        `sklearn.feature_extraction.text.CountVectorizer`

    Arguments:
        decay: A value between [0, 1] to weight the percentage of frequencies
               the previous bag-of-words should be decreased with. For example,
               a value of `.1` will decrease the frequencies in the bag-of-words
               matrix by 10% at each iteration.
        delete_min_df: Delete words at each iteration from its vocabulary
                       that are below a minimum frequency.
                       This will keep the resulting bag-of-words matrix small
                       such that it does not explode in size with an increasing
                       vocabulary. If `decay` is None, then this equals `min_df`.
        **kwargs: Set of parameters inherited from:
                  `sklearn.feature_extraction.text.CountVectorizer`
                  In practice, this means that you can still use parameters
                  from the original CountVectorizer, like `stop_words` and
                  `ngram_range`.

    Attributes:
        X_ (scipy.sparse.csr_matrix): The Bag-of-Words representation

    Examples:

    ```python
    from bertopic.vectorizers import OnlineCountVectorizer

    vectorizer = OnlineCountVectorizer(stop_words="english")

    batch = []
    for index, doc in enumerate(my_docs):
        batch.append(doc)
        vectorizer.partial_fit([doc])

        # Update and clean the bow every 100 iterations:
        if (index + 1) % 100 == 0:
            X = vectorizer.update_bow(batch)
            batch = []
    ```

    To use the model in BERTopic:

    ```python
    from bertopic import BERTopic
    from bertopic.vectorizers import OnlineCountVectorizer

    vectorizer_model = OnlineCountVectorizer(stop_words="english")
    topic_model = BERTopic(vectorizer_model=vectorizer_model)
    ```

    References:
        Adapted from: https://github.com/idoshlomo/online_vectorizers
    """

    def __init__(self, decay: float = None, delete_min_df: float = None, **kwargs):
        self.decay = decay
        self.delete_min_df = delete_min_df
        super(OnlineCountVectorizer, self).__init__(**kwargs)

    def partial_fit(self, raw_documents: List[str]) -> "OnlineCountVectorizer":
        """Perform a partial fit and update vocabulary with OOV tokens.

        Arguments:
            raw_documents: A list of documents

        Returns:
            The fitted vectorizer
        """
        if not hasattr(self, "vocabulary_"):
            return self.fit(raw_documents)

        analyzer = self.build_analyzer()
        analyzed_documents = [analyzer(doc) for doc in raw_documents]
        new_tokens = set(chain.from_iterable(analyzed_documents))
        oov_tokens = new_tokens.difference(set(self.vocabulary_.keys()))

        if oov_tokens:
            # Assign fresh indices to OOV tokens, continuing after the
            # highest index currently in the vocabulary
            max_index = max(self.vocabulary_.values())
            oov_vocabulary = dict(
                zip(
                    oov_tokens,
                    range(max_index + 1, max_index + 1 + len(oov_tokens)),
                )
            )
            self.vocabulary_.update(oov_vocabulary)

        return self

    def update_bow(self, raw_documents: List[str]) -> csr_matrix:
        """Create or update the bag-of-words matrix.

        Update the bag-of-words matrix by adding the newly transformed
        documents. This may add empty columns if new words are found and/or
        add empty rows if new topics are found.

        During this process, the previous bag-of-words matrix might be
        decayed if `self.decay` has been set during init. Similarly, words
        that do not exceed `self.delete_min_df` are removed from its
        vocabulary and bag-of-words matrix.

        Arguments:
            raw_documents: A list of documents

        Returns:
            X_: Bag-of-words matrix
        """
        if hasattr(self, "X_"):
            X = self.transform(raw_documents)

            # Add empty columns if new words are found
            columns = csr_matrix((self.X_.shape[0], X.shape[1] - self.X_.shape[1]), dtype=int)
            self.X_ = sparse.hstack([self.X_, columns])

            # Add empty rows if new topics are found
            rows = csr_matrix((X.shape[0] - self.X_.shape[0], self.X_.shape[1]), dtype=int)
            self.X_ = sparse.vstack([self.X_, rows])

            # Decay the previous BoW matrix before adding the new counts
            if self.decay is not None:
                self.X_ = self.X_ * (1 - self.decay)

            self.X_ += X
        else:
            self.X_ = self.transform(raw_documents)

        if self.delete_min_df is not None:
            self._clean_bow()

        return self.X_

    def _clean_bow(self) -> None:
        """Remove words that do not exceed `self.delete_min_df`."""
        # Only keep the columns of words with a minimum total frequency
        indices = np.where(self.X_.sum(0) >= self.delete_min_df)[1]
        self.X_ = self.X_[:, indices]

        # Re-index the remaining words so the vocabulary matches the
        # pruned bag-of-words matrix
        inverse_vocabulary = {v: k for k, v in self.vocabulary_.items()}
        self.vocabulary_ = {inverse_vocabulary[index]: i for i, index in enumerate(indices)}
````
partial_fit(self, raw_documents)
Perform a partial fit and update vocabulary with OOV tokens.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `raw_documents` | `List[str]` | A list of documents | required |
Source code in bertopic\vectorizers\_online_cv.py
```python
def partial_fit(self, raw_documents: List[str]) -> "OnlineCountVectorizer":
    """Perform a partial fit and update vocabulary with OOV tokens.

    Arguments:
        raw_documents: A list of documents

    Returns:
        The fitted vectorizer
    """
    if not hasattr(self, "vocabulary_"):
        return self.fit(raw_documents)

    analyzer = self.build_analyzer()
    analyzed_documents = [analyzer(doc) for doc in raw_documents]
    new_tokens = set(chain.from_iterable(analyzed_documents))
    oov_tokens = new_tokens.difference(set(self.vocabulary_.keys()))

    if oov_tokens:
        # Assign fresh indices to OOV tokens, continuing after the
        # highest index currently in the vocabulary
        max_index = max(self.vocabulary_.values())
        oov_vocabulary = dict(
            zip(
                oov_tokens,
                range(max_index + 1, max_index + 1 + len(oov_tokens)),
            )
        )
        self.vocabulary_.update(oov_vocabulary)

    return self
```
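To see the effect on the vocabulary, here is a minimal sketch; the document contents are made up for illustration:

```python
from bertopic.vectorizers import OnlineCountVectorizer

vectorizer = OnlineCountVectorizer()

vectorizer.partial_fit(["the cat sat"])       # first call falls back to .fit
print(sorted(vectorizer.vocabulary_))         # ['cat', 'sat', 'the']

vectorizer.partial_fit(["the dog barked"])    # 'dog' and 'barked' are OOV
print(sorted(vectorizer.vocabulary_))         # ['barked', 'cat', 'dog', 'sat', 'the']
```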
update_bow(self, raw_documents)
Create or update the bag-of-words matrix.
Update the bag-of-words matrix by adding the newly transformed documents. This may add empty columns if new words are found and/or add empty rows if new topics are found.
During this process, the previous bag-of-words matrix might be decayed if `self.decay` has been set during init. Similarly, words that do not exceed `self.delete_min_df` are removed from its vocabulary and bag-of-words matrix.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `raw_documents` | `List[str]` | A list of documents | required |

Returns:

| Type | Description |
|---|---|
| `csr_matrix` | `X_`: Bag-of-words matrix |
Source code in bertopic\vectorizers\_online_cv.py
```python
def update_bow(self, raw_documents: List[str]) -> csr_matrix:
    """Create or update the bag-of-words matrix.

    Update the bag-of-words matrix by adding the newly transformed
    documents. This may add empty columns if new words are found and/or
    add empty rows if new topics are found.

    During this process, the previous bag-of-words matrix might be
    decayed if `self.decay` has been set during init. Similarly, words
    that do not exceed `self.delete_min_df` are removed from its
    vocabulary and bag-of-words matrix.

    Arguments:
        raw_documents: A list of documents

    Returns:
        X_: Bag-of-words matrix
    """
    if hasattr(self, "X_"):
        X = self.transform(raw_documents)

        # Add empty columns if new words are found
        columns = csr_matrix((self.X_.shape[0], X.shape[1] - self.X_.shape[1]), dtype=int)
        self.X_ = sparse.hstack([self.X_, columns])

        # Add empty rows if new topics are found
        rows = csr_matrix((X.shape[0] - self.X_.shape[0], self.X_.shape[1]), dtype=int)
        self.X_ = sparse.vstack([self.X_, rows])

        # Decay the previous BoW matrix before adding the new counts
        if self.decay is not None:
            self.X_ = self.X_ * (1 - self.decay)

        self.X_ += X
    else:
        self.X_ = self.transform(raw_documents)

    if self.delete_min_df is not None:
        self._clean_bow()

    return self.X_
```
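To make the decay arithmetic concrete, here is a small sketch; `decay=0.5` is chosen only so the numbers are easy to follow:

```python
from bertopic.vectorizers import OnlineCountVectorizer

vectorizer = OnlineCountVectorizer(decay=0.5)

vectorizer.partial_fit(["apple apple banana"])
X = vectorizer.update_bow(["apple apple banana"])
print(X.toarray())  # [[2 1]] -> counts for ('apple', 'banana')

# Previous counts are first multiplied by (1 - decay) = 0.5,
# then the counts of the new batch are added on top
vectorizer.partial_fit(["apple cherry"])
X = vectorizer.update_bow(["apple cherry"])
print(X.toarray())  # [[2.  0.5 1. ]] -> apple: 2*0.5 + 1, banana: 1*0.5, cherry: 0 + 1
```

Setting `delete_min_df` on top of this would then remove any word whose decayed total frequency falls below the threshold via `_clean_bow`.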