Backends

BaseEmbedder

The Base Embedder used for creating embedding models.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `embedding_model` |  | The main embedding model to be used for extracting document and word embeddings | `None` |
| `word_embedding_model` |  | The embedding model used for extracting word embeddings only. If this model is selected, then the `embedding_model` is purely used for creating document embeddings. | `None` |
Source code in bertopic/backend/_base.py
class BaseEmbedder:
    """The Base Embedder used for creating embedding models.

    Arguments:
        embedding_model: The main embedding model to be used for extracting
                         document and word embeddings
        word_embedding_model: The embedding model used for extracting word
                              embeddings only. If this model is selected,
                              then the `embedding_model` is purely used for
                              creating document embeddings.
    """

    def __init__(self, embedding_model=None, word_embedding_model=None):
        self.embedding_model = embedding_model
        self.word_embedding_model = word_embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        pass

    def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n words into an n-dimensional
        matrix of embeddings.

        Arguments:
            words: A list of words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Word embeddings with shape (n, m) with `n` words
            that each have an embeddings size of `m`

        """
        return self.embed(words, verbose)

    def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n words into an n-dimensional
        matrix of embeddings.

        Arguments:
            document: A list of documents to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document embeddings with shape (n, m) with `n` documents
            that each have an embeddings size of `m`
        """
        return self.embed(document, verbose)

embed(documents, verbose=False)

Embed a list of n documents/words into an n-dimensional matrix of embeddings.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `documents` | `List[str]` | A list of documents or words to be embedded | required |
| `verbose` | `bool` | Controls the verbosity of the process | `False` |

Returns:

| Type | Description |
| --- | --- |
| `ndarray` | Document/word embeddings with shape (n, m) with `n` documents/words that each have an embedding size of `m` |

Source code in bertopic/backend/_base.py
def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
    """Embed a list of n documents/words into an n-dimensional
    matrix of embeddings.

    Arguments:
        documents: A list of documents or words to be embedded
        verbose: Controls the verbosity of the process

    Returns:
        Document/words embeddings with shape (n, m) with `n` documents/words
        that each have an embeddings size of `m`
    """
    pass
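
Because `embed` is only a stub here (`pass`), a custom backend subclasses `BaseEmbedder` and overrides it. A minimal sketch, assuming `sentence-transformers` is installed; the class name `CustomEmbedder` and the model name are illustrative:

```python
from typing import List

import numpy as np
from sentence_transformers import SentenceTransformer

from bertopic.backend import BaseEmbedder


class CustomEmbedder(BaseEmbedder):
    """A hypothetical custom backend wrapping a sentence-transformers model."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        super().__init__()
        self.embedding_model = SentenceTransformer(model_name)

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        # Return an (n, m) array: n documents, each with an embedding of size m
        return self.embedding_model.encode(documents, show_progress_bar=verbose)
```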

embed_documents(document, verbose=False)

Embed a list of n documents into an n-dimensional matrix of embeddings.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `document` | `List[str]` | A list of documents to be embedded | required |
| `verbose` | `bool` | Controls the verbosity of the process | `False` |

Returns:

| Type | Description |
| --- | --- |
| `ndarray` | Document embeddings with shape (n, m) with `n` documents that each have an embedding size of `m` |

Source code in bertopic/backend/_base.py
def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray:
    """Embed a list of n words into an n-dimensional
    matrix of embeddings.

    Arguments:
        document: A list of documents to be embedded
        verbose: Controls the verbosity of the process

    Returns:
        Document embeddings with shape (n, m) with `n` documents
        that each have an embeddings size of `m`
    """
    return self.embed(document, verbose)

embed_words(words, verbose=False)

Embed a list of n words into an n-dimensional matrix of embeddings.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `words` | `List[str]` | A list of words to be embedded | required |
| `verbose` | `bool` | Controls the verbosity of the process | `False` |

Returns:

| Type | Description |
| --- | --- |
| `ndarray` | Word embeddings with shape (n, m) with `n` words that each have an embedding size of `m` |

Source code in bertopic/backend/_base.py
def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
    """Embed a list of n words into an n-dimensional
    matrix of embeddings.

    Arguments:
        words: A list of words to be embedded
        verbose: Controls the verbosity of the process

    Returns:
        Word embeddings with shape (n, m) with `n` words
        that each have an embeddings size of `m`

    """
    return self.embed(words, verbose)

CohereBackend

Bases: BaseEmbedder

Cohere Embedding Model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `client` |  | A `cohere` client. | required |
| `embedding_model` | `str` | A Cohere model. Default is "large". For an overview of models see: https://docs.cohere.ai/docs/generation-card | `'large'` |
| `delay_in_seconds` | `float` | If a `batch_size` is given, use this to set the delay in seconds between batches. | `None` |
| `batch_size` | `int` | The size of each batch. | `None` |
| `embed_kwargs` | `Mapping[str, Any]` | Kwargs passed to `cohere.Client.embed`. Can be used to define additional parameters such as `input_type`. | `{}` |

Examples:

```python
import cohere
from bertopic.backend import CohereBackend

client = cohere.Client("APIKEY")
cohere_model = CohereBackend(client)
```

If you want to specify `input_type`:

```python
cohere_model = CohereBackend(
    client,
    embedding_model="embed-english-v3.0",
    embed_kwargs={"input_type": "clustering"}
)
```
Source code in bertopic/backend/_cohere.py
class CohereBackend(BaseEmbedder):
    """Cohere Embedding Model.

    Arguments:
        client: A `cohere` client.
        embedding_model: A Cohere model. Default is "large".
                         For an overview of models see:
                         https://docs.cohere.ai/docs/generation-card
        delay_in_seconds: If a `batch_size` is given, use this to set
                          the delay in seconds between batches.
        batch_size: The size of each batch.
        embed_kwargs: Kwargs passed to `cohere.Client.embed`.
                      Can be used to define additional parameters
                      such as `input_type`

    Examples:
    ```python
    import cohere
    from bertopic.backend import CohereBackend

    client = cohere.Client("APIKEY")
    cohere_model = CohereBackend(client)
    ```

    If you want to specify `input_type`:

    ```python
    cohere_model = CohereBackend(
        client,
        embedding_model="embed-english-v3.0",
        embed_kwargs={"input_type": "clustering"}
    )
    ```
    """

    def __init__(
        self,
        client,
        embedding_model: str = "large",
        delay_in_seconds: float = None,
        batch_size: int = None,
        embed_kwargs: Mapping[str, Any] = {},
    ):
        super().__init__()
        self.client = client
        self.embedding_model = embedding_model
        self.delay_in_seconds = delay_in_seconds
        self.batch_size = batch_size
        self.embed_kwargs = embed_kwargs

        if self.embed_kwargs.get("model"):
            self.embedding_model = embed_kwargs.get("model")
        else:
            self.embed_kwargs["model"] = self.embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Batch-wise embedding extraction
        if self.batch_size is not None:
            embeddings = []
            for batch in tqdm(self._chunks(documents), disable=not verbose):
                response = self.client.embed(texts=batch, **self.embed_kwargs)
                embeddings.extend(response.embeddings)

                # Delay subsequent calls
                if self.delay_in_seconds:
                    time.sleep(self.delay_in_seconds)

        # Extract embeddings all at once
        else:
            response = self.client.embed(texts=documents, **self.embed_kwargs)
            embeddings = response.embeddings
        return np.array(embeddings)

    def _chunks(self, documents):
        for i in range(0, len(documents), self.batch_size):
            yield documents[i : i + self.batch_size]
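
As a usage sketch, batching with a delay can help stay within API rate limits; the batch size and delay below are assumptions, not recommendations:

```python
import cohere
from bertopic.backend import CohereBackend

client = cohere.Client("APIKEY")  # replace with your API key
cohere_model = CohereBackend(
    client,
    embedding_model="embed-english-v3.0",
    embed_kwargs={"input_type": "clustering"},
    batch_size=96,         # assumption: pick a size within your rate limits
    delay_in_seconds=1.0,  # pause between batches to avoid throttling
)
embeddings = cohere_model.embed(["first document", "second document"], verbose=True)
```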

embed(documents, verbose=False)

Embed a list of n documents/words into an n-dimensional matrix of embeddings.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `documents` | `List[str]` | A list of documents or words to be embedded | required |
| `verbose` | `bool` | Controls the verbosity of the process | `False` |

Returns:

| Type | Description |
| --- | --- |
| `ndarray` | Document/word embeddings with shape (n, m) with `n` documents/words that each have an embedding size of `m` |

Source code in bertopic/backend/_cohere.py
def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
    """Embed a list of n documents/words into an n-dimensional
    matrix of embeddings.

    Arguments:
        documents: A list of documents or words to be embedded
        verbose: Controls the verbosity of the process

    Returns:
        Document/words embeddings with shape (n, m) with `n` documents/words
        that each have an embeddings size of `m`
    """
    # Batch-wise embedding extraction
    if self.batch_size is not None:
        embeddings = []
        for batch in tqdm(self._chunks(documents), disable=not verbose):
            response = self.client.embed(texts=batch, **self.embed_kwargs)
            embeddings.extend(response.embeddings)

            # Delay subsequent calls
            if self.delay_in_seconds:
                time.sleep(self.delay_in_seconds)

    # Extract embeddings all at once
    else:
        response = self.client.embed(texts=documents, **self.embed_kwargs)
        embeddings = response.embeddings
    return np.array(embeddings)

Model2VecBackend

Bases: BaseEmbedder

Model2Vec embedding model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `embedding_model` | `Union[str, StaticModel]` | Either a model2vec model or a string pointing to a model2vec model | required |
| `distill` | `bool` | Indicates whether to distill a sentence-transformers compatible model. The distillation will happen during fitting of the topic model. NOTE: Only works if `embedding_model` is a string. | `False` |
| `distill_kwargs` | `dict` | Keyword arguments to pass to the distillation process of `model2vec.distill.distill` | `{}` |
| `distill_vectorizer` | `str` | A CountVectorizer used for creating a custom vocabulary based on the same documents used for topic modeling. NOTE: If "vocabulary" is in `distill_kwargs`, this will be ignored. | `None` |

Examples:

To create a model, you can load in a string pointing to a model2vec model:

```python
from bertopic.backend import Model2VecBackend

sentence_model = Model2VecBackend("minishlab/potion-base-8M")
```

or you can instantiate a model yourself:

```python
from bertopic.backend import Model2VecBackend
from model2vec import StaticModel

embedding_model = StaticModel.from_pretrained("minishlab/potion-base-8M")
sentence_model = Model2VecBackend(embedding_model)
```

If you want to distill a sentence-transformers model with the vocabulary of the documents, run the following:

```python
from bertopic.backend import Model2VecBackend

sentence_model = Model2VecBackend("sentence-transformers/all-MiniLM-L6-v2", distill=True)
```
Source code in bertopic/backend/_model2vec.py
class Model2VecBackend(BaseEmbedder):
    """Model2Vec embedding model.

    Arguments:
        embedding_model: Either a model2vec model or a
                         string pointing to a model2vec model
        distill: Indicates whether to distill a sentence-transformers compatible model.
                 The distillation will happen during fitting of the topic model.
                 NOTE: Only works if `embedding_model` is a string.
        distill_kwargs: Keyword arguments to pass to the distillation process
                        of `model2vec.distill.distill`
        distill_vectorizer: A CountVectorizer used for creating a custom vocabulary
                            based on the same documents used for topic modeling.
                            NOTE: If "vocabulary" is in `distill_kwargs`, this will be ignored.

    Examples:
    To create a model, you can load in a string pointing to a
    model2vec model:

    ```python
    from bertopic.backend import Model2VecBackend

    sentence_model = Model2VecBackend("minishlab/potion-base-8M")
    ```

    or you can instantiate a model yourself:

    ```python
    from bertopic.backend import Model2VecBackend
    from model2vec import StaticModel

    embedding_model = StaticModel.from_pretrained("minishlab/potion-base-8M")
    sentence_model = Model2VecBackend(embedding_model)
    ```

    If you want to distill a sentence-transformers model with the vocabulary of the documents,
    run the following:

    ```python
    from bertopic.backend import Model2VecBackend

    sentence_model = Model2VecBackend("sentence-transformers/all-MiniLM-L6-v2", distill=True)
    ```
    """

    def __init__(
        self,
        embedding_model: Union[str, StaticModel],
        distill: bool = False,
        distill_kwargs: dict = {},
        distill_vectorizer: str = None,
    ):
        super().__init__()

        self.distill = distill
        self.distill_kwargs = distill_kwargs
        self.distill_vectorizer = distill_vectorizer
        self._has_distilled = False

        # When we distill, we need a string pointing to a sentence-transformer model
        if self.distill:
            self._check_model2vec_installation()
            if not self.distill_vectorizer:
                self.distill_vectorizer = CountVectorizer()
            if isinstance(embedding_model, str):
                self.embedding_model = embedding_model
            else:
                raise ValueError("Please pass a string pointing to a sentence-transformer model when distilling.")

        # If we don't distill, we can pass a model2vec model directly or load from a string
        elif isinstance(embedding_model, StaticModel):
            self.embedding_model = embedding_model
        elif isinstance(embedding_model, str):
            self.embedding_model = StaticModel.from_pretrained(embedding_model)
        else:
            raise ValueError(
                "Please select a correct Model2Vec model: \n"
                "`from model2vec import StaticModel` \n"
                "`model = StaticModel.from_pretrained('minishlab/potion-base-8M')`"
            )

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Distill the model
        if self.distill and not self._has_distilled:
            from model2vec.distill import distill

            # Distill with the vocabulary of the documents
            if not self.distill_kwargs.get("vocabulary"):
                X = self.distill_vectorizer.fit_transform(documents)
                word_counts = np.array(X.sum(axis=0)).flatten()
                words = self.distill_vectorizer.get_feature_names_out()
                vocabulary = [word for word, _ in sorted(zip(words, word_counts), key=lambda x: x[1], reverse=True)]
                self.distill_kwargs["vocabulary"] = vocabulary

            # Distill the model
            self.embedding_model = distill(self.embedding_model, **self.distill_kwargs)

            # Distillation should happen only once and not for every embed call
            # The distillation should only happen the first time on the entire vocabulary
            self._has_distilled = True

        # Embed the documents
        embeddings = self.embedding_model.encode(documents, show_progress_bar=verbose)
        return embeddings

    def _check_model2vec_installation(self):
        try:
            from model2vec.distill import distill  # noqa: F401
        except ImportError:
            raise ImportError("To distill a model using model2vec, you need to run `pip install model2vec[distill]`")
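
A sketch of distillation with a custom vocabulary, assuming `model2vec[distill]` is installed; the `CountVectorizer` settings are illustrative:

```python
from sklearn.feature_extraction.text import CountVectorizer

from bertopic.backend import Model2VecBackend

# Restrict the distilled vocabulary via a custom CountVectorizer
vectorizer = CountVectorizer(stop_words="english", max_features=5_000)
sentence_model = Model2VecBackend(
    "sentence-transformers/all-MiniLM-L6-v2",
    distill=True,
    distill_vectorizer=vectorizer,
)
```

Note that the actual distillation is deferred until the first `embed` call, where the vocabulary is built from the documents being embedded.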

embed(documents, verbose=False)

Embed a list of n documents/words into an n-dimensional matrix of embeddings.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `documents` | `List[str]` | A list of documents or words to be embedded | required |
| `verbose` | `bool` | Controls the verbosity of the process | `False` |

Returns:

| Type | Description |
| --- | --- |
| `ndarray` | Document/word embeddings with shape (n, m) with `n` documents/words that each have an embedding size of `m` |

Source code in bertopic/backend/_model2vec.py
def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
    """Embed a list of n documents/words into an n-dimensional
    matrix of embeddings.

    Arguments:
        documents: A list of documents or words to be embedded
        verbose: Controls the verbosity of the process

    Returns:
        Document/words embeddings with shape (n, m) with `n` documents/words
        that each have an embeddings size of `m`
    """
    # Distill the model
    if self.distill and not self._has_distilled:
        from model2vec.distill import distill

        # Distill with the vocabulary of the documents
        if not self.distill_kwargs.get("vocabulary"):
            X = self.distill_vectorizer.fit_transform(documents)
            word_counts = np.array(X.sum(axis=0)).flatten()
            words = self.distill_vectorizer.get_feature_names_out()
            vocabulary = [word for word, _ in sorted(zip(words, word_counts), key=lambda x: x[1], reverse=True)]
            self.distill_kwargs["vocabulary"] = vocabulary

        # Distill the model
        self.embedding_model = distill(self.embedding_model, **self.distill_kwargs)

        # Distillation should happen only once and not for every embed call
        # The distillation should only happen the first time on the entire vocabulary
        self._has_distilled = True

    # Embed the documents
    embeddings = self.embedding_model.encode(documents, show_progress_bar=verbose)
    return embeddings

MultiModalBackend

Bases: BaseEmbedder

Multimodal backend using Sentence-transformers.

The sentence-transformers embedding model used for generating word, document, and image embeddings.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `embedding_model` | `Union[str, SentenceTransformer]` | A sentence-transformers embedding model that can either embed both images and text or only text. If it only embeds text, then `image_model` needs to be used to embed the images. | required |
| `image_model` | `Union[str, SentenceTransformer]` | A sentence-transformers embedding model that is used to embed only images. | `None` |
| `batch_size` | `int` | The size of the image batches to pass | `32` |

Examples:

To create a model, you can load in a string pointing to a sentence-transformers model:

```python
from bertopic.backend import MultiModalBackend

sentence_model = MultiModalBackend("clip-ViT-B-32")
```

or you can instantiate a model yourself:

```python
from bertopic.backend import MultiModalBackend
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("clip-ViT-B-32")
sentence_model = MultiModalBackend(embedding_model)
```

Source code in bertopic/backend/_multimodal.py
class MultiModalBackend(BaseEmbedder):
    """Multimodal backend using Sentence-transformers.

    The sentence-transformers embedding model used for
    generating word, document, and image embeddings.

    Arguments:
        embedding_model: A sentence-transformers embedding model that
                         can either embed both images and text or only text.
                         If it only embeds text, then `image_model` needs
                         to be used to embed the images.
        image_model: A sentence-transformers embedding model that is used
                     to embed only images.
        batch_size: The size of the image batches to pass

    Examples:
    To create a model, you can load in a string pointing to a
    sentence-transformers model:

    ```python
    from bertopic.backend import MultiModalBackend

    sentence_model = MultiModalBackend("clip-ViT-B-32")
    ```

    or you can instantiate a model yourself:
    ```python
    from bertopic.backend import MultiModalBackend
    from sentence_transformers import SentenceTransformer

    embedding_model = SentenceTransformer("clip-ViT-B-32")
    sentence_model = MultiModalBackend(embedding_model)
    ```
    """

    def __init__(
        self,
        embedding_model: Union[str, SentenceTransformer],
        image_model: Union[str, SentenceTransformer] = None,
        batch_size: int = 32,
    ):
        super().__init__()
        self.batch_size = batch_size

        # Text or Text+Image model
        if isinstance(embedding_model, SentenceTransformer):
            self.embedding_model = embedding_model
        elif isinstance(embedding_model, str):
            self.embedding_model = SentenceTransformer(embedding_model)
        else:
            raise ValueError(
                "Please select a correct SentenceTransformers model: \n"
                "`from sentence_transformers import SentenceTransformer` \n"
                "`model = SentenceTransformer('clip-ViT-B-32')`"
            )

        # Image Model
        self.image_model = None
        if image_model is not None:
            if isinstance(image_model, SentenceTransformer):
                self.image_model = image_model
            elif isinstance(image_model, str):
                self.image_model = SentenceTransformer(image_model)
            else:
                raise ValueError(
                    "Please select a correct SentenceTransformers model: \n"
                    "`from sentence_transformers import SentenceTransformer` \n"
                    "`model = SentenceTransformer('clip-ViT-B-32')`"
                )

        try:
            self.tokenizer = self.embedding_model._first_module().processor.tokenizer
        except AttributeError:
            self.tokenizer = self.embedding_model.tokenizer
        except:  # noqa: E722
            self.tokenizer = None

    def embed(self, documents: List[str], images: List[str] = None, verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words or images into an n-dimensional
        matrix of embeddings.

        Either documents, images, or both can be provided. If both are provided,
        then the embeddings are averaged.

        Arguments:
            documents: A list of documents or words to be embedded
            images: A list of image paths to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Embed documents
        doc_embeddings = None
        if documents[0] is not None:
            doc_embeddings = self.embed_documents(documents)

        # Embed images
        image_embeddings = None
        if isinstance(images, list):
            image_embeddings = self.embed_images(images, verbose)

        # Average embeddings
        averaged_embeddings = None
        if doc_embeddings is not None and image_embeddings is not None:
            averaged_embeddings = np.mean([doc_embeddings, image_embeddings], axis=0)

        if averaged_embeddings is not None:
            return averaged_embeddings
        elif doc_embeddings is not None:
            return doc_embeddings
        elif image_embeddings is not None:
            return image_embeddings

    def embed_documents(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        truncated_docs = [self._truncate_document(doc) for doc in documents]
        embeddings = self.embedding_model.encode(truncated_docs, show_progress_bar=verbose)
        return embeddings

    def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n words into an n-dimensional
        matrix of embeddings.

        Arguments:
            words: A list of words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Word embeddings with shape (n, m) with `n` words
            that each have an embeddings size of `m`
        """
        embeddings = self.embedding_model.encode(words, show_progress_bar=verbose)
        return embeddings

    def embed_images(self, images, verbose):
        if self.batch_size:
            nr_iterations = int(np.ceil(len(images) / self.batch_size))

            # Embed images per batch
            embeddings = []
            for i in tqdm(range(nr_iterations), disable=not verbose):
                start_index = i * self.batch_size
                end_index = (i * self.batch_size) + self.batch_size

                images_to_embed = [
                    Image.open(image) if isinstance(image, str) else image for image in images[start_index:end_index]
                ]
                if self.image_model is not None:
                    img_emb = self.image_model.encode(images_to_embed)
                else:
                    img_emb = self.embedding_model.encode(images_to_embed, show_progress_bar=False)
                embeddings.extend(img_emb.tolist())

                # Close images
                if isinstance(images[0], str):
                    for image in images_to_embed:
                        image.close()
            embeddings = np.array(embeddings)
        else:
            images_to_embed = [Image.open(filepath) for filepath in images]
            if self.image_model is not None:
                embeddings = self.image_model.encode(images_to_embed)
            else:
                embeddings = self.embedding_model.encode(images_to_embed, show_progress_bar=False)
        return embeddings

    def _truncate_document(self, document):
        if self.tokenizer:
            tokens = self.tokenizer.encode(document)

            if len(tokens) > 77:
                # Skip the starting token, only include 75 tokens
                truncated_tokens = tokens[1:76]
                document = self.tokenizer.decode(truncated_tokens)

                # Recursive call here, because the encode(decode()) can have different result
                return self._truncate_document(document)

        return document
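
A sketch of embedding text and images together; the captions and file paths are hypothetical. When both inputs are given, the text and image embeddings are averaged per pair:

```python
from bertopic.backend import MultiModalBackend

embedding_model = MultiModalBackend("clip-ViT-B-32", batch_size=32)

docs = ["a photo of a cat", "a photo of a dog"]   # hypothetical captions
images = ["cat.jpg", "dog.jpg"]                   # hypothetical image paths
embeddings = embedding_model.embed(docs, images)  # shape (2, m), averaged
```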

embed(documents, images=None, verbose=False)

Embed a list of n documents/words or images into an n-dimensional matrix of embeddings.

Either documents, images, or both can be provided. If both are provided, then the embeddings are averaged.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `documents` | `List[str]` | A list of documents or words to be embedded | required |
| `images` | `List[str]` | A list of image paths to be embedded | `None` |
| `verbose` | `bool` | Controls the verbosity of the process | `False` |

Returns:

| Type | Description |
| --- | --- |
| `ndarray` | Document/word embeddings with shape (n, m) with `n` documents/words that each have an embedding size of `m` |

Source code in bertopic/backend/_multimodal.py
def embed(self, documents: List[str], images: List[str] = None, verbose: bool = False) -> np.ndarray:
    """Embed a list of n documents/words or images into an n-dimensional
    matrix of embeddings.

    Either documents, images, or both can be provided. If both are provided,
    then the embeddings are averaged.

    Arguments:
        documents: A list of documents or words to be embedded
        images: A list of image paths to be embedded
        verbose: Controls the verbosity of the process

    Returns:
        Document/words embeddings with shape (n, m) with `n` documents/words
        that each have an embeddings size of `m`
    """
    # Embed documents
    doc_embeddings = None
    if documents[0] is not None:
        doc_embeddings = self.embed_documents(documents)

    # Embed images
    image_embeddings = None
    if isinstance(images, list):
        image_embeddings = self.embed_images(images, verbose)

    # Average embeddings
    averaged_embeddings = None
    if doc_embeddings is not None and image_embeddings is not None:
        averaged_embeddings = np.mean([doc_embeddings, image_embeddings], axis=0)

    if averaged_embeddings is not None:
        return averaged_embeddings
    elif doc_embeddings is not None:
        return doc_embeddings
    elif image_embeddings is not None:
        return image_embeddings

embed_documents(documents, verbose=False)

Embed a list of n documents/words into an n-dimensional matrix of embeddings.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `documents` | `List[str]` | A list of documents or words to be embedded | required |
| `verbose` | `bool` | Controls the verbosity of the process | `False` |

Returns:

| Type | Description |
| --- | --- |
| `ndarray` | Document/word embeddings with shape (n, m) with `n` documents/words that each have an embedding size of `m` |

Source code in bertopic/backend/_multimodal.py
def embed_documents(self, documents: List[str], verbose: bool = False) -> np.ndarray:
    """Embed a list of n documents/words into an n-dimensional
    matrix of embeddings.

    Arguments:
        documents: A list of documents or words to be embedded
        verbose: Controls the verbosity of the process

    Returns:
        Document/words embeddings with shape (n, m) with `n` documents/words
        that each have an embeddings size of `m`
    """
    truncated_docs = [self._truncate_document(doc) for doc in documents]
    embeddings = self.embedding_model.encode(truncated_docs, show_progress_bar=verbose)
    return embeddings

embed_words(words, verbose=False)

Embed a list of n words into an n-dimensional matrix of embeddings.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `words` | `List[str]` | A list of words to be embedded | required |
| `verbose` | `bool` | Controls the verbosity of the process | `False` |

Returns:

| Type | Description |
| --- | --- |
| `ndarray` | Word embeddings with shape (n, m) with `n` words that each have an embedding size of `m` |

Source code in bertopic/backend/_multimodal.py
def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
    """Embed a list of n words into an n-dimensional
    matrix of embeddings.

    Arguments:
        words: A list of words to be embedded
        verbose: Controls the verbosity of the process

    Returns:
        Word embeddings with shape (n, m) with `n` words
        that each have an embeddings size of `m`
    """
    embeddings = self.embedding_model.encode(words, show_progress_bar=verbose)
    return embeddings

OpenAIBackend

Bases: BaseEmbedder

OpenAI Embedding Model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `client` | `OpenAI` | An `openai.OpenAI` client. | required |
| `embedding_model` | `str` | An OpenAI model. Default is "text-embedding-ada-002". For an overview of models see: https://platform.openai.com/docs/models/embeddings | `'text-embedding-ada-002'` |
| `delay_in_seconds` | `float` | If a `batch_size` is given, use this to set the delay in seconds between batches. | `None` |
| `batch_size` | `int` | The size of each batch. | `None` |
| `generator_kwargs` | `Mapping[str, Any]` | Kwargs passed to `openai.Embedding.create`. Can be used to define custom engines or deployment_ids. | `{}` |

Examples:

```python
import openai
from bertopic.backend import OpenAIBackend

client = openai.OpenAI(api_key="sk-...")
openai_embedder = OpenAIBackend(client, "text-embedding-ada-002")
```

Source code in bertopic/backend/_openai.py
class OpenAIBackend(BaseEmbedder):
    """OpenAI Embedding Model.

    Arguments:
        client: An `openai.OpenAI` client.
        embedding_model: An OpenAI model. Default is "text-embedding-ada-002".
                         For an overview of models see:
                         https://platform.openai.com/docs/models/embeddings
        delay_in_seconds: If a `batch_size` is given, use this to set
                          the delay in seconds between batches.
        batch_size: The size of each batch.
        generator_kwargs: Kwargs passed to `openai.Embedding.create`.
                          Can be used to define custom engines or
                          deployment_ids.

    Examples:
    ```python
    import openai
    from bertopic.backend import OpenAIBackend

    client = openai.OpenAI(api_key="sk-...")
    openai_embedder = OpenAIBackend(client, "text-embedding-ada-002")
    ```
    """

    def __init__(
        self,
        client: openai.OpenAI,
        embedding_model: str = "text-embedding-ada-002",
        delay_in_seconds: float = None,
        batch_size: int = None,
        generator_kwargs: Mapping[str, Any] = {},
    ):
        super().__init__()
        self.client = client
        self.embedding_model = embedding_model
        self.delay_in_seconds = delay_in_seconds
        self.batch_size = batch_size
        self.generator_kwargs = generator_kwargs

        if self.generator_kwargs.get("model"):
            self.embedding_model = generator_kwargs.get("model")
        elif not self.generator_kwargs.get("engine"):
            self.generator_kwargs["model"] = self.embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Prepare documents, replacing empty strings with a single space
        prepared_documents = [" " if doc == "" else doc for doc in documents]

        # Batch-wise embedding extraction
        if self.batch_size is not None:
            embeddings = []
            for batch in tqdm(self._chunks(prepared_documents), disable=not verbose):
                response = self.client.embeddings.create(input=batch, **self.generator_kwargs)
                embeddings.extend([r.embedding for r in response.data])

                # Delay subsequent calls
                if self.delay_in_seconds:
                    time.sleep(self.delay_in_seconds)

        # Extract embeddings all at once
        else:
            response = self.client.embeddings.create(input=prepared_documents, **self.generator_kwargs)
            embeddings = [r.embedding for r in response.data]
        return np.array(embeddings)

    def _chunks(self, documents):
        for i in range(0, len(documents), self.batch_size):
            yield documents[i : i + self.batch_size]
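
A usage sketch with batching; the batch size and delay are assumptions to tune against your own rate limits:

```python
import openai
from bertopic.backend import OpenAIBackend

client = openai.OpenAI(api_key="sk-...")
openai_embedder = OpenAIBackend(
    client,
    "text-embedding-ada-002",
    batch_size=128,        # assumption: requests are chunked into batches of 128
    delay_in_seconds=0.5,  # pause between batches
)
embeddings = openai_embedder.embed(["first document", "second document"])
```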

embed(documents, verbose=False)

Embed a list of n documents/words into an n-dimensional matrix of embeddings.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `documents` | `List[str]` | A list of documents or words to be embedded | required |
| `verbose` | `bool` | Controls the verbosity of the process | `False` |

Returns:

| Type | Description |
| --- | --- |
| `ndarray` | Document/word embeddings with shape (n, m) with `n` documents/words that each have an embedding size of `m` |

Source code in bertopic/backend/_openai.py
def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
    """Embed a list of n documents/words into an n-dimensional
    matrix of embeddings.

    Arguments:
        documents: A list of documents or words to be embedded
        verbose: Controls the verbosity of the process

    Returns:
        Document/words embeddings with shape (n, m) with `n` documents/words
        that each have an embeddings size of `m`
    """
    # Prepare documents, replacing empty strings with a single space
    prepared_documents = [" " if doc == "" else doc for doc in documents]

    # Batch-wise embedding extraction
    if self.batch_size is not None:
        embeddings = []
        for batch in tqdm(self._chunks(prepared_documents), disable=not verbose):
            response = self.client.embeddings.create(input=batch, **self.generator_kwargs)
            embeddings.extend([r.embedding for r in response.data])

            # Delay subsequent calls
            if self.delay_in_seconds:
                time.sleep(self.delay_in_seconds)

    # Extract embeddings all at once
    else:
        response = self.client.embeddings.create(input=prepared_documents, **self.generator_kwargs)
        embeddings = [r.embedding for r in response.data]
    return np.array(embeddings)

WordDocEmbedder

Bases: BaseEmbedder

Combine a document- and word-level embedder.

Source code in bertopic/backend/_word_doc.py
class WordDocEmbedder(BaseEmbedder):
    """Combine a document- and word-level embedder."""

    def __init__(self, embedding_model, word_embedding_model):
        super().__init__()

        self.embedding_model = select_backend(embedding_model)
        self.word_embedding_model = select_backend(word_embedding_model)

    def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n words into an n-dimensional
        matrix of embeddings.

        Arguments:
            words: A list of words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Word embeddings with shape (n, m) with `n` words
            that each have an embeddings size of `m`

        """
        return self.word_embedding_model.embed(words, verbose)

    def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n words into an n-dimensional
        matrix of embeddings.

        Arguments:
            document: A list of documents to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document embeddings with shape (n, m) with `n` documents
            that each have an embeddings size of `m`
        """
        return self.embedding_model.embed(document, verbose)
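
A minimal sketch of combining two backends; any value accepted by `select_backend` works, and the model names below are illustrative:

```python
from bertopic.backend import WordDocEmbedder

word_doc_embedder = WordDocEmbedder(
    embedding_model="all-MiniLM-L6-v2",       # used for documents
    word_embedding_model="all-MiniLM-L6-v2",  # used for words
)
```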

embed_documents(document, verbose=False)

Embed a list of n documents into an n-dimensional matrix of embeddings.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `document` | `List[str]` | A list of documents to be embedded | required |
| `verbose` | `bool` | Controls the verbosity of the process | `False` |

Returns:

| Type | Description |
| --- | --- |
| `ndarray` | Document embeddings with shape (n, m) with `n` documents that each have an embedding size of `m` |

Source code in bertopic/backend/_word_doc.py
def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray:
    """Embed a list of n words into an n-dimensional
    matrix of embeddings.

    Arguments:
        document: A list of documents to be embedded
        verbose: Controls the verbosity of the process

    Returns:
        Document embeddings with shape (n, m) with `n` documents
        that each have an embeddings size of `m`
    """
    return self.embedding_model.embed(document, verbose)

embed_words(words, verbose=False)

Embed a list of n words into an n-dimensional matrix of embeddings.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `words` | `List[str]` | A list of words to be embedded | required |
| `verbose` | `bool` | Controls the verbosity of the process | `False` |

Returns:

| Type | Description |
| --- | --- |
| `ndarray` | Word embeddings with shape (n, m) with `n` words that each have an embedding size of `m` |

Source code in bertopic/backend/_word_doc.py
def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
    """Embed a list of n words into an n-dimensional
    matrix of embeddings.

    Arguments:
        words: A list of words to be embedded
        verbose: Controls the verbosity of the process

    Returns:
        Word embeddings with shape (n, m) with `n` words
        that each have an embeddings size of `m`

    """
    return self.word_embedding_model.embed(words, verbose)