Collection

class Collection()

A Collection object for PGVector.

Attributes:

  • client - The PGVector client.
  • collection_name str - The name of the collection. Default is “autogen-docs”.
  • embedding_function Callable - The embedding function used to generate the vector representation. Default is None. SentenceTransformer(“all-MiniLM-L6-v2”).encode will be used when None. Models can be chosen from: https://huggingface.co/models?library=sentence-transformers
  • metadata Optional[dict] - The metadata of the collection.
  • get_or_create Optional - The flag indicating whether to get or create the collection.

__init__

def __init__(client=None,
             collection_name: str = "autogen-docs",
             embedding_function: Callable = None,
             metadata=None,
             get_or_create=None)

Initialize the Collection object.

Arguments:

  • client - The PostgreSQL client.
  • collection_name - The name of the collection. Default is “autogen-docs”.
  • embedding_function - The embedding function used to generate the vector representation.
  • metadata - The metadata of the collection.
  • get_or_create - The flag indicating whether to get or create the collection.

Returns:

None

add

def add(ids: list[ItemID],
        documents: list,
        embeddings: list = None,
        metadatas: list = None) -> None

Add documents to the collection.

Arguments:

  • ids List[ItemID] - A list of document IDs.
  • embeddings List - A list of document embeddings. Optional
  • metadatas List - A list of document metadatas. Optional
  • documents List - A list of documents.

Returns:

None

upsert

def upsert(ids: list[ItemID],
           documents: list,
           embeddings: list = None,
           metadatas: list = None) -> None

Upsert documents into the collection.

Arguments:

  • ids List[ItemID] - A list of document IDs.
  • documents List - A list of documents.
  • embeddings List - A list of document embeddings.
  • metadatas List - A list of document metadatas.

Returns:

None

count

def count() -> int

Get the total number of documents in the collection.

Returns:

  • int - The total number of documents.

table_exists

def table_exists(table_name: str) -> bool

Check if a table exists in the PostgreSQL database.

Arguments:

  • table_name str - The name of the table to check.

Returns:

  • bool - True if the table exists, False otherwise.

get

def get(ids: Optional[str] = None,
        include: Optional[str] = None,
        where: Optional[str] = None,
        limit: Optional[Union[int, str]] = None,
        offset: Optional[Union[int, str]] = None) -> list[Document]

Retrieve documents from the collection.

Arguments:

  • ids Optional[List] - A list of document IDs.
  • include Optional - The fields to include.
  • where Optional - Additional filtering criteria.
  • limit Optional - The maximum number of documents to retrieve.
  • offset Optional - The offset for pagination.

Returns:

  • List - The retrieved documents.

update

def update(ids: list, embeddings: list, metadatas: list,
           documents: list) -> None

Update documents in the collection.

Arguments:

  • ids List - A list of document IDs.
  • embeddings List - A list of document embeddings.
  • metadatas List - A list of document metadatas.
  • documents List - A list of documents.

Returns:

None

euclidean_distance

@staticmethod
def euclidean_distance(arr1: list[float], arr2: list[float]) -> float

Calculate the Euclidean distance between two vectors.

Arguments:

  • arr1 (List[float]): The first vector.
  • arr2 (List[float]): The second vector.

Returns:

  • float: The Euclidean distance between arr1 and arr2.

cosine_distance

@staticmethod
def cosine_distance(arr1: list[float], arr2: list[float]) -> float

Calculate the cosine distance between two vectors.

Arguments:

  • arr1 (List[float]): The first vector.
  • arr2 (List[float]): The second vector.

Returns:

  • float: The cosine distance between arr1 and arr2.

inner_product_distance

@staticmethod
def inner_product_distance(arr1: list[float], arr2: list[float]) -> float

Calculate the inner product distance between two vectors.

Arguments:

  • arr1 (List[float]): The first vector.
  • arr2 (List[float]): The second vector.

Returns:

  • float: The inner product distance between arr1 and arr2.

query

def query(query_texts: list[str],
          collection_name: Optional[str] = None,
          n_results: Optional[int] = 10,
          distance_type: Optional[str] = "euclidean",
          distance_threshold: Optional[float] = -1,
          include_embedding: Optional[bool] = False) -> QueryResults

Query documents in the collection.

Arguments:

  • query_texts List[str] - A list of query texts.
  • collection_name Optional[str] - The name of the collection.
  • n_results int - The maximum number of results to return.
  • distance_type Optional[str] - Distance search type - euclidean or cosine
  • distance_threshold Optional[float] - Distance threshold to limit searches
  • include_embedding Optional[bool] - Include embedding values in QueryResults

Returns:

  • QueryResults - The query results.

convert_string_to_array

@staticmethod
def convert_string_to_array(array_string: str) -> list[float]

Convert a string representation of an array to a list of floats.

Arguments:

  • array_string (str): The string representation of the array.

Returns:

  • list: A list of floats parsed from the input string. If the input is not a string, it returns the input itself.

modify

def modify(metadata, collection_name: Optional[str] = None) -> None

Modify metadata for the collection.

Arguments:

  • collection_name - The name of the collection.
  • metadata - The new metadata.

Returns:

None

delete

def delete(ids: list[ItemID], collection_name: Optional[str] = None) -> None

Delete documents from the collection.

Arguments:

  • ids List[ItemID] - A list of document IDs to delete.
  • collection_name Optional[str] - The name of the collection to delete the documents from.

Returns:

None

delete_collection

def delete_collection(collection_name: Optional[str] = None) -> None

Delete the entire collection.

Arguments:

  • collection_name Optional[str] - The name of the collection to delete.

Returns:

None

create_collection

def create_collection(collection_name: Optional[str] = None,
                      dimension: Optional[Union[str, int]] = None) -> None

Create a new collection.

Arguments:

  • collection_name Optional[str] - The name of the new collection.
  • dimension Optional[Union[str, int]] - The dimension size of the sentence embedding model

Returns:

None

PGVectorDB

class PGVectorDB(VectorDB)

A vector database that uses PGVector as the backend.

__init__

def __init__(*,
             conn: Optional[psycopg.Connection] = None,
             connection_string: Optional[str] = None,
             host: Optional[str] = None,
             port: Optional[Union[int, str]] = None,
             dbname: Optional[str] = None,
             username: Optional[str] = None,
             password: Optional[str] = None,
             connect_timeout: Optional[int] = 10,
             embedding_function: Callable = None,
             metadata: Optional[dict] = None) -> None

Initialize the vector database.

Note: conn, connection_string, or host + port + dbname must be specified.

Arguments:

  • conn - psycopg.Connection | A custom connection object to connect to the database. A connection object may include additional key/values: https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING
  • connection_string - “postgresql://username:password@hostname:port/database” | The PGVector connection string. Default is None.
  • host - str | The host to connect to. Default is None.
  • port - int | The port to connect to. Default is None.
  • dbname - str | The database name to connect to. Default is None.
  • username - str | The database username to use. Default is None.
  • password - str | The database user password to use. Default is None.
  • connect_timeout - int | The timeout to set for the connection. Default is 10.
  • embedding_function - Callable | The embedding function used to generate the vector representation. Default is None. SentenceTransformer(“all-MiniLM-L6-v2”).encode will be used when None. Models can be chosen from: https://huggingface.co/models?library=sentence-transformers
  • metadata - dict | The metadata of the vector database. Default is None. If None, it will use this setting: {“hnsw:space”: “ip”, “hnsw:construction_ef”: 30, “hnsw:M”: 16}. Creates Index on table using hnsw (embedding vector_l2_ops) WITH (m = hnsw:M) ef_construction = “hnsw:construction_ef”. For more info: https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw

Returns:

None

establish_connection

def establish_connection(
        conn: Optional[psycopg.Connection] = None,
        connection_string: Optional[str] = None,
        host: Optional[str] = None,
        port: Optional[Union[int, str]] = None,
        dbname: Optional[str] = None,
        username: Optional[str] = None,
        password: Optional[str] = None,
        connect_timeout: Optional[int] = 10) -> psycopg.Connection

Establishes a connection to a PostgreSQL database using psycopg.

Arguments:

  • conn - An existing psycopg connection object. If provided, this connection will be used.
  • connection_string - A string containing the connection information. If provided, a new connection will be established using this string.
  • host - The hostname of the PostgreSQL server. Used if connection_string is not provided.
  • port - The port number to connect to at the server host. Used if connection_string is not provided.
  • dbname - The database name. Used if connection_string is not provided.
  • username - The username to connect as. Used if connection_string is not provided.
  • password - The user’s password. Used if connection_string is not provided.
  • connect_timeout - Maximum wait for connection, in seconds. The default is 10 seconds.

Returns:

A psycopg.Connection object representing the established connection.

Raises:

  • PermissionError - If no credentials are supplied.

  • psycopg.Error - If an error occurs while trying to connect to the database.

create_collection

def create_collection(collection_name: str,
                      overwrite: bool = False,
                      get_or_create: bool = True) -> Collection

Create a collection in the vector database. Case 1. if the collection does not exist, create the collection. Case 2. the collection exists, if overwrite is True, it will overwrite the collection. Case 3. the collection exists and overwrite is False, if get_or_create is True, it will get the collection, otherwise it raises a ValueError.

Arguments:

  • collection_name - str | The name of the collection.
  • overwrite - bool | Whether to overwrite the collection if it exists. Default is False.
  • get_or_create - bool | Whether to get the collection if it exists. Default is True.

Returns:

Collection | The collection object.

get_collection

def get_collection(collection_name: str = None) -> Collection

Get the collection from the vector database.

Arguments:

  • collection_name - str | The name of the collection. Default is None. If None, return the current active collection.

Returns:

Collection | The collection object.

delete_collection

def delete_collection(collection_name: str) -> None

Delete the collection from the vector database.

Arguments:

  • collection_name - str | The name of the collection.

Returns:

None

insert_docs

def insert_docs(docs: list[Document],
                collection_name: str = None,
                upsert: bool = False) -> None

Insert documents into the collection of the vector database.

Arguments:

  • docs - List[Document] | A list of documents. Each document is a TypedDict Document.
  • collection_name - str | The name of the collection. Default is None.
  • upsert - bool | Whether to update the document if it exists. Default is False.
  • kwargs - Dict | Additional keyword arguments. Note: the signature shown above does not declare **kwargs.

Returns:

None

update_docs

def update_docs(docs: list[Document], collection_name: str = None) -> None

Update documents in the collection of the vector database.

Arguments:

  • docs - List[Document] | A list of documents.
  • collection_name - str | The name of the collection. Default is None.

Returns:

None

delete_docs

def delete_docs(ids: list[ItemID], collection_name: str = None) -> None

Delete documents from the collection of the vector database.

Arguments:

  • ids - List[ItemID] | A list of document ids. Each id is a typed ItemID.
  • collection_name - str | The name of the collection. Default is None.
  • kwargs - Dict | Additional keyword arguments. Note: the signature shown above does not declare **kwargs.

Returns:

None

retrieve_docs

def retrieve_docs(queries: list[str],
                  collection_name: str = None,
                  n_results: int = 10,
                  distance_threshold: float = -1) -> QueryResults

Retrieve documents from the collection of the vector database based on the queries.

Arguments:

  • queries - List[str] | A list of queries. Each query is a string.
  • collection_name - str | The name of the collection. Default is None.
  • n_results - int | The number of relevant documents to return. Default is 10.
  • distance_threshold - float | The threshold for the distance score, only distance smaller than it will be returned. Don’t filter with it if < 0. Default is -1.
  • kwargs - Dict | Additional keyword arguments. Note: the signature shown above does not declare **kwargs.

Returns:

QueryResults | The query results: a list of lists of tuples, each tuple containing the document and the distance.

get_docs_by_ids

def get_docs_by_ids(ids: list[ItemID] = None,
                    collection_name: str = None,
                    include=None,
                    **kwargs) -> list[Document]

Retrieve documents from the collection of the vector database based on the ids.

Arguments:

  • ids - List[ItemID] | A list of document ids. If None, will return all the documents. Default is None.
  • collection_name - str | The name of the collection. Default is None.
  • include - List[str] | The fields to include. Default is None. If None, will include [“metadatas”, “documents”], ids will always be included.
  • kwargs - dict | Additional keyword arguments.

Returns:

List[Document] | The results.