The Chroma vector database (chromadb)

Author: DayDayUp_hhxx | Published 2023-07-25 16:13

    1. Installation

    pip install chromadb
    

    2. Create a client

    import chromadb
    chroma_client = chromadb.Client()
    

    3. Create a collection

    collection = chroma_client.create_collection(name="my_collection")
    

    4. Add data to a collection
    Note that the dimensionality of the embeddings must stay consistent; the function that generates the embeddings is declared when the collection is defined (see sections 8 and 14).

    collection.add(
        documents=["This is a document", "This is another document"],
        metadatas=[{"source": "my_source"}, {"source": "my_source"}],
        ids=["id1", "id2"]
    )
    
    # add data that already carries its own embeddings
    collection.add(
        embeddings=[[1.2, 2.3, 4.5], [6.7, 8.2, 9.2]],
        documents=["This is a document", "This is another document"],
        metadatas=[{"source": "my_source"}, {"source": "my_source"}],
        ids=["id1", "id2"]
    )
    

    5. Query

    results = collection.query(query_texts=["This is a query document"], n_results=2)
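
    The result is a dict of lists with one inner list per query text; a minimal sketch of reading it back:

    print(results["ids"])        # e.g. [["id1", "id2"]]
    print(results["documents"])  # the matched documents
    print(results["distances"])  # distance of each match to the query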
    

    6. Persist data locally

    client = chromadb.PersistentClient(path="/path/to/save/to")
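
    Collections created through this client are written to the given path, so a later run pointing at the same directory can reopen them; a minimal sketch building on the client above:

    collection = client.get_or_create_collection(name="my_collection")
    collection.add(documents=["persisted document"], ids=["id1"])
    print(collection.count())  # still there the next time the same path is opened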
    

    7. Client/server mode

    # start the Chroma server with docker
    docker-compose up -d --build
    # connect to the server
    import chromadb
    chroma_client = chromadb.HttpClient(host='localhost', port=8000)
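
    Once connected, the HTTP client is used exactly like the local client; heartbeat() is a quick way to check that the server is reachable (a minimal sketch):

    print(chroma_client.heartbeat())  # returns a timestamp if the server is up
    collection = chroma_client.get_or_create_collection(name="my_collection")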
    

    8. Working with collections
    If a collection was created with an embedding_function, the same embedding_function must also be passed when the collection is read back; emb_fn below is any embedding function (see the sketch after this block and section 14).
    By default a collection uses the "all-MiniLM-L6-v2" model.

    collection = client.create_collection(name="my_collection", embedding_function=emb_fn)
    collection = client.get_collection(name="my_collection", embedding_function=emb_fn)
    # create the collection if it does not already exist
    collection = client.get_or_create_collection(name="test")
    # delete the collection
    client.delete_collection(name="my_collection")
    # return the first 10 records
    collection.peek()
    # count the records
    collection.count()
    # rename the collection
    collection.modify(name="new_name")
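
    Here emb_fn stands for any embedding function from chromadb.utils.embedding_functions (section 14); a minimal sketch using the default model:

    from chromadb.utils import embedding_functions

    emb_fn = embedding_functions.DefaultEmbeddingFunction()  # all-MiniLM-L6-v2
    collection = client.create_collection(name="my_collection", embedding_function=emb_fn)
    collection = client.get_collection(name="my_collection", embedding_function=emb_fn)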
    

    9. Changing the distance function

     collection = client.create_collection(
            name="collection_name",
            metadata={"hnsw:space": "cosine"} # l2 is the default
        )
    # hnsw:space can be "l2", "ip", or "cosine"
    

    10. Adding data
    ids is a required parameter;
    at least one of documents and embeddings must be supplied: if documents are passed, embeddings are generated with the collection's embedding_function; if embeddings are passed, their dimensionality must match the collection;
    metadatas is optional and mainly describes attributes of the data, which can later be used as filter conditions when querying.

    collection.add(
        documents=["doc1", "doc2", "doc3", ...],
        embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...],
        metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...],
        ids=["id1", "id2", "id3", ...]
    )
    

    11. Querying data
    ids are always returned;
    embeddings are not returned by default;
    distances, documents, and metadatas can be selected with include (see the sketch after the block below).

    collection.query(
        query_embeddings=[[11.1, 12.1, 13.1],[1.1, 2.3, 3.2], ...],
        n_results=10,
        where={"metadata_field": "is_equal_to_this"},
        where_document={"$contains":"search_string"}
    )
    # or
    collection.query(
        query_texts=["doc10", "thus spake zarathustra", ...],
        n_results=10,
        where={"metadata_field": "is_equal_to_this"},
        where_document={"$contains":"search_string"}
    )
    # or
    collection.get(
        ids=["id1", "id2", "id3", ...],
        include=["documents"],
        where={"style": "style1"}
    )
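
    A sketch of include: embeddings are left out unless requested, while documents, metadatas, and distances can be asked for explicitly:

    results = collection.query(
        query_texts=["This is a query document"],
        n_results=2,
        include=["documents", "metadatas", "distances"]  # add "embeddings" to get the vectors back
    )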
    

    "where" filters on metadata

    where = {"chapter":{"$eq":"3"}}
    
    $eq - equal to (string, int, float)
    $ne - not equal to (string, int, float)
    $gt - greater than (int, float)
    $gte - greater than or equal to (int, float)
    $lt - less than (int, float)
    $lte - less than or equal to (int, float)
    # use "$and" and "$or" to combine multiple filter conditions
    collection.query(query_embeddings=[[1.2, 2.3, 4.5]],
                    where={"$or":[{"source":{"$eq":"my_source"}},{"source":{"$eq":"my_source1"}}]})
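
    The operators also compose with "$and", e.g. a range filter (this assumes the chapter metadata was stored as an int rather than a string):

    collection.query(
        query_embeddings=[[1.2, 2.3, 4.5]],
        where={"$and": [{"chapter": {"$gte": 3}}, {"chapter": {"$lte": 20}}]}
    )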
    

    "where_document" filters on document content

     where_document={"$contains":"search_string"}
    

    12. Updating data
    update raises an error if an id does not exist;
    upsert inserts the record if an id does not exist;
    when updating, missing embeddings are generated automatically; if embeddings are passed in, their dimensionality must match the collection.

    collection.update(
        ids=["id1", "id2", "id3", ...],
        embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...],
        metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...],
        documents=["doc1", "doc2", "doc3", ...],
    )
    # insert if the ids do not exist
    collection.upsert(
        ids=["id1", "id2", "id3", ...],
        embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...],
        metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...],
        documents=["doc1", "doc2", "doc3", ...],
    )
    

    13. Deleting data
    Delete by ids or by a where condition.

    collection.delete(
        ids=["id1", "id2", "id3",...],
        where={"chapter": "20"}
    )
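
    Either argument works on its own; deleting with only a where filter removes every matching record (a minimal sketch):

    collection.delete(where={"chapter": "20"})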
    

    14. Embeddings

    from chromadb.utils import embedding_functions
    
    # the default is all-MiniLM-L6-v2, which produces 384-dimensional embeddings
    default_ef = embedding_functions.DefaultEmbeddingFunction()
    default_ef(["this is a test"])
    
    #Sentence Transformers
    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
    
    #OpenAI
    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                    api_key="YOUR_API_KEY",
                    model_name="text-embedding-ada-002"
                )
    
    #Cohere
    #Instructor models
    #Google PaLM API models
    # custom embedding functions (see the sketch below)
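
    For the custom case, an embedding function is a callable that turns a list of documents into a list of vectors; a minimal sketch of the pattern from the Chroma docs (the stand-in vectors below are not a real model, and older chromadb versions name the argument texts instead of input):

    from chromadb import Documents, EmbeddingFunction, Embeddings

    class MyEmbeddingFunction(EmbeddingFunction):
        def __call__(self, input: Documents) -> Embeddings:
            # replace the stand-in below with calls to your own embedding model
            return [[0.0, 0.0, 0.0] for _ in input]

    collection = client.create_collection(name="custom_collection", embedding_function=MyEmbeddingFunction())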
    

    Reference: the Chroma official docs, https://docs.trychroma.com/
