1.安装
pip install chromadb
2.生成client
import chromadb
chroma_client = chromadb.Client()
3.创建collection
collection = chroma_client.create_collection(name="my_collection")
4.添加数据到collection
需要注意embeddings的维度必须保持一致;生成embedding的函数(embedding_function)在创建collection时指定。
collection.add(
documents=["This is a document", "This is another document"],
metadatas=[{"source": "my_source"}, {"source": "my_source"}],
ids=["id1", "id2"]
)
#添加含有embeddings的数据
collection.add(
embeddings=[[1.2, 2.3, 4.5], [6.7, 8.2, 9.2]],
documents=["This is a document", "This is another document"],
metadatas=[{"source": "my_source"}, {"source": "my_source"}],
ids=["id1", "id2"]
)
5.查询
results = collection.query(query_texts=["This is a query document"], n_results=2)
6.将数据保存在本地
client = chromadb.PersistentClient(path="/path/to/save/to")
7.使用客户端和服务端模式
使用docker
docker-compose up -d --build
#连接服务端
import chromadb
chroma_client = chromadb.HttpClient(host='localhost', port=8000)
8.使用collections
如果collection创建的时候指定了embedding_function,那么再次读取的时候也需要指定embedding_function。
collection默认使用“all-MiniLM-L6-v2”模型。
collection = client.create_collection(name="my_collection", embedding_function=emb_fn)
collection = client.get_collection(name="my_collection", embedding_function=emb_fn)
#如果不存在则创建
collection = client.get_or_create_collection(name="test")
#删除
client.delete_collection(name="my_collection")
#返回前10条
collection.peek()
#计数
collection.count()
#重命名
collection.modify(name="new_name")
9.修改相似函数
collection = client.create_collection(
name="collection_name",
metadata={"hnsw:space": "cosine"} # l2 is the default
)
#hnsw:space 的可选值有 "l2"、"ip"、"cosine"
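(原文此处为图片 image.png,应为三种距离函数的计算公式,待与官方文档核对:)
l2(平方欧氏距离):$d = \sum_i (A_i - B_i)^2$
ip(内积):$d = 1 - \sum_i A_i B_i$
cosine(余弦距离):$d = 1 - \dfrac{\sum_i A_i B_i}{\sqrt{\sum_i A_i^2}\,\sqrt{\sum_i B_i^2}}$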
10.添加数据
ids为必需参数;
documents和embeddings至少需要传入一个:如果只传入documents,则根据collection的embedding_function自动生成embeddings;如果传入embeddings,则要保证其维度与collection中已有数据的维度一致;
metadatas为选填,主要用来描述数据属性,以后查询数据时可以作为筛选条件。
collection.add(
documents=["doc1", "doc2", "doc3", ...],
embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...],
metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...],
ids=["id1", "id2", "id3", ...]
)
11.查询数据
ids为必返回参数;
embeddings默认不返回;
distances、documents、metadatas 可以使用include指定;
collection.query(
query_embeddings=[[11.1, 12.1, 13.1],[1.1, 2.3, 3.2], ...],
n_results=10,
where={"metadata_field": "is_equal_to_this"},
where_document={"$contains":"search_string"}
)
#或者
collection.query(
query_texts=["doc10", "thus spake zarathustra", ...],
n_results=10,
where={"metadata_field": "is_equal_to_this"},
where_document={"$contains":"search_string"}
)
#或者
collection.get(
ids=["id1", "id2", "id3", ...],
include=["documents"],
where={"style": "style1"}
)
“where” 过滤metadata
where = {"chapter":{"$eq":"3"}}
$eq - equal to (string, int, float)
$ne - not equal to (string, int, float)
$gt - greater than (int, float)
$gte - greater than or equal to (int, float)
$lt - less than (int, float)
$lte - less than or equal to (int, float)
#使用 "$and"、"$or" 进行多条件过滤
collection.query(query_embeddings=[[1.2, 2.3, 4.5]],
where={"$or":[{"source":{"$eq":"my_source"}},{"source":{"$eq":"my_source1"}}]})
“where_document” 过滤document
where_document={"$contains":"search_string"}
12.更新数据
使用update,如果ids不存在则报错;
使用upsert,如果ids不存在则插入;
更新数据时,如果只传入documents而未传入embeddings,则会根据embedding_function自动重新生成embeddings;如果传入embeddings,需要注意其维度与collection保持一致。
collection.update(
ids=["id1", "id2", "id3", ...],
embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...],
metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...],
documents=["doc1", "doc2", "doc3", ...],
)
#如不存在则插入
collection.upsert(
ids=["id1", "id2", "id3", ...],
embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...],
metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...],
documents=["doc1", "doc2", "doc3", ...],
)
13.删除数据
使用ids或者where条件
collection.delete(
ids=["id1", "id2", "id3",...],
where={"chapter": "20"}
)
14.Embeddings
from chromadb.utils import embedding_functions
#默认使用all-MiniLM-L6-v2,维度为384
default_ef = embedding_functions.DefaultEmbeddingFunction()
default_ef ("this is test")
#Sentence Transformers
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
#OpenAI
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
api_key="YOUR_API_KEY",
model_name="text-embedding-ada-002"
)
#Cohere
#Instructor models
#Google PaLM API models
#自定义
参考:Chroma官网 https://docs.trychroma.com/
网友评论