LangChain supports several vector stores, such as Pinecone and Chroma. Pinecone, the most popular option, has never felt entirely stable in my experience, and requesting an API key from its website is sometimes inconvenient, so this post looks at working with the FAISS vector store instead.
Import the required packages
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import UnstructuredWordDocumentLoader, PyPDFium2Loader, DirectoryLoader, PyPDFLoader, TextLoader
from keys import OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_API_ENV, OPENAI_API_BASE
import os
import json
Load the data into LangChain, using the same txt-loading approach as in the previous post.
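The load_txt helper itself comes from that earlier post and is not repeated here; a minimal sketch of what it might look like, assuming a folder of plain .txt files loaded one file at a time with TextLoader (the original's globbing and encoding handling may differ):
# Sketch of load_txt (assumption): return a list of per-file Document lists
def load_txt(directory_path):
    document_list = []
    for name in os.listdir(directory_path):
        if name.endswith(".txt"):
            loader = TextLoader(os.path.join(directory_path, name), encoding="utf-8")
            document_list.append(loader.load())
    return document_list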
## Create the embedding model, routed through a custom api_base
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, openai_api_base=OPENAI_API_BASE)
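To confirm the key and api_base are actually reachable before building an index, you can embed a short test string (an optional check, not part of the original flow):
# Optional sanity check: embed a short string through the configured endpoint
vector = embeddings.embed_query("hello world")
print(len(vector))  # text-embedding-ada-002 returns 1536-dimensional vectors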
def documents2dict(documents):
    # Convert a list of Document objects into plain dictionaries
    documents_dict = [
        {'page_content': document.page_content, 'metadata': document.metadata}
        for document in documents
    ]
    return documents_dict
# Text splitter used by save_documents; chunk_size/chunk_overlap are example values
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

def save_documents(documents, index="faiss_index"):
    print("documents:", documents)
    # Split the documents into chunks, embed them, and persist the FAISS index locally
    docs = text_splitter.split_documents(documents)
    print("docs:", docs)
    db = FAISS.from_documents(docs, embeddings)
    db.save_local(index)
    return db
def get_documents(index="faiss_index", query="", limit=3):
    # Load the saved index; the same embeddings object used to build it must be passed in
    db = FAISS.load_local(index, embeddings)
    # Return the top `limit` most similar chunks as plain dictionaries
    docs = db.similarity_search(query, k=limit)
    txts = documents2dict(docs)
    print("txts:", txts)
    return txts
directory_path = 'content'
document_list = load_txt(directory_path)
## Load each batch of documents into FAISS
for documents in document_list:
    save_documents(documents)
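Note that, as written, every call to save_documents saves to the same faiss_index folder, so each batch overwrites the previous one. If you want a single index covering all batches, one option (a sketch, not the original code) is to build a store per batch and combine them with merge_from before saving:
# Sketch: merge all batches into one index instead of overwriting per batch
combined = None
for documents in document_list:
    docs = text_splitter.split_documents(documents)
    batch_db = FAISS.from_documents(docs, embeddings)
    if combined is None:
        combined = batch_db
    else:
        combined.merge_from(batch_db)  # merge_from is provided by LangChain's FAISS wrapper
if combined is not None:
    combined.save_local("faiss_index")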
Search for similar documents and convert the results to dictionaries
index="faiss_index"
query = "What did the president say about Ketanji Brown Jackson"
txts = get_documents(index,query,3)
print(txts)
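Because documents2dict returns plain dictionaries, the search results can be serialized directly; a small illustration (the metadata shape below is an assumption based on TextLoader, which records the source path):
# txts is a list of dicts, roughly of the form:
# [{'page_content': '...', 'metadata': {'source': 'content/example.txt'}}, ...]
print(json.dumps(txts, ensure_ascii=False, indent=2))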