I built a small app with langchain and streamlit that lets a user ask questions about a PDF file. The code is shown below:
```python
from dotenv import load_dotenv
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback


def main():
    load_dotenv()
    st.set_page_config(page_title="Ask your PDF")
    st.header("Ask your PDF")

    # Upload the file
    pdf = st.file_uploader("Upload your PDF", type="pdf")

    # Extract the text
    if pdf is not None:
        pdf_reader = PdfReader(pdf)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()

        # Split into chunks
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=500,
            chunk_overlap=100,
            length_function=len
        )
        chunks = text_splitter.split_text(text)

        # Create the embeddings
        embeddings = OpenAIEmbeddings()
        knowledge_base = FAISS.from_texts(chunks, embeddings)

        # Show user input
        user_question = st.text_input("Ask a question about your PDF:")
        if user_question:
            docs = knowledge_base.similarity_search(user_question)

            llm = OpenAI()
            chain = load_qa_chain(llm)
            with get_openai_callback() as cb:
                response = chain.run(input_documents=docs, question=user_question)
                print(cb)

            st.write(response)


if __name__ == '__main__':
    main()
```
Can anyone suggest how to retrieve or render the PDF page that the answer or information was extracted from? I found this, but couldn't implement it correctly.
Answer:
Here is a simple approach.
- While reading the PDF, also save each page's content together with its page number.
```python
# Extract the text
if pdf is not None:
    pdf_reader = PdfReader(pdf)
    text = ""
    page_dict = {}
    for i, page in enumerate(pdf_reader.pages):
        page_content = page.extract_text()
        text += page_content + '\n\n'
        page_dict[page_content] = i + 1
```
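One caveat worth noting (my own observation, not part of the original answer): `page_dict` uses the page text as the key, so two pages with identical text, for example blank pages, would overwrite each other. A minimal sketch of an alternative that keeps a list of `(page_num, page_content)` pairs instead:

```python
from PyPDF2 import PdfReader

# Alternative bookkeeping: a list of (page_num, page_content) pairs instead of a
# dict keyed by page text, so identical pages cannot overwrite each other.
pdf_reader = PdfReader(pdf)
text = ""
pages = []
for i, page in enumerate(pdf_reader.pages):
    page_content = page.extract_text()
    text += page_content + '\n\n'
    pages.append((i + 1, page_content))
```

The later similarity loop would then iterate over `pages` rather than `page_dict.items()`.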
Once we get the response, we compare it with each page's content saved earlier. The idea is to find out which page is most similar to the response, whether that is page 1, page 2, and so on.
```python
# Get the similarity between the response and each page.
# Uses a spaCy model (free). OpenAI similarity may be more expensive
# but possibly more accurate.
data = []
for page_content, page_num in page_dict.items():
    similarity = spacy_sim(response, page_content)
    data.append([similarity, page_num])
```
Sort the data and get the page with the highest similarity.
```python
# Sort by similarity, highest first.
data = sorted(data, key=lambda x: x[0], reverse=True)
print(data)

# Get the top page number.
top_page_num = data[0][1]
```
Now use the pdf2image library to generate an image of each page, so that the page can be shown as an image. You could use other methods, since we already have the page text, but in this approach I display the image through Streamlit's image widget.
```python
# Generate an image of each page in the PDF.
images = convert_from_path(pdf.name)
```
Now that we have a list of images, get the index corresponding to the page we want to display.
```python
# Show the image of the page with the highest similarity.
st.image(images[top_page_num - 1])
```
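Note that `convert_from_path(pdf.name)` only works when the uploaded file happens to sit in the same directory as the Streamlit script (the full code below points this out). As an alternative that is not part of the original answer, pdf2image also offers `convert_from_bytes`, which accepts the uploaded file's raw bytes directly; it still requires poppler to be installed:

```python
from pdf2image import convert_from_bytes

# The Streamlit upload is an in-memory buffer, so its raw bytes can be handed
# straight to pdf2image without saving the file next to the script.
images = convert_from_bytes(pdf.getvalue())
st.image(images[top_page_num - 1])
```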
Here is the code that computes the similarity score between the page content and the response.
```python
def spacy_sim(str1, str2):
    """The en_core_web_lg model should be better."""
    nlp = spacy.load("en_core_web_md")
    doc_1 = nlp(str1)
    doc_2 = nlp(str2)
    return doc_1.similarity(doc_2)
```
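As written, `spacy_sim` reloads the spaCy model on every call, which is slow inside Streamlit's rerun loop. A minimal sketch of caching the model (my addition, assuming Streamlit >= 1.18, where `st.cache_resource` is available):

```python
import spacy
import streamlit as st

@st.cache_resource  # load the model once and reuse it across reruns
def load_nlp():
    return spacy.load("en_core_web_md")

def spacy_sim(str1, str2):
    nlp = load_nlp()
    return nlp(str1).similarity(nlp(str2))
```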
Sample output
You can download a sample PDF from my Google Drive.
Full code
```python
from dotenv import load_dotenv
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback
from pdf2image import convert_from_path
import spacy

SECRET = 'abc'


def spacy_sim(str1, str2):
    nlp = spacy.load("en_core_web_md")
    doc_1 = nlp(str1)
    doc_2 = nlp(str2)
    return doc_1.similarity(doc_2)


def main():
    load_dotenv()
    st.set_page_config(page_title="Ask your PDF")
    st.header("Ask your PDF")

    # Upload the file; it should be in the same location as the streamlit script.
    pdf = st.file_uploader("Upload your PDF", type="pdf")

    # Extract the text
    if pdf is not None:
        pdf_reader = PdfReader(pdf)
        text = ""
        page_dict = {}
        for i, page in enumerate(pdf_reader.pages):
            page_content = page.extract_text()
            text += page_content + '\n\n'
            page_dict[page_content] = i + 1

        # Split into chunks
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=500,
            chunk_overlap=100,
            length_function=len
        )
        chunks = text_splitter.split_text(text)

        # Create the embeddings
        embeddings = OpenAIEmbeddings(openai_api_key=SECRET)
        knowledge_base = FAISS.from_texts(chunks, embeddings)

        # Show user input
        user_question = st.text_input("Ask a question about your PDF:")
        if user_question:
            docs = knowledge_base.similarity_search(user_question)

            llm = OpenAI(openai_api_key=SECRET)
            chain = load_qa_chain(llm)
            with get_openai_callback() as cb:
                response = chain.run(input_documents=docs, question=user_question)
                print(f'billing details: {cb}')

            # Get the similarity between the response and each page.
            # Uses a spaCy model (free). OpenAI similarity may be more expensive
            # but possibly more accurate.
            data = []
            for page_content, page_num in page_dict.items():
                similarity = spacy_sim(response, page_content)
                data.append([similarity, page_num])

            # Sort by similarity, highest first.
            data = sorted(data, key=lambda x: x[0], reverse=True)
            print(data)

            # Get the top page number.
            top_page_num = data[0][1]

            st.write(f"Answer: {response}")

            # Generate an image of each page in the PDF.
            images = convert_from_path(pdf.name)

            # Show the image of the page with the highest similarity.
            st.image(images[top_page_num - 1])


if __name__ == '__main__':
    main()
```
Similarity using the OpenAI API.
```python
import numpy as np
import openai

openai.api_key = SECRET


def openai_sim(str1, str2):
    # Call the API
    response = openai.Embedding.create(
        input=[str1, str2],
        model="text-embedding-ada-002"
    )

    # Extract the embeddings
    embedding1 = response['data'][0]['embedding']
    embedding2 = response['data'][1]['embedding']

    # Compute the cosine similarity
    similarity_score = np.dot(embedding1, embedding2) / (
        np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
    )
    return similarity_score
```
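The snippet above targets the pre-1.0 openai SDK (`openai.Embedding.create`). If you are on openai >= 1.0, the equivalent call looks roughly like this; `openai_sim_v1` is just an illustrative name, and `SECRET` is the key variable from the answer above:

```python
import numpy as np
from openai import OpenAI

client = OpenAI(api_key=SECRET)

def openai_sim_v1(str1, str2):
    # Embed both strings in a single request.
    response = client.embeddings.create(
        input=[str1, str2],
        model="text-embedding-ada-002",
    )
    embedding1 = np.array(response.data[0].embedding)
    embedding2 = np.array(response.data[1].embedding)

    # Cosine similarity.
    return float(
        np.dot(embedding1, embedding2)
        / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
    )
```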
Similarity using sentence-transformers.
```python
def transformer_sim(str1, str2):
    """
    Install pytorch: https://pytorch.org/get-started/locally/
    Install sentence-transformers: pip install -U sentence-transformers

    Requires: from sentence_transformers import SentenceTransformer, util
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings1 = model.encode(str1, convert_to_tensor=True)
    embeddings2 = model.encode(str2, convert_to_tensor=True)
    cosine_score = util.cos_sim(embeddings1, embeddings2)
    simscore = float(cosine_score[0][0])
    return simscore
```
Solution 2
Use pymupdf to save the text and an image of each page. The uploaded file can come from anywhere, not necessarily the location of the Streamlit script, because while we save each PDF page's text we also save its image as raw bytes.
This solution also uses sentence-transformers to measure the similarity of two text strings, which is used to compare the page content against the response.
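The core of this solution is the per-page loop below, pulled out of the full code for clarity: each page's text is recorded in `page_dict`, while its rendered pixmap is kept as PNG bytes, which `st.image` can display directly.

```python
import fitz  # pymupdf

text = ""
images = []
page_dict = {}
# Open the uploaded file from memory; no path on disk is needed.
with fitz.open(stream=pdf.read(), filetype="pdf") as pdf_pages:
    for i, page in enumerate(pdf_pages):
        page_content = page.get_text()
        text += page_content + '\n\n'
        page_dict[page_content] = i + 1

        # Render the page and keep it as raw PNG bytes for st.image.
        pix = page.get_pixmap()
        images.append(pix.tobytes("PNG"))
```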
Full code
"""使用sentence-transfomer进行相似度得分。"""from dotenv import load_dotenvimport streamlit as stfrom langchain.text_splitter import CharacterTextSplitterfrom langchain.embeddings.openai import OpenAIEmbeddingsfrom langchain.vectorstores import FAISSfrom langchain.chains.question_answering import load_qa_chainfrom langchain.llms import OpenAIfrom langchain.callbacks import get_openai_callbackfrom sentence_transformers import SentenceTransformer, utilimport fitz # pymupdfSECRET = 'abc'def transformer_sim(str1, str2): """ 安装pytorch: https://pytorch.org/get-started/locally/ 安装sentence-transformers: pip install -U sentence-transformers from sentence_transformers import SentenceTransformer, util """ model = SentenceTransformer('all-MiniLM-L6-v2') embeddings1 = model.encode(str1, convert_to_tensor=True) embeddings2 = model.encode(str2, convert_to_tensor=True) cosine_score = util.cos_sim(embeddings1, embeddings2) simscore = float(cosine_score[0][0]) return simscoredef main(): load_dotenv() st.set_page_config(page_title="Ask your PDF") st.header("Ask your PDF
") # 上传文件,应与streamlit脚本在同一位置。 pdf = st.file_uploader("Upload your PDF", type="pdf") # 提取文本 if pdf is not None: text = "" images = [] page_dict = {} with fitz.open(stream=pdf.read(), filetype="pdf") as pdf_pages: for i, page in enumerate(pdf_pages): page_content = page.get_text() text += page_content + '\n\n' page_dict[page_content] = i+1 # 图像 pix = page.get_pixmap() bytes_data = pix.tobytes("PNG") images.append(bytes_data) # 分割成块 text_splitter = CharacterTextSplitter( separator="\n", chunk_size=500, chunk_overlap=100, length_function=len ) chunks = text_splitter.split_text(text) # 创建嵌入 embeddings = OpenAIEmbeddings(openai_api_key=SECRET) knowledge_base = FAISS.from_texts(chunks, embeddings) # 显示用户输入 user_question = st.text_input("Ask a question about your PDF:") if user_question: docs = knowledge_base.similarity_search(user_question) llm = OpenAI(openai_api_key=SECRET) chain = load_qa_chain(llm) with get_openai_callback() as cb: response = chain.run(input_documents=docs, question=user_question) print(f'billing details: {cb}') # 获取每页与响应之间的相似度。 data = [] for page_content, page_num in page_dict.items(): similarity = transformer_sim(response, page_content) data.append([similarity, page_num, page_content]) # 按相似度从高到低排序。 data = sorted(data, key=lambda x: x[0], reverse=True) # 获取最高的页码。 top_page_num = data[0][1] top_sim_score = data[0][0] st.write(f"Answer: {response}") st.markdown(f'**There is a top similarity score of {top_sim_score} that the response is from page {top_page_num}**') # 显示相似度最高的页面图像。 st.image(images[top_page_num-1])if __name__ == '__main__': main()