-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlesson_1_first_pipeline.py
More file actions
108 lines (88 loc) · 4.44 KB
/
lesson_1_first_pipeline.py
File metadata and controls
108 lines (88 loc) · 4.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import os
import structlog
from getpass import getpass
from haystack import Document
from haystack import Pipeline
from datasets import load_dataset
from haystack.components.builders import PromptBuilder
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.embedders import (
SentenceTransformersDocumentEmbedder,
SentenceTransformersTextEmbedder,
)
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.generators import OpenAIGenerator
# Structured logger used to report a failure of the tutorial run together
# with its traceback (print() cannot do this: it has no exc_info parameter).
logger = structlog.get_logger()

try:
    # ------------------------------------------------------------------
    # INDEXING: load the data set, embed it, and store it.
    # ------------------------------------------------------------------
    # A DocumentStore stores the Documents that the question answering
    # system uses to find answers to your questions.
    print("Initializing In-memory document store")
    document_store = InMemoryDocumentStore()

    # Load the data set from the Hugging Face hub (train split only).
    print("Loading 7 wonders data set")
    dataset = load_dataset("bilgeyucel/seven-wonders", split="train")

    print("Creating haystack documents from training data")
    docs = [Document(content=doc["content"], meta=doc["meta"]) for doc in dataset]

    # Embedders in Haystack transform texts or Documents into vector
    # representations using pre-trained models. The embeddings are what the
    # retriever later compares the query embedding against.
    print("Initisialsing document embedder")
    doc_embedder = SentenceTransformersDocumentEmbedder(
        model="sentence-transformers/all-MiniLM-L6-v2"
    )
    # Call warm_up() to download/load the embedding model before running it.
    doc_embedder.warm_up()

    print("Embedding documents")
    docs_with_embeddings = doc_embedder.run(docs)

    print("Writing embeddings to document store")
    document_store.write_documents(docs_with_embeddings["documents"])

    # ------------------------------------------------------------------
    # RAG PIPELINE
    # ------------------------------------------------------------------
    # Previously, we created an embedding for the documents that the
    # question will be answered from. Now we create an embedding for the
    # actual question being asked. The same model must be used for embedding
    # the documents and the query so both live in the same vector space.
    print("Initisialsing text embedder")
    text_embedder = SentenceTransformersTextEmbedder(
        model="sentence-transformers/all-MiniLM-L6-v2"
    )

    # The retriever reads from the document store populated above and
    # returns the documents relevant to the query embedding.
    print("Initisialsing in-memory retriever for embeddings in document store")
    retriever = InMemoryEmbeddingRetriever(document_store)

    # Prompt for generative question answering using the RAG approach.
    # It takes two parameters: `documents` (retrieved from the document
    # store) and `question` (from the user). A Jinja2 loop concatenates the
    # content of the retrieved documents into the prompt.
    # NOTE: the template text is kept flush-left so the rendered prompt is
    # unchanged; it is part of the runtime string, not code layout.
    template = """
Given the following information, answer the question.
Context:
{% for document in documents %}
{{ document.content }}
{% endfor %}
Question: {{question}}
Answer:
"""
    prompt_builder = PromptBuilder(template=template)

    # The generator produces the answer. Prompt for the API key only when
    # it is not already provided through the environment.
    print("Looking for OpenAI API Key")
    if "OPENAI_API_KEY" not in os.environ:
        os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")
    generator = OpenAIGenerator(model="gpt-3.5-turbo")

    basic_rag_pipeline = Pipeline()
    # Add components to the pipeline under the names used for wiring below.
    basic_rag_pipeline.add_component("text_embedder", text_embedder)
    basic_rag_pipeline.add_component("retriever", retriever)
    basic_rag_pipeline.add_component("prompt_builder", prompt_builder)
    basic_rag_pipeline.add_component("llm", generator)

    # Connect the components: each connection feeds one component's output
    # into the next component's input, forming the chain
    #   text_embedder -> retriever -> prompt_builder -> llm
    # TODO: make a diagram to make sense of this part.
    basic_rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
    basic_rag_pipeline.connect("retriever", "prompt_builder.documents")
    basic_rag_pipeline.connect("prompt_builder", "llm")

    question = "What does Rhodes Statue look like?"
    print("Asking the question")
    # The question is supplied twice: once to be embedded for retrieval and
    # once to be rendered into the prompt.
    response = basic_rag_pipeline.run(
        {"text_embedder": {"text": question}, "prompt_builder": {"question": question}}
    )
    reply = response["llm"]["replies"][0]
    print(f"The answer is '{reply}'")
except Exception:
    # Top-level boundary of the script: report the failure with its
    # traceback. The previous print(..., exc_info=e) raised a TypeError
    # here, hiding the real error.
    logger.exception("Failed to answer question")
finally:
    # TODO: Can we delete the dataset or something?
    pass