Answers 2.4
Theory
- Purpose of embeddings: convert text into numeric vectors that preserve semantic meaning, enabling computers to “understand” text.
- Semantic similarity: reflected in similar vectors for words/sentences with related meaning in a high‑dimensional space.
- How embeddings are learned: trained on text corpora; word vectors depend on usage context (distributional semantics).
- Embeddings in semantic search: allow retrieving relevant documents by meaning, even without exact keyword matches.
- Matching documents and queries: a document embedding represents overall meaning; a query embedding captures user intent; comparing them reveals relevant matches.
- Vector store: a database for embeddings optimized for fast nearest‑neighbor search.
- Choosing a store: depends on data size, persistence requirements, and purpose (research, prototype, production).
- Chroma for prototyping: convenient for small or in‑memory scenarios (fast), but limited in persistence and scaling; see the Chroma sketch after this list.
- Typical pipeline: split text → generate embeddings → index in a vector store → handle query → generate answer.
- Splitting: improves granularity; matching happens at the level of meaningful fragments (chunks), not entire documents.
- Embedding generation: transforms text into vectors suitable for computational comparison.
- Indexing in the store: enables fast retrieval of semantically similar fragments.
- Query handling: create a query embedding and search for similar fragments using metrics such as cosine similarity or Euclidean distance (compared in a short sketch after this list).
- Answer generation: uses the retrieved fragments together with the original query to produce a coherent answer.
- Environment setup: install libraries, configure API keys, and set up for embeddings and the vector store.
- Loading and splitting documents: critical for effective text management and higher‑quality retrieval.
- Illustrating similarity: can be shown via dot product or cosine similarity.
- Chroma specifics: mind the persistence directory, clearing stale data, and correct collection initialization.
- Similarity search: finds the fragments most relevant to the query.
- Typical failures and remediation: duplicates and irrelevant results can be addressed via filtering and careful pipeline tuning.
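As a quick illustration of the metrics mentioned under query handling, here is a minimal, self-contained sketch (plain Python, toy vectors) contrasting cosine similarity with Euclidean distance: cosine compares direction only, while Euclidean distance also reacts to magnitude.

import math

def cosine_sim(a, b):
    dot = sum(x * y for x, y in zip(a, b))
    return dot / (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(x * x for x in b)))

def euclidean_dist(a, b):
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))

# Same direction, different magnitudes:
a, b = [1.0, 2.0, 3.0], [2.0, 4.0, 6.0]
print(cosine_sim(a, b))      # 1.0 -> identical by cosine similarity
print(euclidean_dist(a, b))  # ~3.74 -> clearly apart by Euclidean distance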
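And since Chroma appears several times above, a minimal prototyping sketch follows; it assumes the chromadb package and its current client API (Client, PersistentClient, get_or_create_collection, query), so treat it as an illustrative outline rather than a definitive recipe.

import chromadb

# In-memory client: fast for prototyping, but nothing survives the process.
client = chromadb.Client()
# For persistence, point a PersistentClient at an explicit directory instead
# (and clear or version that directory to avoid stale data):
# client = chromadb.PersistentClient(path="./chroma_data")

# Collection initialization: get_or_create avoids duplicate-name errors on reruns.
collection = client.get_or_create_collection(name="docs")

# Chroma embeds the documents with its default embedding function.
collection.add(
    ids=["1", "2"],
    documents=["Cats purr when they are content.", "The stock market fell today."],
)

# Similarity search: the query text is embedded and the nearest chunk returned.
results = collection.query(query_texts=["happy pets"], n_results=1)
print(results["documents"])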
Practice
1.
def generate_embeddings(sentences):
    """
    Generate a simple placeholder embedding for each sentence based on its length.

    Args:
    - sentences (list of str): List of sentences to embed.

    Returns:
    - list of int: One embedding per sentence (the sentence length).
    """
    return [len(sentence) for sentence in sentences]

def cosine_similarity(vector_a, vector_b):
    """
    Compute cosine similarity between two vectors.

    Args:
    - vector_a (list of float): First vector.
    - vector_b (list of float): Second vector.

    Returns:
    - float: Cosine similarity between `vector_a` and `vector_b`.
    """
    dot_product = sum(a * b for a, b in zip(vector_a, vector_b))
    magnitude_a = sum(a ** 2 for a in vector_a) ** 0.5
    magnitude_b = sum(b ** 2 for b in vector_b) ** 0.5
    return dot_product / (magnitude_a * magnitude_b)

# Example usage:
sentences = ["Hello, world!", "This is a longer sentence.", "Short"]
embeddings = generate_embeddings(sentences)
print("Embeddings:", embeddings)

vector_a = [1, 2, 3]
vector_b = [2, 3, 4]
similarity = cosine_similarity(vector_a, vector_b)
print("Cosine similarity:", similarity)
2.
def cosine_similarity(vector_a, vector_b):
    """Compute cosine similarity between two vectors."""
    dot_product = sum(a * b for a, b in zip(vector_a, vector_b))
    magnitude_a = sum(a ** 2 for a in vector_a) ** 0.5
    magnitude_b = sum(b ** 2 for b in vector_b) ** 0.5
    if magnitude_a == 0 or magnitude_b == 0:
        return 0.0  # Avoid division by zero when either vector is all zeros
    return dot_product / (magnitude_a * magnitude_b)
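A quick sanity check of the guard (illustrative):

print(cosine_similarity([1, 2], [2, 4]))  # 1.0: vectors point the same way
print(cosine_similarity([0, 0], [2, 4]))  # 0.0: zero vector, guard triggers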
3.
class SimpleVectorStore:
    def __init__(self):
        self.vectors = []  # Initialize an empty list to store vectors

    def add_vector(self, vector):
        """Add a vector to the store."""
        self.vectors.append(vector)

    def find_most_similar(self, query_vector):
        """Find and return the stored vector most similar to `query_vector`."""
        if not self.vectors:
            return None  # Return None if the store is empty
        similarities = [cosine_similarity(query_vector, vector) for vector in self.vectors]
        max_index = similarities.index(max(similarities))
        return self.vectors[max_index]
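A short usage sketch (assumes cosine_similarity from task 2 is in scope):

store = SimpleVectorStore()
store.add_vector([1.0, 0.0])
store.add_vector([0.0, 1.0])
print(store.find_most_similar([0.9, 0.1]))  # [1.0, 0.0], the closest direction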
4.
import sys

def split_text_into_chunks(text, chunk_size):
    """Split the given text into chunks of the specified size."""
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

def load_and_print_chunks(file_path, chunk_size):
    """Load text from a file, split it into chunks, and print each chunk."""
    try:
        with open(file_path, 'r') as file:
            text = file.read()
        chunks = split_text_into_chunks(text, chunk_size)
        for i, chunk in enumerate(chunks, 1):
            print(f"Chunk {i}:\n{chunk}\n{'-' * 50}")
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except Exception as e:
        print(f"Unexpected error: {e}")

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python script.py <file_path> <chunk_size>")
        sys.exit(1)
    file_path = sys.argv[1]
    chunk_size = int(sys.argv[2])
    load_and_print_chunks(file_path, chunk_size)
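The splitter itself can be checked without a file:

print(split_text_into_chunks("abcdefghij", 4))  # ['abcd', 'efgh', 'ij']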
5.
# Assume SimpleVectorStore and cosine_similarity are defined earlier.

def generate_query_embedding(query):
    """
    Generate a simple placeholder embedding for the query based on its length.
    In real scenarios, you would use a model for embeddings.
    """
    return [len(query)]

def query_processing(store, query):
    """
    Process a query: generate its embedding, find the most similar fragment in the
    vector store, and print it.
    """
    query_embedding = generate_query_embedding(query)
    most_similar = store.find_most_similar(query_embedding)
    if most_similar is not None:
        print("Most similar document fragment:", most_similar)
    else:
        print("No document fragments found.")
6.
def remove_duplicates(document_chunks):
    """Remove duplicate document fragments by exact content match, preserving order."""
    unique_chunks = []
    for chunk in document_chunks:
        if chunk not in unique_chunks:
            unique_chunks.append(chunk)
    return unique_chunks
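For example:

chunks = ["alpha", "beta", "alpha", "gamma", "beta"]
print(remove_duplicates(chunks))  # ['alpha', 'beta', 'gamma']

For large lists the same result can be obtained in linear time with list(dict.fromkeys(chunks)), since dictionaries preserve insertion order.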
7.
# Initialize SimpleVectorStore for demonstration
store = SimpleVectorStore()

# Placeholder document fragments and their embeddings
document_chunks = ["Document chunk 1", "Document chunk 2", "Document chunk 3"]
# Simulate embeddings based on length
document_embeddings = [[len(chunk)] for chunk in document_chunks]

# Add generated document embeddings to the store
for embedding in document_embeddings:
    store.add_vector(embedding)

# Perform similarity search with a sample query
query = "Document"
query_embedding = generate_query_embedding(query)

# Find the most similar document fragments via cosine similarity
similarities = [(cosine_similarity(query_embedding, doc_embedding), idx)
                for idx, doc_embedding in enumerate(document_embeddings)]
# Sort by similarity only (descending); sorting the whole tuples would break
# ties by index in descending order and scramble the output order
similarities.sort(key=lambda pair: pair[0], reverse=True)
top_n_indices = [idx for _, idx in similarities[:3]]  # Indices of the top-3 fragments

# Print IDs and contents of the top-3 most similar document fragments
print("Top-3 most similar document fragments:")
for idx in top_n_indices:
    print(f"{idx + 1}: {document_chunks[idx]}")
8.
def embed_and_store_documents(document_chunks):
    """
    Generate embeddings for each document fragment and store them in a SimpleVectorStore.

    Args:
    - document_chunks (list of str): List of document fragments.

    Returns:
    - SimpleVectorStore: Vector store initialized with document embeddings.
    """
    store = SimpleVectorStore()
    for chunk in document_chunks:
        # Placeholder embedding based on fragment length
        embedding = [len(chunk)]
        store.add_vector(embedding)
    return store
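Usage sketch:

store = embed_and_store_documents(["chunk one", "a much longer chunk two"])
print(store.vectors)  # [[9], [23]]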
9.
import json

def save_vector_store(store, filepath):
    """
    Save the state of a SimpleVectorStore to the specified file.

    Args:
    - store (SimpleVectorStore): Vector store to save.
    - filepath (str): Path to the output file.
    """
    with open(filepath, 'w') as file:
        json.dump(store.vectors, file)

def load_vector_store(filepath):
    """
    Load a SimpleVectorStore from the specified file.

    Args:
    - filepath (str): Path to the input file.

    Returns:
    - SimpleVectorStore: Loaded vector store.
    """
    store = SimpleVectorStore()
    with open(filepath, 'r') as file:
        store.vectors = json.load(file)
    return store

def vector_store_persistence():
    """Demonstrate saving and loading the state of a SimpleVectorStore."""
    store = SimpleVectorStore()
    store.add_vector([1.0, 2.0])  # Populate with a sample vector so the demo has state
    filepath = 'vector_store.json'
    # Example of saving and loading
    save_vector_store(store, filepath)
    loaded_store = load_vector_store(filepath)
    print("Vector store loaded with vectors:", loaded_store.vectors)
10.
def evaluate_search_accuracy(queries, expected_chunks):
    """
    Evaluate similarity-search accuracy for a list of queries and expected results.

    Args:
    - queries (list of str): Query strings.
    - expected_chunks (list of str): Expected most similar document fragment for each query.

    Returns:
    - float: Retrieval accuracy (fraction of correctly found fragments).
    """
    correct = 0
    # Embed and store the expected fragments
    store = embed_and_store_documents(expected_chunks)
    for query, expected in zip(queries, expected_chunks):
        query_embedding = generate_query_embedding(query)
        most_similar = store.find_most_similar(query_embedding)
        # Expected fragments map to embeddings by length in the same way
        if most_similar and most_similar == [len(expected)]:
            correct += 1
    accuracy = correct / len(queries)
    return accuracy
# Assume embed_and_store_documents, generate_query_embedding, and SimpleVectorStore
# are implemented as described above.
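A small usage example follows. Note that with the length-based placeholder embeddings every one-dimensional similarity equals 1.0, so find_most_similar always returns the first stored vector; the evaluation only becomes meaningful with real, multi-dimensional embeddings.

queries = ["hi", "hello there"]
expected_chunks = ["hi", "hello there"]
print(evaluate_search_accuracy(queries, expected_chunks))  # 0.5 with placeholders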