import gradio as gr
import fitz  # PyMuPDF
import re
import faiss
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
# ===============================
# MODEL LOADING
# ===============================
# Embedding model: all-MiniLM-L6-v2 maps each text chunk to a 384-dim vector.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

LLM_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(LLM_NAME)
llm = AutoModelForCausalLM.from_pretrained(
    LLM_NAME,
    torch_dtype=torch.float32,  # full precision: the Space runs on CPU
)
llm.eval()
# ===============================
# PDF PROCESSING
# ===============================
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text

def clean_text(text):
    # Collapse runs of whitespace (including newlines) into single spaces.
    return re.sub(r"\s+", " ", text).strip()
def chunk_text(text, chunk_size=500, overlap=50):
    # Sliding window over the raw text; consecutive chunks share `overlap`
    # characters so sentences straddling a boundary are not cut in half.
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - overlap
    return chunks
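
# Worked example (illustration only): with chunk_size=500 and overlap=50 the
# window advances 450 characters per step, so a 1,200-character text yields
# text[0:500], text[450:950], and text[900:1200].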
# ===============================
# VECTOR DB (FAISS)
# ===============================
def build_faiss_index(chunks):
    # Embed every chunk and store the vectors in an exact Euclidean (L2) index.
    embeddings = embedding_model.encode(chunks)
    embeddings = np.array(embeddings).astype("float32")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks
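
# Alternative index (a sketch, not used by the app): IndexFlatL2 ranks chunks
# by raw Euclidean distance. L2-normalising the embeddings and searching an
# inner-product index gives cosine similarity instead, the metric these
# sentence embeddings are usually compared with. Queries would need the same
# normalisation before searching.
def build_cosine_index(chunks):
    embeddings = np.array(embedding_model.encode(chunks)).astype("float32")
    faiss.normalize_L2(embeddings)  # in-place: each row becomes a unit vector
    index = faiss.IndexFlatIP(embeddings.shape[1])  # inner product == cosine on unit vectors
    index.add(embeddings)
    return index, chunks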
def retrieve_relevant_chunks(query, index, chunks, top_k=3):
    query_embedding = embedding_model.encode([query]).astype("float32")
    _, indices = index.search(query_embedding, top_k)
    # FAISS pads the result with -1 when top_k exceeds the number of vectors.
    return [chunks[i] for i in indices[0] if i != -1]
# ===============================
# LLM ANSWER
# ===============================
def generate_answer(question, context_chunks):
    context = "\n\n".join(context_chunks)
    prompt = f"""
Answer the question strictly using the given context.
If the answer is not found, say:
"Information not found in the document."

Context:
{context}

Question:
{question}

Answer:
"""
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    with torch.no_grad():
        output = llm.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,  # sampling must be enabled for temperature to apply
            temperature=0.2,
        )
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    # The decoded text echoes the prompt, so keep only what follows "Answer:".
    return decoded.split("Answer:")[-1].strip()
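
# Alternative prompt construction (a sketch, not wired in): TinyLlama-Chat was
# fine-tuned on a chat format, so wrapping the request in the tokenizer's
# built-in chat template often produces cleaner answers than a raw completion
# prompt (requires a transformers version with apply_chat_template).
def build_chat_prompt(question, context):
    messages = [{
        "role": "user",
        "content": f"Answer strictly from this context:\n{context}\n\nQuestion: {question}",
    }]
    # add_generation_prompt appends the assistant-turn marker the model expects
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )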
# ===============================
# MAIN PIPELINE
# ===============================
def pdf_rag_chat(pdf_file, question):
    if pdf_file is None or question.strip() == "":
        return "Please upload a PDF and enter a question."
    # Depending on the Gradio version, gr.File yields a filepath string or a
    # tempfile-like object with a .name attribute; accept both.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    text = extract_text_from_pdf(pdf_path)
    text = clean_text(text)
    chunks = chunk_text(text)
    if not chunks:
        return "No extractable text found in the document."
    index, chunks = build_faiss_index(chunks)
    context = retrieve_relevant_chunks(question, index, chunks)
    return generate_answer(question, context)
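
# Optional speed-up (a sketch, not wired into the UI): the pipeline above
# re-embeds the whole PDF on every question. A small cache keyed by file path
# skips rebuilding the index when the same document is queried repeatedly.
_index_cache = {}

def get_or_build_index(pdf_path):
    if pdf_path not in _index_cache:
        text = clean_text(extract_text_from_pdf(pdf_path))
        _index_cache[pdf_path] = build_faiss_index(chunk_text(text))
    return _index_cache[pdf_path]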
# ===============================
# GRADIO UI (GRADIO 6 SAFE)
# ===============================
with gr.Blocks() as demo:
    gr.Markdown("""
# PDF RAG Chatbot (Open-Source AI)
Upload a **PDF** and ask questions based **only on its content**.
Built using **Retrieval Augmented Generation (RAG)** and
**open-source Hugging Face models**, running on **free CPU**.
""")
    with gr.Row():
        with gr.Column(scale=1):
            pdf_input = gr.File(
                label="Upload PDF",
                file_types=[".pdf"]
            )
            question_input = gr.Textbox(
                label="Ask a question",
                placeholder="e.g. What is the objective of the project?",
                lines=2
            )
            submit_btn = gr.Button("Get Answer")
        with gr.Column(scale=2):
            answer_output = gr.Textbox(
                label="Answer",
                lines=10
            )
    submit_btn.click(
        fn=pdf_rag_chat,
        inputs=[pdf_input, question_input],
        outputs=answer_output
    )
    gr.Markdown("""
---
**© Simranpreet Kaur**
**NIELIT Ropar | AIML Six Months Training | 2026**
""")
demo.launch()