Spaces:

Redmind
/

Agentic_AI

Sleeping

App Files Files Community

Redmind committed on Dec 13, 2024

Commit

461c9df

verified ·

1 Parent(s): c0b6a01

Upload file_upload.py

Browse files

Files changed (1) hide show

file_upload.py +133 -0

file_upload.py ADDED Viewed

	@@ -0,0 +1,133 @@

+import os
+import hashlib
+import json
+import pandas as pd
+from langchain_community.vectorstores import FAISS
+from langchain_openai import OpenAIEmbeddings
+from PyPDF2 import PdfReader
+from docx import Document
class FileHandler:
    """Convert uploaded documents into per-file FAISS vector stores.

    Each processed upload gets its own directory under ``vector_db_path``
    named ``<normalised file name>_<md5 of payload>``. The directory holds
    the FAISS index written by ``save_local`` plus a small ``metadata.json``.
    """

    # (suffix, loader method name) pairs, matched in order against the
    # lowercased file name so that "REPORT.PDF" is handled like "report.pdf".
    _LOADERS = (
        (".pdf", "load_and_split_pdf"),
        (".docx", "load_and_split_docx"),
        (".txt", "load_and_split_txt"),
        (".xlsx", "load_and_split_table"),
        (".csv", "load_and_split_csv"),
    )

    def __init__(self, vector_db_path, open_api_key, grok_api_key):
        """Store configuration and build the OpenAI embeddings client.

        Args:
            vector_db_path: Root directory under which per-file stores live.
            open_api_key: API key handed to ``OpenAIEmbeddings``.
            grok_api_key: Stored on the instance; not used by any method of
                this class.
        """
        self.vector_db_path = vector_db_path
        self.openai_embeddings = OpenAIEmbeddings(api_key=open_api_key)
        self.grok_api_key = grok_api_key

    @staticmethod
    def _as_stream(source):
        """Wrap raw bytes in a seekable ``BytesIO``; return anything else as is."""
        # Local import keeps this class self-contained without touching the
        # module's import block.
        import io

        if isinstance(source, (bytes, bytearray)):
            return io.BytesIO(source)
        return source

    def handle_file_upload(self, file_name, file_content):
        """Extract text from an upload, embed it and persist a FAISS store.

        Args:
            file_name: Original file name. Its suffix (case-insensitive)
                selects the loader: .pdf, .docx, .txt, .xlsx or .csv.
            file_content: The payload as ``bytes``/``bytearray`` or ``str``.
                NOTE(review): a ``str`` is forwarded unchanged to the
                PDF/DOCX/XLSX/CSV libraries (which may interpret it as a
                path), while the .txt loader treats it as the text itself;
                confirm which form callers actually send.

        Returns:
            A dict with a single ``"message"`` key describing the outcome.
            Every exception, including the unsupported-format ``ValueError``,
            is reported through that message instead of propagating.
        """
        try:
            # Normalise the name used for the storage key.
            base_file_name = os.path.basename(file_name)
            formatted_file_name = base_file_name.replace(" ", "_").lower()

            # Hash raw bytes directly; encode text first. The str branch
            # keeps the key scheme of stores created before bytes support.
            if isinstance(file_content, (bytes, bytearray)):
                hash_input = bytes(file_content)
            else:
                hash_input = file_content.encode('utf-8')
            file_hash = hashlib.md5(hash_input).hexdigest()

            file_key = f"{formatted_file_name}_{file_hash}"
            vector_store_dir = os.path.join(self.vector_db_path, file_key)
            vector_store_path = os.path.join(vector_store_dir, "index.faiss")
            if os.path.exists(vector_store_path):
                return {"message": "File already processed."}

            # Pick the loader from the (case-insensitive) file suffix.
            lowered_name = file_name.lower()
            loader_name = next(
                (name for suffix, name in self._LOADERS
                 if lowered_name.endswith(suffix)),
                None,
            )
            if loader_name is None:
                raise ValueError("Unsupported file format.")
            texts, metadatas = getattr(self, loader_name)(file_content)

            if not texts:
                return {"message": "No text extracted from the file. Check the file content."}

            # Create the directory only once there is something to store, so
            # rejected or empty uploads leave no empty directories behind.
            os.makedirs(vector_store_dir, exist_ok=True)

            # Embed with the OpenAI embeddings client and persist the index.
            vector_store = FAISS.from_texts(texts, self.openai_embeddings, metadatas=metadatas)
            vector_store.save_local(vector_store_dir)

            metadata = {
                "filename": file_name,
                "file_size": len(file_content),
            }
            metadata_path = os.path.join(vector_store_dir, "metadata.json")
            with open(metadata_path, 'w', encoding="utf-8") as md_file:
                json.dump(metadata, md_file)
            return {"message": "File processed successfully."}
        except Exception as e:
            # Boundary handler: callers receive a message dict, never a raise.
            return {"message": f"Error processing file: {str(e)}"}

    def load_and_split_pdf(self, file):
        """Return one text chunk per PDF page that yields extractable text.

        Args:
            file: Raw bytes, or anything ``PdfReader`` accepts directly.

        Returns:
            ``(texts, metadatas)`` where each metadata dict carries the
            1-based ``page_number`` of the matching text.
        """
        reader = PdfReader(FileHandler._as_stream(file))
        texts = []
        metadatas = []
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text()
            if text:
                texts.append(text)
                metadatas.append({"page_number": page_num + 1})
        return texts, metadatas

    def load_and_split_docx(self, file):
        """Return one text chunk per non-empty DOCX paragraph.

        Args:
            file: Raw bytes, or anything ``Document`` accepts directly.

        Returns:
            ``(texts, metadatas)`` where each metadata dict carries the
            1-based ``paragraph_number`` (counting empty paragraphs too).
        """
        doc = Document(FileHandler._as_stream(file))
        texts = []
        metadatas = []
        for para_num, paragraph in enumerate(doc.paragraphs):
            if paragraph.text:
                texts.append(paragraph.text)
                metadatas.append({"paragraph_number": para_num + 1})
        return texts, metadatas

    def load_and_split_txt(self, content):
        """Split plain text into its non-blank lines.

        Args:
            content: UTF-8 encoded bytes, already-decoded ``str`` text, or a
                file-like object whose ``read()`` returns either.

        Returns:
            ``(texts, metadatas)``; every metadata entry is its own empty
            dict, so mutating one does not affect the others.
        """
        if hasattr(content, "read"):
            content = content.read()
        if isinstance(content, (bytes, bytearray)):
            content = bytes(content).decode("utf-8")
        texts = [line for line in content.split('\n') if line.strip()]
        # One dict per entry; ``[{}] * n`` would alias a single shared dict.
        metadatas = [{} for _ in texts]
        return texts, metadatas

    def load_and_split_table(self, content):
        """Return one ``"col: value, ..."`` chunk per row of every sheet.

        Fully empty rows and columns are dropped and remaining gaps become
        the string ``'N/A'``.

        Args:
            content: Raw bytes, or anything ``pandas.read_excel`` accepts.

        Returns:
            ``(texts, metadatas)`` where each metadata dict carries the
            ``sheet_name`` the row came from.
        """
        excel_data = pd.read_excel(FileHandler._as_stream(content), sheet_name=None)
        texts = []
        metadatas = []
        for sheet_name, df in excel_data.items():
            df = df.dropna(how='all', axis=0).dropna(how='all', axis=1)
            df = df.fillna('N/A')
            for _, row in df.iterrows():
                row_text = ', '.join(f"{key}: {value}" for key, value in row.to_dict().items())
                texts.append(row_text)
                metadatas.append({"sheet_name": sheet_name})
        return texts, metadatas

    def load_and_split_csv(self, content):
        """Return one ``"col: value, ..."`` chunk per CSV row.

        Fully empty rows and columns are dropped and remaining gaps become
        the string ``'N/A'``.

        Args:
            content: Raw bytes, or anything ``pandas.read_csv`` accepts.

        Returns:
            ``(texts, metadatas)`` where each metadata dict carries the
            ``row_index`` label the row had before empty rows were dropped.
        """
        csv_data = pd.read_csv(FileHandler._as_stream(content))
        csv_data = csv_data.dropna(how='all', axis=0).dropna(how='all', axis=1)
        csv_data = csv_data.fillna('N/A')
        texts = []
        metadatas = []
        for row_index, row in csv_data.iterrows():
            row_text = ', '.join(f"{key}: {value}" for key, value in row.to_dict().items())
            texts.append(row_text)
            metadatas.append({"row_index": row_index})
        return texts, metadatas