"""Gradio app for managing documents of an in-house RAG pipeline.

The "document management" tab loads uploaded files, splits them into chunks
and builds a persistent Chroma vector store from those chunks.
"""
import os
import shutil
from pathlib import Path

import gradio as gr
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader, CSVLoader, TextLoader, UnstructuredWordDocumentLoader, \
    Docx2txtLoader, UnstructuredExcelLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_ibm import WatsonxEmbeddings
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Models (LLM, embedding)
load_dotenv()
apikey = os.getenv("WATSONX_API_KEY")
project_id = os.getenv("WATSONX_PROJECT_ID")
watsonx_ai_url = os.getenv("WATSONX_URL")

# NOTE(review): the f-strings turn a missing environment variable into the
# literal string "None"; kept as-is to preserve the current behavior.
watson_embedding = WatsonxEmbeddings(
    model_id="ibm/granite-embedding-278m-multilingual",
    url=f"{watsonx_ai_url}",
    api_key=f"{apikey}",
    project_id=f"{project_id}",
)
ollama_embedding = OllamaEmbeddings(model="nomic-embed-text-v2-moe")

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# Maps a lower-cased file extension to the loader class used to read it.
LOADERS = {
    ".pdf": PyPDFLoader,
    ".csv": CSVLoader,
    ".docx": UnstructuredWordDocumentLoader,
    ".xlsx": UnstructuredExcelLoader,
    ".txt": TextLoader,
}

CHROMA_DIR = "./db/chroma"
COLLECTION_NAME = "job_rag"
CHUNKS_PATH = "./db/chunks.pkl"

# Module-level state shared between the Gradio callbacks below.
DOCUMENTS = []
CHUNKS = []
VECTORSTORE = None


# ==========
# Tab 1 - feature implementation
# ==========
def extract_metadata(file_path):
    """Derive document metadata from a whitespace-separated file name.

    The file name (without its extension) is expected to look like
    ``"<year> <half> <company> ..."``, e.g. ``"2026 상 삼성E&A 직무기술서"``,
    which yields ``{"year": 2026, "recruitment_period": "상반기",
    "company": "삼성E&A", "file_name": "2026 상 삼성E&A 직무기술서"}``.

    Args:
        file_path: ``pathlib.Path`` of the uploaded file.

    Returns:
        A dict with the keys ``year``, ``recruitment_period``, ``company``
        and ``file_name``.

    Raises:
        IndexError: If the name has fewer than three whitespace-separated
            tokens.
        ValueError: If the first token is not an integer.
    """
    # Use the stem so the extension does not leak into ``file_name`` (or into
    # ``company`` when the name has exactly three tokens).
    name = file_path.stem
    tokens = name.split()
    return {
        "year": int(tokens[0]),
        "recruitment_period": tokens[1] + "반기",
        "company": tokens[2],
        "file_name": name,
    }


def upload_files(files):
    """Load every uploaded file and store the result in ``DOCUMENTS``.

    Each file is read with the loader registered in ``LOADERS`` for its
    extension, and every loaded document is tagged with the metadata derived
    from the file name. ``DOCUMENTS`` is replaced, not appended to.

    Args:
        files: Uploaded files from the Gradio ``File`` component. Items may
            be objects exposing the path as ``.name`` or plain path strings;
            ``None`` (nothing uploaded) is treated as an empty list.

    Returns:
        A status string reporting how many documents were loaded.

    Raises:
        KeyError: If a file has an extension that is not in ``LOADERS``.
    """
    global DOCUMENTS

    loaded = []
    # Gradio passes None when the button is clicked with no file selected.
    for file in files or []:
        # Accept both file-like objects (path in ``.name``) and path strings.
        file_name = getattr(file, "name", file)
        path = Path(file_name)
        loader_cls = LOADERS[path.suffix.lower()]
        docs = loader_cls(file_name).load()

        # Attach the file-name metadata to every document of this file.
        meta_info = extract_metadata(path)
        for doc in docs:
            doc.metadata.update(meta_info)
        loaded.extend(docs)

    DOCUMENTS = loaded
    return f"문서 수 : {len(loaded)}"


def preview_chunks():
    """Split ``DOCUMENTS`` into ``CHUNKS`` and return a short preview.

    Returns:
        A message saying there are no documents when ``DOCUMENTS`` is empty;
        otherwise the first 100 characters of up to the first 10 chunks,
        joined by blank lines.
    """
    global CHUNKS

    if not DOCUMENTS:
        return "문서가 없음."

    CHUNKS = splitter.split_documents(DOCUMENTS)

    # Only the first 10 chunks are shown to keep the preview readable.
    preview = [
        f"""[CHUNK {i + 1}]{chunk.page_content[:100]}\n """
        for i, chunk in enumerate(CHUNKS[:10])
    ]
    return "\n\n".join(preview)


def build_vectorstore():
    """Rebuild the persistent Chroma vector store from ``CHUNKS``.

    Any existing directory at ``CHROMA_DIR`` is deleted first, then a new
    store is created with ``watson_embedding`` and kept in ``VECTORSTORE``.

    Returns:
        A prompt to create chunks first when ``CHUNKS`` is empty; otherwise
        a status string with the chunk count and the stored vector count.
    """
    global VECTORSTORE

    if not CHUNKS:
        return "먼저 CHUNK를 생성하세요."

    # Remove the previous on-disk store so the collection is built from scratch.
    # NOTE(review): this deletes the directory even if an earlier VECTORSTORE
    # from this process is still open on it - confirm Chroma tolerates that.
    if Path(CHROMA_DIR).exists():
        shutil.rmtree(CHROMA_DIR)

    VECTORSTORE = Chroma.from_documents(
        documents=CHUNKS,
        embedding=watson_embedding,
        persist_directory=CHROMA_DIR,
        collection_name=COLLECTION_NAME,
    )
    return f""" 생성 완료 Chunk: {len(CHUNKS)} Vector: {VECTORSTORE._collection.count()} """


# ==========
# Gradio UI
# ==========
with gr.Blocks() as app:
    gr.Markdown("# 사내 문서 RAG")

    with gr.Tab("문서관리"):
        files = gr.File(file_count="multiple")
        upload_btn = gr.Button("문서 업로드")
        upload_status = gr.Textbox()
        upload_btn.click(upload_files, files, upload_status)

        chunk_btn = gr.Button("chunk 확인")
        chunk_preview = gr.Textbox(lines=20)
        chunk_btn.click(preview_chunks, outputs=chunk_preview)

        vector_btn = gr.Button("vector DB 생성")
        vector_status = gr.Textbox()
        vector_btn.click(build_vectorstore, outputs=vector_status)

    # Placeholder tabs: not implemented yet.
    with gr.Tab("검색 테스트"):
        pass

    with gr.Tab("RAG 채팅"):
        pass

if __name__ == "__main__":
    app.launch()