RAG, MMR, 임베딩 심화

2026-05-29 18:35:36 +09:00
parent b23b5e9b5f
commit d1db36883d
44 changed files with 3579 additions and 70 deletions
@@ -0,0 +1,197 @@
+from langchain_ollama import ChatOllama
+from langchain_ibm import ChatWatsonx
+from langchain_core.prompts import (
+    PromptTemplate,
+    ChatPromptTemplate,
+    MessagesPlaceholder,
+)
+from langchain_core.output_parsers import (
+    StrOutputParser,
+    JsonOutputParser,
+    PydanticOutputParser,
+)
+from langchain_core.runnables import (
+    RunnablePassthrough,
+    RunnableParallel,
+    RunnableLambda,
+)
+from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
+from langchain_core.chat_history import (
+    InMemoryChatMessageHistory,
+    BaseChatMessageHistory,
+)
+from langchain_core.runnables.history import RunnableWithMessageHistory
+from pydantic import BaseModel, Field
+from typing import Literal
+from dotenv import load_dotenv
+import os
+import gradio as gr
+from langchain_community.document_loaders import (
+    PyPDFLoader,
+    CSVLoader,
+    WebBaseLoader,
+    DirectoryLoader,
+)
+from youtube_transcript_api import YouTubeTranscriptApi
+from langchain_core.documents import Document
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_ollama import OllamaEmbeddings
+from langchain_ibm import WatsonxEmbeddings
+from langchain_chroma import Chroma
+from langchain_community.vectorstores import FAISS
+
+# 모델(LLM, Embeddding)
+load_dotenv()
+
+apikey = os.getenv("WATSONX_API_KEY")
+project_id = os.getenv("WATSONX_PROJECT_ID")
+watsonx_ai_url = os.getenv("WATSONX_URL")
+
+ollama_embedding = OllamaEmbeddings(model="nomic-embed-text-v2-moe")
+watson_llm = ChatWatsonx(
+    model_id="ibm/granite-4-h-small",
+    url=f"{watsonx_ai_url}",
+    api_key=f"{apikey}",
+    project_id=f"{project_id}",
+    max_tokens=2000,
+    params={"temperature": 0},
+)
+
+# qwen_llm = ChatOllama(model="qwen3.5:4b", temperature=0)
+
+
+# 1단계
+def process_pdf(pdf_file):
+    if pdf_file is None:
+        return ("PDF 파일을 업로드 해주세요.", "", "", "", "")
+
+    # PDF 로드
+    loader = PyPDFLoader(pdf_file)
+    docs = loader.load()
+
+    # 총 페이지수
+    total_pages = len(docs)
+
+    # 첫 페이지 내용
+    first_page_content = docs[0].page_content[:1000]
+
+    splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=30)
+    chunks = splitter.split_documents(docs)
+    # 총 chunk 수
+    total_chunks = len(chunks)
+
+    # 첫 번째 chunk
+    first_chunk_content = chunks[0].page_content
+
+    # 첫 번째 Chunk Metadata
+    first_chunk_metadata = chunks[0].metadata
+
+    return (
+        f"총 페이지 수 : {total_pages}",
+        first_page_content,
+        total_chunks,
+        first_chunk_content,
+        first_chunk_metadata,
+    )
+
+
+# 2단계
+def rag_chat(pdf_file, question):
+    if pdf_file is None:
+        return ("PDF 파일을 업로드 해주세요.", "")
+
+    # 1. PDF 로드
+    loader = PyPDFLoader(pdf_file)
+    docs = loader.load()
+
+    # 2. 분할
+    splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=30)
+    split_docs = splitter.split_documents(docs)
+
+    # 3. 임베딩
+    faiss_store = FAISS.from_documents(documents=split_docs, embedding=ollama_embedding)
+
+    # 4. 검색(retriever)
+    retriever = faiss_store.as_retriever(search_kwargs={"k": 3})
+    retriever_docs = retriever.invoke(question)
+
+    # 5. Context 생성
+    context = "\n\n".join([doc.page_content for doc in retriever_docs])
+
+    ### LLM
+    # 1. prompt
+    message = """\
+당신은 PDF 기반 RAG AI 입니다.
+다음 문서를 참고해서 질문에 답변하세요.
+
+문서:
+{context}
+
+질문:
+{question}
+"""
+
+    rag_prompt = ChatPromptTemplate.from_template(message)
+    # 2. chain
+    chain = rag_prompt | watson_llm | StrOutputParser()
+
+    # 3. answer
+    answer = chain.invoke({"context": context, "question": question})
+
+    # 4. 답변, rag 결과 반환
+    retrieved_text = ""
+    for i, doc in enumerate(retriever_docs, 1):
+        retrieved_text += f"""
+        [검색 문서 {i}]
+
+        내용:
+        {doc.page_content}
+
+        metadata:
+        {doc.metadata}
+        {'='*50}
+"""
+
+    return retrieved_text, answer
+
+
+with gr.Blocks() as demo:
+    gr.Markdown("# PDF RAG 학습 앱")
+
+    with gr.Tabs():
+        with gr.Tab("1단계 - PDF & Chunk 확인"):
+            # 파일 업로드 컴포넌트
+            pdf_input = gr.File(label="PDF 업로드", file_types=[".pdf"])
+            btn1 = gr.Button("분석 시작")
+            # textbox 5개
+            page_output = gr.Textbox(label="총 페이지 수")
+            first_output = gr.Textbox(label="첫 페이지 내용", lines=10)
+            chunk_output = gr.Textbox(label="총 chunk 수")
+            first_chunk_output = gr.Textbox(label="첫 번째 chunk", lines=10)
+            metadata_output = gr.Textbox(label="첫 번째 Chunk Metadata", lines=5)
+
+            btn1.click(
+                fn=process_pdf,
+                inputs=[pdf_input],
+                outputs=[
+                    page_output,
+                    first_output,
+                    chunk_output,
+                    first_chunk_output,
+                    metadata_output,
+                ],
+            )
+        with gr.Tab("2단계 - RAG QA"):
+            # 파일 업로드 컴포넌트
+            pdf_input = gr.File(label="PDF 업로드", file_types=[".pdf"])
+            question_input = gr.Textbox(label="질문 입력")
+            btn1 = gr.Button("질문하기")
+            retrieved_output = gr.Textbox(label="검색된 chunk", lines=20)
+            answer_output = gr.Textbox(label="최종 답변", lines=10)
+
+            btn1.click(
+                fn=rag_chat,
+                inputs=[pdf_input, question_input],
+                outputs=[retrieved_output, answer_output],
+            )
+demo.launch()