
Merge pull request #12 from jonfairbanks/develop
GitHub RAG Support
jonfairbanks committed Feb 27, 2024
2 parents 6049b45 + 0d7bf49 commit a751c54
Showing 17 changed files with 525 additions and 307 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,2 +1,3 @@
*.pyc
data/*
data/*
*.log
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,4 +1,4 @@
FROM python:3.9-slim as base
FROM python:3.10-slim as base

# Setup env
ENV LANG C.UTF-8
2 changes: 1 addition & 1 deletion Pipfile
@@ -15,4 +15,4 @@ nbconvert = "*"
[dev-packages]

[requires]
python_version = "3.9"
python_version = "3.10"
502 changes: 250 additions & 252 deletions Pipfile.lock

Large diffs are not rendered by default.

17 changes: 12 additions & 5 deletions README.md
@@ -18,7 +18,7 @@ Ingest files for retrieval augmented generation (RAG) with open-source Large Lan
### Pre-Requisites

- A pre-existing Ollama instance
- Python 3.9+
- Python 3.10+

### Setup

@@ -36,7 +36,8 @@ Docker:
- Once complete, ask questions based on your documents!

### To Do
- [x] Refactor
- [x] Refactor into modules
- [ ] Refactor file processing logic
- [x] Migrate Chat Stream to Llama-Index
- [x] Implement Llama-Index Chat Engine with Memory
- [x] Swap to Llama-Index Chat Engine
@@ -49,15 +50,21 @@ Docker:
- [ ] chunk_overlap
- [x] Allow Switching of Embedding Model & Settings
- [x] Delete Files after Index Created/Failed
- [ ] Function to Handle GitHub Repo Ingestion
- [ ] Support for JSON Files
- [x] Support Additional Import Options
- [x] GitHub Repos
- [ ] Websites
- [ ] Remove File Type Limitations for Uploads
- [x] Show Loaders in UI (File Uploads, Conversions, ...)
- [x] Export Data (Uploaded Files, Chat History, ...)
- [x] View and Manage Imported Files
- [x] About Tab in Sidebar
- [x] Docker Support
- [ ] Implement Log Library
- [x] Implement Log Library
- [ ] Improve Logging
- [ ] Re-write Docstrings
- [ ] Additional Error Handling
- [x] Starting a chat without an Ollama model set
- [ ] Incorrect GitHub repos

### Known Issues & Bugs
- [ ] Refreshing the page loses all state (expected Streamlit behavior; need to implement local-storage)
4 changes: 2 additions & 2 deletions components/chatbox.py
@@ -6,9 +6,9 @@
def chatbox():
if prompt := st.chat_input("How can I help?"):
# Prevent submission if Ollama endpoint is not set
if not st.session_state["ollama_endpoint"]:
if not st.session_state["query_engine"]:
st.warning(
"Please set an Ollama Endpoint under Settings before continuing."
"Please confirm settings and upload files before proceeding."
)
st.stop()

14 changes: 7 additions & 7 deletions components/page_state.py
@@ -33,7 +33,7 @@ def set_initial_state():
st.session_state["messages"] = [
{
"role": "assistant",
"content": "Hello! Import some files or ingest a GitHub repo and we can get started.",
"content": "Welcome to Local RAG! To begin, please either import some files or ingest a GitHub repo. Once you've completed those steps, we can continue the conversation and explore how I can assist you further.",
}
]

@@ -65,14 +65,14 @@ def set_initial_state():
st.session_state["advanced"] = False

if "system_prompt" not in st.session_state:
st.session_state[
"system_prompt"
] = "You are a sophisticated virtual assistant designed to assist users in comprehensively understanding and extracting insights from a wide range of documents at their disposal. Your expertise lies in tackling complex inquiries and providing insightful analyses based on the information contained within these documents."
st.session_state["system_prompt"] = (
"You are a sophisticated virtual assistant designed to assist users in comprehensively understanding and extracting insights from a wide range of documents at their disposal. Your expertise lies in tackling complex inquiries and providing insightful analyses based on the information contained within these documents."
)

if "top_k" not in st.session_state:
st.session_state[
"top_k"
] = 3 # Default is 2; increasing to 5 will result in more documents being retrieved
st.session_state["top_k"] = (
3 # Default is 2; increasing to 5 will result in more documents being retrieved
)

if "embedding_model" not in st.session_state:
st.session_state["embedding_model"] = None
1 change: 0 additions & 1 deletion components/sidebar.py
@@ -2,7 +2,6 @@

from components.tabs.about import about
from components.tabs.file_upload import file_upload
from components.tabs.github_repo import github_repo
from components.tabs.settings import settings


54 changes: 41 additions & 13 deletions components/tabs/file_upload.py
@@ -6,6 +6,8 @@
import utils.ollama as ollama
import utils.llama_index as llama_index

from components.tabs.github_repo import github_repo


def file_upload():
st.title("Directly import your files")
@@ -14,19 +16,43 @@ def file_upload():

# Force users to confirm Settings before uploading files
if st.session_state["selected_model"] is not None:
uploaded_files = st.file_uploader(
"Select Files",
accept_multiple_files=True,
type=("csv", "docx", "epub", "ipynb", "json", "md", "pdf", "ppt", "pptx",),
)
file_upload_container = st.container(border=True)
with file_upload_container:
uploaded_files = st.file_uploader(
"Select Files",
accept_multiple_files=True,
type=(
"csv",
"docx",
"epub",
"ipynb",
"json",
"md",
"pdf",
"ppt",
"pptx",
),
)
else:
st.warning("Please configure Ollama settings before proceeding!", icon="⚠️")
uploaded_files = st.file_uploader(
"Select Files",
accept_multiple_files=True,
type=("csv", "docx", "epub", "ipynb", "json", "md", "pdf", "ppt", "pptx",),
disabled=True,
)
file_upload_container = st.container(border=True)
with file_upload_container:
uploaded_files = st.file_uploader(
"Select Files",
accept_multiple_files=True,
type=(
"csv",
"docx",
"epub",
"ipynb",
"json",
"md",
"pdf",
"ppt",
"pptx",
),
disabled=True,
)

if len(uploaded_files) > 0:
st.session_state["file_list"] = uploaded_files
@@ -120,7 +146,9 @@ def file_upload():

if error is not None:
status.update(
label="File processing failed.", state="error", expanded=True,
label="File processing failed.",
state="error",
expanded=True,
)
st.error(error)
else:
@@ -131,4 +159,4 @@
)

with st.expander("GitHub Repo", expanded=False):
st.write(":grey[Coming Soon™]")
github_repo()
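The shown hunks jump from the uploader widget to the "File processing failed." status, so the step that writes each `UploadedFile` to `data/` before indexing is elided from this diff. Below is a hedged sketch of what such a save helper typically looks like with Streamlit uploads; the function name and location are assumptions, not the repo's actual helper.

```python
import os
import streamlit as st

# Hypothetical helper; the repo's real save logic lives elsewhere
# (e.g. utils/helpers.py) and may differ.
def save_uploaded_files(uploaded_files, save_dir: str = "data") -> list[str]:
    """Persist Streamlit UploadedFile objects to disk so Llama-Index can read them."""
    os.makedirs(save_dir, exist_ok=True)
    saved_paths = []
    for uploaded_file in uploaded_files:
        path = os.path.join(save_dir, uploaded_file.name)
        # getbuffer() returns the raw bytes of the uploaded file
        with open(path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        saved_paths.append(path)
    return saved_paths
```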
133 changes: 122 additions & 11 deletions components/tabs/github_repo.py
@@ -1,25 +1,136 @@
import os
import shutil

import streamlit as st

import utils.helpers as func
import utils.ollama as ollama
import utils.llama_index as llama_index
import utils.logs as logs


def github_repo():
st.header("Import files from a GitHub repo")
st.caption("Convert a GitHub repo to embeddings for utilization during chat")

github_container = st.container(border=True)
with github_container:
# st.header("Import files from a GitHub repo")
# st.caption("Convert a GitHub repo to embeddings for utilization during chat")
if st.session_state["selected_model"] is not None:
st.text_input(
"GitHub repo",
"Select a GitHub.com repo",
placeholder="jonfairbanks/local-rag",
key="github_repo",
value=st.session_state.github_repo,
on_change=func.process_github_repo,
args=(st.session_state.github_repo,),
)

repo_processed = None
repo_processed = st.button(
"Process Repo",
on_click=func.clone_github_repo,
args=(st.session_state["github_repo"],),
) # TODO: Should this be with st.button?

with st.spinner("Processing..."):
if repo_processed is True:
error = None

######################################
# Create Llama-Index service-context #
# to use local LLMs and embeddings #
######################################

try:
llm = ollama.create_ollama_llm(
st.session_state["selected_model"],
st.session_state["ollama_endpoint"],
)
st.caption("✔️ LLM Initialized")

# resp = llm.complete("Hello!")
# print(resp)

# Determine embedding model to use

embedding_model = st.session_state["embedding_model"]
hf_embedding_model = None

if embedding_model == None:
logs.log.info("No embedding model set; using defaults...")
hf_embedding_model = "BAAI/bge-large-en-v1.5"

if embedding_model == "Default (bge-large-en-v1.5)":
logs.log.info("Using default embedding model...")
hf_embedding_model = "BAAI/bge-large-en-v1.5"

if embedding_model == "Large (Salesforce/SFR-Embedding-Mistral)":
logs.log.info(
"Using the Salesforce embedding model; RIP yer VRAM..."
)
hf_embedding_model = "Salesforce/SFR-Embedding-Mistral"

if embedding_model == "Other":
logs.log.info("Using a user-provided embedding model...")
hf_embedding_model = st.session_state["other_embedding_model"]

service_context = llama_index.create_service_context(
llm,
st.session_state["system_prompt"],
hf_embedding_model,
st.session_state["chunk_size"],
)
st.caption("✔️ Context Created")
except Exception as err:
logs.log.error(f"Setting up Service Context failed: {err}")
error = err

#######################################
# Load files from the data/ directory #
#######################################

try:
save_dir = os.getcwd() + "/data"
documents = llama_index.load_documents(save_dir)
st.session_state["documents"] = documents
st.caption("✔️ Processed File Data")
except Exception as err:
logs.log.error(f"Document Load Error: {err}")
error = err

###########################################
# Create an index from ingested documents #
###########################################

try:
llama_index.create_query_engine(documents, service_context)
st.caption("✔️ Created File Index")
except Exception as err:
logs.log.error(f"Index Creation Error: {err}")
error = err

#####################
# Remove data files #
#####################

try:
save_dir = os.getcwd() + "/data"
shutil.rmtree(save_dir)
st.caption("✔️ Removed Temp Files")
except Exception as err:
logs.log.error(f"Failed to delete data files: {err}")
error = err

#####################
# Show Final Status #
#####################

if error is not None:
st.exception(error)
else:
st.write("Your files are ready. Let's chat! 😎")

else:
st.text_input(
"Select a GitHub.com repo",
placeholder="jonfairbanks/local-rag",
disabled=True,
)
st.button(
"Process Repo",
on_click=func.process_github_repo,
args=(st.session_state.github_repo,),
disabled=True,
)
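The new tab wires its text input and button to `func.process_github_repo` and `func.clone_github_repo` from `utils/helpers.py`, neither of which appears in this diff. As an illustration only, a clone helper along those lines could shallow-clone the repo into the same `data/` directory the indexing code reads from; the signature, validation, and error handling below are assumptions rather than the repo's implementation.

```python
import os
import subprocess

import streamlit as st

# Hypothetical stand-in for utils.helpers.clone_github_repo; the actual
# helper may use a different signature or a dedicated GitHub loader instead.
def clone_github_repo(repo: str) -> bool:
    """Shallow-clone `owner/name` into ./data so load_documents() can pick it up."""
    if not repo or "/" not in repo:
        st.error("Please provide a repo in the form owner/name.")
        return False

    save_dir = os.path.join(os.getcwd(), "data")
    os.makedirs(save_dir, exist_ok=True)
    target = os.path.join(save_dir, repo.split("/")[-1])

    result = subprocess.run(
        ["git", "clone", "--depth", "1", f"https://github.com/{repo}.git", target],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        st.error(f"Clone failed: {result.stderr.strip()}")
        return False
    return True
```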
7 changes: 5 additions & 2 deletions components/tabs/settings.py
@@ -21,10 +21,13 @@ def settings():
on_change=ollama.get_models,
)
st.selectbox(
"Model", st.session_state["ollama_models"], key="selected_model",
"Model",
st.session_state["ollama_models"],
key="selected_model",
)
st.button(
"Refresh", on_click=ollama.get_models,
"Refresh",
on_click=ollama.get_models,
)
if st.session_state["advanced"] == True:
st.select_slider(
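The model selector is fed from `st.session_state["ollama_models"]` and refreshed via `ollama.get_models`, which is not part of this diff. For illustration only: Ollama exposes installed models over its REST API at `GET /api/tags`, so a fetch along these lines is plausible. The function body and session keys below are assumptions that mirror the diff, not the repo's `utils/ollama.py`.

```python
import requests
import streamlit as st

# Hypothetical sketch of utils.ollama.get_models; the repo's version may differ.
def get_models() -> None:
    """Populate st.session_state['ollama_models'] from the configured endpoint."""
    endpoint = st.session_state.get("ollama_endpoint", "http://localhost:11434")
    try:
        resp = requests.get(f"{endpoint.rstrip('/')}/api/tags", timeout=5)
        resp.raise_for_status()
        # /api/tags returns {"models": [{"name": "llama2:latest", ...}, ...]}
        st.session_state["ollama_models"] = [
            m["name"] for m in resp.json().get("models", [])
        ]
    except requests.RequestException as err:
        st.session_state["ollama_models"] = []
        st.warning(f"Could not reach Ollama at {endpoint}: {err}")
```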
Empty file removed data/.gitkeep
Empty file.
Binary file modified demo.gif
8 changes: 8 additions & 0 deletions main.py
@@ -1,3 +1,5 @@
import time

import streamlit as st

from components.chatbox import chatbox
@@ -7,6 +9,11 @@
from components.page_config import set_page_config
from components.page_state import set_initial_state

def generate_welcome_message(msg):
for char in msg:
time.sleep(0.025) # This is blocking :(
yield char

### Page Setup
set_page_config()
set_page_header()
@@ -16,6 +23,7 @@

for msg in st.session_state["messages"]:
st.chat_message(msg["role"]).write(msg["content"])
#st.chat_message(msg["role"]).write_stream(generate_welcome_message(msg['content']))

### Sidebar
sidebar()
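The new `generate_welcome_message` generator sleeps between characters, and the commented-out line hints that it is meant to be fed to `st.write_stream` for a typewriter effect. Below is a small usage sketch of how the two could fit together, assuming a Streamlit version that provides `st.write_stream` and assuming the effect is only wanted for the initial assistant greeting; this is not the repo's final rendering loop.

```python
import time
import streamlit as st

def generate_welcome_message(msg: str):
    # Yield one character at a time; the sleep creates the typewriter effect
    # (it blocks the script while it runs, as the diff's comment notes).
    for char in msg:
        time.sleep(0.025)
        yield char

# Hypothetical rendering loop: stream only the first assistant message,
# print the rest of the chat history normally.
for i, msg in enumerate(st.session_state["messages"]):
    with st.chat_message(msg["role"]):
        if i == 0 and msg["role"] == "assistant":
            st.write_stream(generate_welcome_message(msg["content"]))
        else:
            st.write(msg["content"])
```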