
Merge pull request #12 from jonfairbanks/develop
GitHub RAG Support
jonfairbanks committed Feb 27, 2024
2 parents 6049b45 + 0d7bf49 commit a751c54
Showing 17 changed files with 525 additions and 307 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,2 +1,3 @@
*.pyc
data/*
data/*
*.log
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,4 +1,4 @@
FROM python:3.9-slim as base
FROM python:3.10-slim as base

# Setup env
ENV LANG C.UTF-8
2 changes: 1 addition & 1 deletion Pipfile
@@ -15,4 +15,4 @@ nbconvert = "*"
[dev-packages]

[requires]
python_version = "3.9"
python_version = "3.10"
502 changes: 250 additions & 252 deletions Pipfile.lock

Large diffs are not rendered by default.

17 changes: 12 additions & 5 deletions README.md
@@ -18,7 +18,7 @@ Ingest files for retrieval augmented generation (RAG) with open-source Large Lan
### Pre-Requisites

- A pre-existing Ollama instance
- Python 3.9+
- Python 3.10+

### Setup

@@ -36,7 +36,8 @@ Docker:
- Once complete, ask questions based on your documents!

### To Do
- [x] Refactor
- [x] Refactor into modules
- [ ] Refactor file processing logic
- [x] Migrate Chat Stream to Llama-Index
- [x] Implement Llama-Index Chat Engine with Memory
- [x] Swap to Llama-Index Chat Engine
@@ -49,15 +50,21 @@ Docker:
- [ ] chunk_overlap
- [x] Allow Switching of Embedding Model & Settings
- [x] Delete Files after Index Created/Failed
- [ ] Function to Handle GitHub Repo Ingestion
- [ ] Support for JSON Files
- [x] Support Additional Import Options
- [x] GitHub Repos
- [ ] Websites
- [ ] Remove File Type Limitations for Uploads
- [x] Show Loaders in UI (File Uploads, Conversions, ...)
- [x] Export Data (Uploaded Files, Chat History, ...)
- [x] View and Manage Imported Files
- [x] About Tab in Sidebar
- [x] Docker Support
- [ ] Implement Log Library
- [x] Implement Log Library
- [ ] Improve Logging
- [ ] Re-write Docstrings
- [ ] Additional Error Handling
- [x] Starting a chat without an Ollama model set
- [ ] Incorrect GitHub repos

### Known Issues & Bugs
- [ ] Refreshing the page loses all state (expected Streamlit behavior; need to implement local-storage)
4 changes: 2 additions & 2 deletions components/chatbox.py
@@ -6,9 +6,9 @@
def chatbox():
if prompt := st.chat_input("How can I help?"):
# Prevent submission if Ollama endpoint is not set
if not st.session_state["ollama_endpoint"]:
if not st.session_state["query_engine"]:
st.warning(
"Please set an Ollama Endpoint under Settings before continuing."
"Please confirm settings and upload files before proceeding."
)
st.stop()

14 changes: 7 additions & 7 deletions components/page_state.py
@@ -33,7 +33,7 @@ def set_initial_state():
st.session_state["messages"] = [
{
"role": "assistant",
"content": "Hello! Import some files or ingest a GitHub repo and we can get started.",
"content": "Welcome to Local RAG! To begin, please either import some files or ingest a GitHub repo. Once you've completed those steps, we can continue the conversation and explore how I can assist you further.",
}
]

@@ -65,14 +65,14 @@ def set_initial_state():
st.session_state["advanced"] = False

if "system_prompt" not in st.session_state:
st.session_state[
"system_prompt"
] = "You are a sophisticated virtual assistant designed to assist users in comprehensively understanding and extracting insights from a wide range of documents at their disposal. Your expertise lies in tackling complex inquiries and providing insightful analyses based on the information contained within these documents."
st.session_state["system_prompt"] = (
"You are a sophisticated virtual assistant designed to assist users in comprehensively understanding and extracting insights from a wide range of documents at their disposal. Your expertise lies in tackling complex inquiries and providing insightful analyses based on the information contained within these documents."
)

if "top_k" not in st.session_state:
st.session_state[
"top_k"
] = 3 # Default is 2; increasing to 5 will result in more documents being retrieved
st.session_state["top_k"] = (
3 # Default is 2; increasing to 5 will result in more documents being retrieved
)

if "embedding_model" not in st.session_state:
st.session_state["embedding_model"] = None
1 change: 0 additions & 1 deletion components/sidebar.py
@@ -2,7 +2,6 @@

from components.tabs.about import about
from components.tabs.file_upload import file_upload
from components.tabs.github_repo import github_repo
from components.tabs.settings import settings


54 changes: 41 additions & 13 deletions components/tabs/file_upload.py
@@ -6,6 +6,8 @@
import utils.ollama as ollama
import utils.llama_index as llama_index

from components.tabs.github_repo import github_repo


def file_upload():
st.title("Directly import your files")
@@ -14,19 +16,43 @@ def file_upload():

# Force users to confirm Settings before uploading files
if st.session_state["selected_model"] is not None:
uploaded_files = st.file_uploader(
"Select Files",
accept_multiple_files=True,
type=("csv", "docx", "epub", "ipynb", "json", "md", "pdf", "ppt", "pptx",),
)
file_upload_container = st.container(border=True)
with file_upload_container:
uploaded_files = st.file_uploader(
"Select Files",
accept_multiple_files=True,
type=(
"csv",
"docx",
"epub",
"ipynb",
"json",
"md",
"pdf",
"ppt",
"pptx",
),
)
else:
st.warning("Please configure Ollama settings before proceeding!", icon="⚠️")
uploaded_files = st.file_uploader(
"Select Files",
accept_multiple_files=True,
type=("csv", "docx", "epub", "ipynb", "json", "md", "pdf", "ppt", "pptx",),
disabled=True,
)
file_upload_container = st.container(border=True)
with file_upload_container:
uploaded_files = st.file_uploader(
"Select Files",
accept_multiple_files=True,
type=(
"csv",
"docx",
"epub",
"ipynb",
"json",
"md",
"pdf",
"ppt",
"pptx",
),
disabled=True,
)

if len(uploaded_files) > 0:
st.session_state["file_list"] = uploaded_files
@@ -120,7 +146,9 @@ def file_upload():

if error is not None:
status.update(
label="File processing failed.", state="error", expanded=True,
label="File processing failed.",
state="error",
expanded=True,
)
st.error(error)
else:
@@ -131,4 +159,4 @@
)

with st.expander("GitHub Repo", expanded=False):
st.write(":grey[Coming Soon™]")
github_repo()
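The shown hunks jump from the uploader widget to the "File processing failed." status, so the step that writes each `UploadedFile` to `data/` before indexing is elided from this diff. Below is a hedged sketch of what such a save helper typically looks like with Streamlit uploads; the function name and location are assumptions, not the repo's actual helper.

```python
import os
import streamlit as st

# Hypothetical helper; the repo's real save logic lives elsewhere
# (e.g. utils/helpers.py) and may differ.
def save_uploaded_files(uploaded_files, save_dir: str = "data") -> list[str]:
    """Persist Streamlit UploadedFile objects to disk so Llama-Index can read them."""
    os.makedirs(save_dir, exist_ok=True)
    saved_paths = []
    for uploaded_file in uploaded_files:
        path = os.path.join(save_dir, uploaded_file.name)
        # getbuffer() returns the raw bytes of the uploaded file
        with open(path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        saved_paths.append(path)
    return saved_paths
```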
133 changes: 122 additions & 11 deletions components/tabs/github_repo.py
@@ -1,25 +1,136 @@
import os
import shutil

import streamlit as st

import utils.helpers as func
import utils.ollama as ollama
import utils.llama_index as llama_index
import utils.logs as logs


def github_repo():
st.header("Import files from a GitHub repo")
st.caption("Convert a GitHub repo to embeddings for utilization during chat")

github_container = st.container(border=True)
with github_container:
# st.header("Import files from a GitHub repo")
# st.caption("Convert a GitHub repo to embeddings for utilization during chat")
if st.session_state["selected_model"] is not None:
st.text_input(
"GitHub repo",
"Select a GitHub.com repo",
placeholder="jonfairbanks/local-rag",
key="github_repo",
value=st.session_state.github_repo,
on_change=func.process_github_repo,
args=(st.session_state.github_repo,),
)

repo_processed = None
repo_processed = st.button(
"Process Repo",
on_click=func.clone_github_repo,
args=(st.session_state["github_repo"],),
) # TODO: Should this be with st.button?

with st.spinner("Processing..."):
if repo_processed is True:
error = None

######################################
# Create Llama-Index service-context #
# to use local LLMs and embeddings #
######################################

try:
llm = ollama.create_ollama_llm(
st.session_state["selected_model"],
st.session_state["ollama_endpoint"],
)
st.caption("✔️ LLM Initialized")

# resp = llm.complete("Hello!")
# print(resp)

# Determine embedding model to use

embedding_model = st.session_state["embedding_model"]
hf_embedding_model = None

if embedding_model == None:
logs.log.info("No embedding model set; using defaults...")
hf_embedding_model = "BAAI/bge-large-en-v1.5"

if embedding_model == "Default (bge-large-en-v1.5)":
logs.log.info("Using default embedding model...")
hf_embedding_model = "BAAI/bge-large-en-v1.5"

if embedding_model == "Large (Salesforce/SFR-Embedding-Mistral)":
logs.log.info(
"Using the Salesforce embedding model; RIP yer VRAM..."
)
hf_embedding_model = "Salesforce/SFR-Embedding-Mistral"

if embedding_model == "Other":
logs.log.info("Using a user-provided embedding model...")
hf_embedding_model = st.session_state["other_embedding_model"]

service_context = llama_index.create_service_context(
llm,
st.session_state["system_prompt"],
hf_embedding_model,
st.session_state["chunk_size"],
)
st.caption("✔️ Context Created")
except Exception as err:
logs.log.error(f"Setting up Service Context failed: {err}")
error = err

#######################################
# Load files from the data/ directory #
#######################################

try:
save_dir = os.getcwd() + "/data"
documents = llama_index.load_documents(save_dir)
st.session_state["documents"] = documents
st.caption("✔️ Processed File Data")
except Exception as err:
logs.log.error(f"Document Load Error: {err}")
error = err

###########################################
# Create an index from ingested documents #
###########################################

try:
llama_index.create_query_engine(documents, service_context)
st.caption("✔️ Created File Index")
except Exception as err:
logs.log.error(f"Index Creation Error: {err}")
error = err

#####################
# Remove data files #
#####################

try:
save_dir = os.getcwd() + "/data"
shutil.rmtree(save_dir)
st.caption("✔️ Removed Temp Files")
except Exception as err:
logs.log.error(f"Failed to delete data files: {err}")
error = err

#####################
# Show Final Status #
#####################

if error is not None:
st.exception(error)
else:
st.write("Your files are ready. Let's chat! 😎")

else:
st.text_input(
"Select a GitHub.com repo",
placeholder="jonfairbanks/local-rag",
disabled=True,
)
st.button(
"Process Repo",
on_click=func.process_github_repo,
args=(st.session_state.github_repo,),
disabled=True,
)
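The new tab wires its text input and button to `func.process_github_repo` and `func.clone_github_repo` from `utils/helpers.py`, neither of which appears in this diff. As an illustration only, a clone helper along those lines could shallow-clone the repo into the same `data/` directory the indexing code reads from; the signature, validation, and error handling below are assumptions rather than the repo's implementation.

```python
import os
import subprocess

import streamlit as st

# Hypothetical stand-in for utils.helpers.clone_github_repo; the actual
# helper may use a different signature or a dedicated GitHub loader instead.
def clone_github_repo(repo: str) -> bool:
    """Shallow-clone `owner/name` into ./data so load_documents() can pick it up."""
    if not repo or "/" not in repo:
        st.error("Please provide a repo in the form owner/name.")
        return False

    save_dir = os.path.join(os.getcwd(), "data")
    os.makedirs(save_dir, exist_ok=True)
    target = os.path.join(save_dir, repo.split("/")[-1])

    result = subprocess.run(
        ["git", "clone", "--depth", "1", f"https://github.com/{repo}.git", target],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        st.error(f"Clone failed: {result.stderr.strip()}")
        return False
    return True
```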
7 changes: 5 additions & 2 deletions components/tabs/settings.py
@@ -21,10 +21,13 @@ def settings():
on_change=ollama.get_models,
)
st.selectbox(
"Model", st.session_state["ollama_models"], key="selected_model",
"Model",
st.session_state["ollama_models"],
key="selected_model",
)
st.button(
"Refresh", on_click=ollama.get_models,
"Refresh",
on_click=ollama.get_models,
)
if st.session_state["advanced"] == True:
st.select_slider(
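The model selector is fed from `st.session_state["ollama_models"]` and refreshed via `ollama.get_models`, which is not part of this diff. For illustration only: Ollama exposes installed models over its REST API at `GET /api/tags`, so a fetch along these lines is plausible. The function body and session keys below are assumptions that mirror the diff, not the repo's `utils/ollama.py`.

```python
import requests
import streamlit as st

# Hypothetical sketch of utils.ollama.get_models; the repo's version may differ.
def get_models() -> None:
    """Populate st.session_state['ollama_models'] from the configured endpoint."""
    endpoint = st.session_state.get("ollama_endpoint", "http://localhost:11434")
    try:
        resp = requests.get(f"{endpoint.rstrip('/')}/api/tags", timeout=5)
        resp.raise_for_status()
        # /api/tags returns {"models": [{"name": "llama2:latest", ...}, ...]}
        st.session_state["ollama_models"] = [
            m["name"] for m in resp.json().get("models", [])
        ]
    except requests.RequestException as err:
        st.session_state["ollama_models"] = []
        st.warning(f"Could not reach Ollama at {endpoint}: {err}")
```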
Empty file removed data/.gitkeep
Empty file.
Binary file modified demo.gif
8 changes: 8 additions & 0 deletions main.py
@@ -1,3 +1,5 @@
import time

import streamlit as st

from components.chatbox import chatbox
@@ -7,6 +9,11 @@
from components.page_config import set_page_config
from components.page_state import set_initial_state

def generate_welcome_message(msg):
for char in msg:
time.sleep(0.025) # This is blocking :(
yield char

### Page Setup
set_page_config()
set_page_header()
@@ -16,6 +23,7 @@

for msg in st.session_state["messages"]:
st.chat_message(msg["role"]).write(msg["content"])
#st.chat_message(msg["role"]).write_stream(generate_welcome_message(msg['content']))

### Sidebar
sidebar()
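The new `generate_welcome_message` generator sleeps between characters, and the commented-out line hints that it is meant to be fed to `st.write_stream` for a typewriter effect. Below is a small usage sketch of how the two could fit together, assuming a Streamlit version that provides `st.write_stream` and assuming the effect is only wanted for the initial assistant greeting; this is not the repo's final rendering loop.

```python
import time
import streamlit as st

def generate_welcome_message(msg: str):
    # Yield one character at a time; the sleep creates the typewriter effect
    # (it blocks the script while it runs, as the diff's comment notes).
    for char in msg:
        time.sleep(0.025)
        yield char

# Hypothetical rendering loop: stream only the first assistant message,
# print the rest of the chat history normally.
for i, msg in enumerate(st.session_state["messages"]):
    with st.chat_message(msg["role"]):
        if i == 0 and msg["role"] == "assistant":
            st.write_stream(generate_welcome_message(msg["content"]))
        else:
            st.write(msg["content"])
```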