# -*- coding: utf-8 -*-
"""Codebase RAG.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1r-_PXjAV8EHzDtVc0mJHtZsiWDP1jJ4j
"""
# pip install pygithub langchain langchain-community openai tiktoken pinecone-client langchain_pinecone sentence_transformers
from sentence_transformers import SentenceTransformer
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from pinecone import Pinecone
import os
from git import Repo
from openai import OpenAI
from langchain.schema import Document
from dotenv import load_dotenv
load_dotenv()
#make sure to try out openai embeddings
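# A minimal sketch of the OpenAI-embeddings experiment noted above. Assumptions: an
# OPENAI_API_KEY env var is set, and the model name is a reasonable default rather
# than the author's choice. Not wired into the pipeline yet; swap it in for
# HuggingFaceEmbeddings() in create_pinecone_namespace to compare.
def get_openai_embedding_model():
    return OpenAIEmbeddings(model='text-embedding-3-small')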
# Clone the GitHub repo into the current working directory and return the local path.
def clone_repo(repo_url):
    repo_name = repo_url.split('/')[-1]
    current_directory = os.getcwd()
    clone_path_cwd = os.path.join(current_directory, repo_name)
    # Re-running against an already-cloned repo would raise, so reuse the checkout.
    if os.path.exists(clone_path_cwd):
        return clone_path_cwd
    Repo.clone_from(repo_url, clone_path_cwd)
    return clone_path_cwd
# path = clone_repo('https://github.com/lalva224/SecureAgent_lalva224')
# path = '/content/SecureAgent_lalva224'
# Avoid parsing irrelevant files.
SUPPORTED_EXTENSIONS = {'.py', '.js', '.ts', '.tsx', '.jsx', '.java', '.cpp'}
# Directories we want to ignore completely, such as node_modules.
IGNORED_DIRS = {'node_modules', '.git', 'dist', '__pycache__', '.next', '.vscode', '.env', 'venv', 'virtual'}
def get_file_content(file_path, clone_path_cwd):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            # Read the program file's contents.
            content = f.read()
        # Path relative to the clone root, so metadata identifies each file.
        rel_path = os.path.relpath(file_path, clone_path_cwd)
        print(rel_path)
        return {
            'name': rel_path,
            'content': content
        }
    except Exception as e:
        print(f'Error reading file {file_path}: {e}')
        return None
def get_main_files(clone_path_cwd: str):
    """
    Get content of supported code files from the local repository.
    Args:
        clone_path_cwd: Path to the local repository
    Returns:
        List of dictionaries containing file names and contents
    """
    files_content = []
    try:
        # os.walk yields (root, dirs, files) under clone_path_cwd; root is the current directory path.
        print(clone_path_cwd)
        for root, dirs, files in os.walk(clone_path_cwd):
            # Prune ignored directories in place so os.walk never descends into them.
            dirs[:] = [d for d in dirs if d not in IGNORED_DIRS]
            # Process each file in the current directory.
            for file in files:
                file_path = os.path.join(root, file)
                # Split into root and extension; keep only supported code files.
                if os.path.splitext(file)[1] in SUPPORTED_EXTENSIONS:
                    file_content = get_file_content(file_path, clone_path_cwd)
                    if file_content:
                        files_content.append(file_content)
    except Exception as e:
        print(f'Error processing files: {e}')
    return files_content
# Chunk splitting could be a challenge: ideally extract every single function within a file and embed those.
# Also look at generating docstrings stating what each function does, its params, and return type.
# Worth doing per language too, at least Python, JavaScript, TypeScript, and Java; see the sketch below.
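# A minimal sketch of per-language chunking (one idea from the notes above), using
# LangChain's RecursiveCharacterTextSplitter.from_language, which splits on
# language-aware separators (def/class for Python, function blocks for JS, etc.).
# The extension-to-Language mapping and chunk sizes here are assumptions, not tuned;
# this helper is not wired into the pipeline yet.
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language

EXTENSION_TO_LANGUAGE = {
    '.py': Language.PYTHON,
    '.js': Language.JS,
    '.jsx': Language.JS,
    '.ts': Language.TS,
    '.tsx': Language.TS,
    '.java': Language.JAVA,
    '.cpp': Language.CPP,
}

def chunk_file(file_dict):
    # file_dict is the {'name', 'content'} shape returned by get_file_content.
    ext = os.path.splitext(file_dict['name'])[1]
    language = EXTENSION_TO_LANGUAGE.get(ext, Language.PYTHON)
    splitter = RecursiveCharacterTextSplitter.from_language(
        language=language, chunk_size=1000, chunk_overlap=100
    )
    # Each chunk keeps the source file in metadata so retrieval can cite it.
    return splitter.create_documents(
        [file_dict['content']], metadatas=[{'source': file_dict['name']}]
    )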
def get_hugging_face_embeddings(text, model_name='sentence-transformers/all-mpnet-base-v2'):
    # Note: this reloads the model on every call; caching it at module level would be faster.
    model = SentenceTransformer(model_name)
    return model.encode(text)
# text = 'I am a programmer'
# embeddings = get_hugging_face_embeddings(text)
def create_pinecone_namespace(clone_path_cwd, repo_url: str):
    file_content = get_main_files(clone_path_cwd)
    pinecone_api_key = os.getenv('PINECONE_API_KEY')
    # In Colab this is necessary because some libraries expect the API key as a system-level env var:
    # os.environ['PINECONE_API_KEY'] = pinecone_api_key
    pc = Pinecone(api_key=pinecone_api_key)
    pinecone_index = pc.Index('codebase-rag')
    # Document comes from LangChain and provides additional integrations we can use, like sending documents.
    documents = []
    for file in file_content:
        doc = Document(
            page_content=f"{file['content']}\n {file['name']}",
            # Metadata is useful for filtering, to know what each entry is.
            metadata={"source": file['name']}
        )
        documents.append(doc)
    # Embed with HuggingFace embeddings (the all-mpnet-base-v2 default, matching the query side).
    # Documents are stored as pieces of text inside the vector store; the vector store is a
    # database of embeddings backed by Pinecone (or elsewhere).
    vectorstore = PineconeVectorStore.from_documents(
        documents=documents,
        embedding=HuggingFaceEmbeddings(),
        index_name='codebase-rag',
        # A namespace is like a subdivision within an index.
        namespace=repo_url
    )
    print('upserted into pinecone')
    stats = pinecone_index.describe_index_stats()
    print(stats)
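# A hedged alternative sketch: since documents are upserted through PineconeVectorStore,
# retrieval could also go through the same wrapper instead of querying the index by hand
# (same 'codebase-rag' index and repo_url namespace as above; the query text is made up):
# vectorstore = PineconeVectorStore(index_name='codebase-rag',
#                                   embedding=HuggingFaceEmbeddings(),
#                                   namespace=repo_url)
# docs = vectorstore.similarity_search('where is authentication handled?', k=5)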
def perform_rag(query, pinecone_namespace):
    print('performing rag')
    print(query)
    print(pinecone_namespace)
    # Groq serves an OpenAI-compatible API, so the OpenAI client works with a different base_url.
    client = OpenAI(
        base_url='https://api.groq.com/openai/v1',
        api_key=os.getenv('GROQ_API_KEY')
    )
    pinecone_api_key = os.getenv('PINECONE_API_KEY')
    pc = Pinecone(api_key=pinecone_api_key)
    pinecone_index = pc.Index('codebase-rag')
    query_embedding = get_hugging_face_embeddings(query)
    top_matches = pinecone_index.query(
        vector=query_embedding.tolist(),
        top_k=5,
        include_metadata=True,
        namespace=pinecone_namespace
    )
    # PineconeVectorStore stores each document's page_content under the 'text' metadata key by default.
    contexts = [item['metadata']['text'] for item in top_matches['matches']]
    augmented_query = "<CONTEXT>\n" + "\n\n------\n\n".join(contexts[:10]) + "\n-------\n</CONTEXT>\n\n\n MY QUESTION: \n" + query
    system_prompt = """
    You are an expert Software Engineer, with over 20 years of experience in Typescript.
    Answer any questions I have about the codebase, based on all the context provided.
    Always consider all of the context provided when forming a response.
    Let's think step by step.
    """
    llm_response = client.chat.completions.create(
        model='llama-3.1-8b-instant',
        messages=[
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': augmented_query}
        ]
    )
    response = llm_response.choices[0].message.content
    print(response)
    return response
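# Example usage (assumes create_pinecone_namespace already ran for this repo, so the
# namespace exists; the question string is just an illustration):
# answer = perform_rag('What does the clone_repo function do?',
#                      'https://github.com/lalva224/SecureAgent_lalva224')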
def remove_pinecone_index(namespace):
    pinecone_api_key = os.getenv('PINECONE_API_KEY')
    pc = Pinecone(api_key=pinecone_api_key)
    index = pc.Index('codebase-rag')
    # This deletes all vectors in one namespace, not the whole 'codebase-rag' index.
    index.delete(delete_all=True, namespace=namespace)
    print('deleted namespace')
# Example end-to-end run:
# repo_url = 'https://github.com/lalva224/SecureAgent_lalva224'
# clone_path_cwd = clone_repo(repo_url)
# print(clone_path_cwd)
# print(get_main_files(clone_path_cwd))
# create_pinecone_namespace(clone_path_cwd, repo_url)