diff --git a/search.py b/search.py
new file mode 100755
index 0000000..3644ace
--- /dev/null
+++ b/search.py
@@ -0,0 +1,371 @@
+# encoding: utf-8
+import sys
+import subprocess
+import re
+import traceback
+import os
+import hashlib
+from django.db import connection, transaction
+import itertools
+import time
+from django.utils.encoding import smart_str, smart_unicode
+def index_entire_history(file_path, page_id, *args, **kwargs):
+    print "indexing %s - %s" % (file_path.split('/')[2], file_path.split("/")[5])
+    try:
+        file_path = file_path.strip('/')
+
+        # not a git repo
+        if not os.path.exists("%s/.git" % file_path):
+            return
+
+        command = "cd \"%s\" && git log --reverse --format=format:\"%%H\"" % (file_path)
+        lines = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE).communicate()[0].splitlines()
+    except:
+        return (0, 1)
+    commit_count = 0
+    fail_count = 0
+    # index each commit starting from the earliest
+    for (i, line) in enumerate(lines):
+        try:
+            index_commit(file_path, page_id, commit=line)
+
+            # progress bar lol
+            total_width = 60
+            current_width = (i + 1) * total_width / len(lines)
+            sys.stdout.write("\r|" + ('-' * current_width).ljust(total_width) + "|" + "%d%%" % ((i + 1) * 100 / len(lines)))
+
+            commit_count += 1
+        except:
+            traceback.print_exc(file=sys.stdout)
+            fail_count += 1
+    print "\nDone!"
+    return (commit_count, fail_count)
+
+#TODO: return total num of results, better ordering (tf-idf?), and COMMENT
+def search(search_query, start):
+    results = []
+    subqueries = get_valid_keywords(search_query)
+    con = connection.cursor()
+    subresults = []
+    pages_hit = {}
+    for q in subqueries:
+        query = "\
+        SELECT page, max(frequency * exp(-3*head)) as score FROM wiki_keyword where keyword = %s\
+        GROUP BY page ORDER BY score DESC LIMIT 20 OFFSET %s\
+        "
+        con.execute(query, (q, start))
+        for (i, row) in enumerate(con.fetchall()):
+            page = row[0]
+            if page not in pages_hit:
+                pages_hit[page] = {"priority": 1.0 + (10.0 - i) / 10, "words": [q], "id": page}
+            else:
+                pages_hit[page]["priority"] += 1.0 + (10.0 - i) / 10
+                pages_hit[page]["words"].append(q)
+    pages = sorted(pages_hit.values(), key=lambda k: k["priority"], reverse=True)[:10]
+    commit_queries = []
+    for page in pages:
+        commits_hit = {}
+        lines_hit = {}
+        for word in page["words"]:
+            con.execute("SELECT \"commit\" from wiki_keyword where page = %s and keyword = %s", (page["id"], word))
+            for row in con.fetchall():
+                if row[0] in commits_hit:
+                    commits_hit[row[0]]["count"] += 1
+                else:
+                    commits_hit[row[0]] = {"commit": row[0], "count": 1}
+            con.execute("SELECT head_line_num from wiki_keywordlocation where head = 0 and page = %s and word = %s", (page["id"], word))
+            for row in con.fetchall():
+                if row[0] in lines_hit:
+                    lines_hit[row[0]]["count"] += 1
+                else:
+                    lines_hit[row[0]] = {"line": row[0], "count": 1}
+        page["commits"] = sorted(commits_hit.values(), key=lambda k: k["count"], reverse=True)
+        page["lines"] = sorted(lines_hit.values(), key=lambda k: k["count"], reverse=True)[:3]
+    return pages
+
+@transaction.commit_manually()
+def index_commit(file_path, page_id, *args, **kwargs):
+    start = time.clock()
+    # indexes the head by default, but can index past commits
+    commit = kwargs.get("commit", "HEAD")
+
+    conn = kwargs.get("db", connection)
+    file_path = file_path.strip('/')
+    command = "cd \"%s\" && git show --format=format:\"%%H%%n%%ct\" %s" % (file_path, commit)
+    lines = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE).communicate()[0].splitlines()
+
+
+    commit = None
+    date = None
+    current_hunk = None
+
+
+    words_changed = {}
+    words_to_add = []
+
+    # we don't really care about what word was deleted, all the words in the line are deleted
+    lines_deleted = []
+
+    # used to calculate the positions of words in the older commit now in the newest version, with added/removed lines
+    line_offset = []
+
+    # parse the diff to index the new commit
+    for (i, line) in enumerate(lines):
+
+        # first two lines are the commit and the date respectively
+        if i == 0:
+            commit = line
+        if i == 1:
+            date = line
+
+        # a hunk header, we're starting a hunk, parse the header
+        if line[0:2] == "@@":
+            # hunk header is in the format of
+            # @@ -\d,\d +\d,\d @@ line-content
+            hunk_pattern = re.compile(r"@@ -([\d]+),?([\d]*) \+([\d]+),?([\d]*) @@.*")
+            m = hunk_pattern.match(line)
+            current_hunk = {}
+            current_hunk["oldstart"] = int(m.group(1))
+
+            # sometimes the range of the hunk is omitted because it's 1
+            if len(m.group(2)):
+                current_hunk["oldlen"] = int(m.group(2))
+            else:
+                current_hunk["oldlen"] = 1
+            current_hunk["newstart"] = int(m.group(3))
+            if len(m.group(4)):
+                current_hunk["newlen"] = int(m.group(4))
+            else:
+                current_hunk["newlen"] = 1
+
+            # 0 lines read for this current hunk
+            current_hunk["oldread"] = 0
+            current_hunk["newread"] = 0
+
+        # not inside a hunk, and not a hunk header, so no need to parse
+        if not current_hunk or len(line) == 0:
+            continue
+
+        # inserted line
+        if line[0] == '+':
+
+            # line number in the current commit
+            line_num = current_hunk["newstart"] + current_hunk["newread"]
+            old_line_num = current_hunk["oldstart"] + current_hunk["oldread"]
+
+            # the start of a + block
+            # sometimes a blank line is put between every line of the hunk for some reason
+            prev_line = lines[i - 1] if len(lines[i - 1]) else lines[i - 2]
+            if prev_line[0] != line[0]:
+                line_offset.append({"line_num": old_line_num, "len": 1})
+            # middle of a + block
+            else:
+                line_offset[-1]["len"] += 1
+
+            keywords = get_valid_keywords(line)
+            if len(keywords) == 0:
+                current_hunk["newread"] += 1
+                continue
+
+            for (word_pos, word) in enumerate(keywords):
+                hash = None
+                if word not in words_changed:
+                    hash = "%d-%s-%s" % (page_id, hashlib.md5(word).hexdigest()[:20], commit[:10])
+                    words_changed[word] = {"count": 1, "hash": hash}
+                else:
+                    hash = words_changed[word]["hash"]
+                    words_changed[word]["count"] += 1
+                id = "%d-%d-%s-%d" % (page_id, word_pos, commit[:20], line_num)
+                words_to_add.append({"id": id, "word": word, "hash": hash, "line": line_num, "pos": word_pos})
+
+            current_hunk["newread"] += 1
+
+        # deleted line
+        elif line[0] == '-':
+            # line number in the older version
+            line_num = current_hunk["oldstart"] + current_hunk["oldread"]
+
+            # the start of a - block
+            prev_line = lines[i - 1] if len(lines[i - 1]) else lines[i - 2]
+            if prev_line[0] != line[0]:
+                # have to add one because the line is deleted, so every line starting from the line after it will
+                # be shifted
+                line_offset.append({"line_num": line_num + 1, "len": -1})
+            # middle of a - block
+            else:
+                line_offset[-1]["len"] -= 1
+                line_offset[-1]["line_num"] = line_num + 1
+
+            keywords = get_valid_keywords(line)
+            if len(keywords) == 0:
+                current_hunk["oldread"] += 1
+                continue
+
+            for (word_pos, word) in enumerate(keywords):
+                hash = None
+                if word not in words_changed:
+                    hash = "%d-%s-%s" % (page_id, hashlib.md5(word).hexdigest()[:20], commit[:10])
+                    words_changed[word] = {"count": -1, "hash": hash}
+                else:
+                    words_changed[word]["count"] -= 1
+
+            lines_deleted.append(line_num)
+
+            current_hunk["oldread"] += 1
+
+        # unchanged line
+        elif line[0] == ' ':
+            line_num = current_hunk["newstart"] + current_hunk["newread"]
= current_hunk["newstart"] + current_hunk["newread"] + old_line_num = current_hunk["oldstart"] + current_hunk["oldread"] + + current_hunk["newread"] += 1 + current_hunk["oldread"] += 1 + + def split_seq(iterable, size): + it = iter(iterable) + item = list(itertools.islice(it, size)) + while item: + yield item + item = list(itertools.islice(it, size)) + + cursor = conn.cursor() + + try: + #add the changes to the word base in this commit, no positional information yet + insert_commit_changes(words_changed, page_id, commit, date, cursor) + for chunk in split_seq(lines_deleted, 100): + #the words in the deleted lines are no longer in the head + unflag_deleted(chunk, page_id, cursor) + #the position of the words in the unmodified lines is pushed to different positions + shift_older_lines(line_offset, page_id, cursor) + except: + transaction.rollback() + else: + #have to commit here so the new keywords don't collide with the old ones(which are shifted from the commit) + transaction.commit() + try: + #finally add the new words to the index + add_new_words(words_to_add, page_id, cursor) + except: + transaction.rollback() + else: + transaction.commit() + +def add_new_words(words, page, cursor): + values = [] + for word in words: + value = (word["id"], word["hash"], word["word"], page, word["line"], word["pos"], word["line"]) + values.append(value) + query = "INSERT INTO wiki_keywordlocation values (%s,%s,%s,%s,%s,%s,0,%s,0)" + try: + cursor.executemany(query, values) + except: + traceback.print_exc(file=sys.stdout) + raise Exception + +def shift_older_lines(offsets, page, cursor): + highest_affected = sys.maxint + #have to use temp because we're want to modify head_line_num based on the old value of head_line_num + cursor.execute("UPDATE wiki_keywordlocation SET temp = head_line_num WHERE page = %d and head = 0" % (page)) + for offset in offsets: + loc = offset["line_num"] + if loc < highest_affected: + highest_affected = loc + len = offset["len"] + query = "UPDATE wiki_keywordlocation SET temp = temp + %d WHERE page = %d and head_line_num >= %d and head = 0" % (len, page, loc) + try: + cursor.execute(query) + except: + + traceback.print_exc(file=sys.stdout) + raise Exception + cursor.execute("UPDATE wiki_keywordlocation SET head_line_num = temp WHERE page = %d and head_line_num >= %d and head = 0" % (page, highest_affected)) + +def unflag_deleted(lines, page, cursor): + query = "UPDATE wiki_keywordlocation SET head = head +1 WHERE page = " + str(page) + " and head_line_num = %s" + try: + cursor.executemany(query, [(k,) for k in lines]) + except: + print query + traceback.print_exc(file=sys.stdout) + raise Exception + + +def insert_commit_changes(words, page, commit, date, cursor): + start = time.clock() + values = [] + query = u"UPDATE wiki_keyword SET head = head + 1 WHERE page = " + str(page) + " and keyword = %s" + try: + cursor.executemany(query, [(k,) for k in words.keys()]) + except: + print query + traceback.print_exc(file=sys.stdout) + raise Exception + for word in words.keys(): + frequency = words[word]["count"] + value = (words[word]["hash"], page, word, commit, word, page, frequency, frequency, date) + values.append(value) + + query = u"INSERT INTO wiki_keyword values(%s,%s,%s,%s,COALESCE((SELECT frequency FROM wiki_keyword WHERE keyword = %s and page = %s and head = 1)+%s,%s),%s,0)" + try: + cursor.executemany(query, values) + except: + traceback.print_exc(file=sys.stdout) + raise Exception + + +ignore_words = ["the", "be", "to", "of", "and", "a", + "in", "that", "have", "I", "it", 
"for", + "not", "on", "with", "he", "as", "you", "why", + "do", "at", "this", "but", "his", "by", "where", + "from", "they", "we", "say", "her", "she", "name", + "or", "an", "will", "my", "one", "all", + "would", "there", "their", "what", "so", + "up", "out", "if", "about", "who", "get", + "which", "go", "me", "when", "make", "can", + "like", "time", "no", "just", "him", "know", + "take", "person", "into", "year", "your", + "good", "some", "could", "them", "see", "number", + "other", "than", "then", "now", "look", "only", + "come", "its", "over", "think", "also", "back", "has", + "after", "use", "two", "how", "our", "work", "usually", + "first", "well", "way", "even", "new", "want", "ago", "com", + "because", "any", "these", "give", "day", "are", "dont", "don" + "most", "us", "toc", "iff", "neg", "let", "lor"] + +def get_valid_keywords(line): + line = smart_str(line) + #split on delimiters + delim = re.compile(r"[\.,:;\[\]\(\)\-_\\/=\+\}\{><\|]") + line = delim.sub(" ", line) + + #strip random punctuations and quotes + punct = re.compile(r"[\*#\\?@`~!\$%\^&\"\']") + line = punct.sub(" ", line) + + + #split camel case words into individual words + #pattern = re.compile('([A-Z][A-Z][a-z])|([a-z][A-Z])') + #with_camel = pattern.sub(lambda m: m.group()[:1] + " " + m.group()[1:], with_spacing) + + keywords = line.split() + + + #eliminate words shorter than 3 letters, and frequently used words + def eliminate(word): + return word.lower() not in ignore_words and len(word) > 2 + keywords = map(lambda k:k.lower(), filter(eliminate, keywords)) + return keywords + +if len(sys.argv) > 2: + if sys.argv[1] == "index": + index_commit(sys.argv[2], sys.argv[3]) + + if sys.argv[1] == "search": + search(sys.argv[2]) + diff --git a/templates/search/results.html b/templates/search/results.html index 4e21842..a39daa6 100644 --- a/templates/search/results.html +++ b/templates/search/results.html @@ -3,8 +3,33 @@ {% block content %}
 The search feature is still under construction. Got a suggestion for how search should behave? Let us know!
+
+Search completed in {{time}} seconds
+
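
As a reference for the parsing logic above, here is a standalone sketch of how the hunk-header regex in index_commit behaves. The regex and the "missing length defaults to 1" rule are copied from the diff; the sample headers and the parse_hunk_header helper are invented for illustration:

import re

# same pattern as in index_commit
hunk_pattern = re.compile(r"@@ -([\d]+),?([\d]*) \+([\d]+),?([\d]*) @@.*")

def parse_hunk_header(line):
    m = hunk_pattern.match(line)
    # git omits ",1" when a range is a single line, so an empty group means 1
    oldlen = int(m.group(2)) if m.group(2) else 1
    newlen = int(m.group(4)) if m.group(4) else 1
    return int(m.group(1)), oldlen, int(m.group(3)), newlen

print("old %d,%d -> new %d,%d" % parse_hunk_header("@@ -12,3 +12,7 @@ some context"))
# old 12,3 -> new 12,7
print("old %d,%d -> new %d,%d" % parse_hunk_header("@@ -5 +5,2 @@"))
# old 5,1 -> new 5,2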
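The line_offset bookkeeping is the subtlest part of index_commit: each entry records that every line at or after line_num has moved by len lines. Below is a toy, in-memory version of the shift that shift_older_lines performs in SQL; the shift helper and the offsets are made up for the example:

def shift(line_num, offsets):
    # like shift_older_lines: compare against the old line number,
    # accumulate the shifts separately (the role of the "temp" column)
    new_num = line_num
    for off in offsets:
        if line_num >= off["line_num"]:
            new_num += off["len"]
    return new_num

offsets = [{"line_num": 4, "len": 2},    # two lines inserted before old line 4
           {"line_num": 10, "len": -1}]  # one line deleted just above old line 10

print(shift(3, offsets))   # 3  -- above all edits, unchanged
print(shift(6, offsets))   # 8  -- pushed down by the two inserted lines
print(shift(12, offsets))  # 13 -- net shift of +2 - 1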
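Finally, a rough sketch of the tokenization done by get_valid_keywords, with an abridged stop-word list and an invented input line (the regexes and the length/stop-word filter are taken from the code above):

import re

ignore_words = ["the", "for"]  # abridged; the real list is in search.py

def get_valid_keywords(line):
    # delimiters and punctuation both become spaces, then we split
    line = re.sub(r"[\.,:;\[\]\(\)\-_\\/=\+\}\{><\|]", " ", line)
    line = re.sub(r"[\*#\\?@`~!\$%\^&\"\']", " ", line)
    # keep lowercased words of 3+ letters that are not stop words
    return [w.lower() for w in line.split()
            if w.lower() not in ignore_words and len(w) > 2]

print(get_valid_keywords("+The quick-brown fox, for *emphasis*!"))
# ['quick', 'brown', 'fox', 'emphasis']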