From 9c25cf586cfc55e8c7708531fc6a0387c60427b1 Mon Sep 17 00:00:00 2001 From: yzhou Date: Tue, 24 Jan 2012 01:37:31 -0500 Subject: [PATCH 1/2] Search working almost perfectly with postgres/sqlite. Missing pagination and the search page is not very pretty at all. Speed is also something to be desired --- search.py | 370 +++++++++++++++++++++++++ templates/search/results.html | 29 +- views/main.py | 51 +++- views/pages.py | 6 +- wiki/management/commands/buildindex.py | 23 ++ wiki/models/searchindex.py | 39 +++ 6 files changed, 512 insertions(+), 6 deletions(-) create mode 100755 search.py create mode 100755 wiki/management/commands/buildindex.py create mode 100755 wiki/models/searchindex.py diff --git a/search.py b/search.py new file mode 100755 index 0000000..67273c5 --- /dev/null +++ b/search.py @@ -0,0 +1,370 @@ +import sys +import subprocess +import re +import traceback +import os +import hashlib +from django.db import connection, transaction +import itertools +import time + +def index_entire_history(file_path, page_id, *args, **kwargs): + print "indexing %s - %s" % (file_path.split('/')[2] , file_path.split("/")[5]) + try: + file_path = file_path.strip('/') + + # not a git repo + if not os.path.exists("%s/.git" % file_path): + return + + + command = "cd \"%s\" && git log --reverse --format=format:\"%%H\"" % (file_path) + lines = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE).communicate()[0].splitlines() + except: + return (0, 1) + commit_count = 0 + fail_count = 0 + #index each commit starting from the earliest + for (i, line) in enumerate(lines): + try: + index_commit(file_path, page_id, commit=line) + + #progress bar lol + total_width = 60 + current_width = (i + 1) * total_width / len(lines) + sys.stdout.write("\r|" + ('-' * (current_width)).ljust(total_width) + "|" + "%d%%" % ((i + 1) * 100 / len(lines))) + + commit_count += 1 + except: + traceback.print_exc(file=sys.stdout) + fail_count = 1 + print "\nDone!" + return (commit_count, fail_count) + +#TODO: return total num of results, better ordering (tf-idf?), and COMMENT +def search(search_query, start): + results = [] + subqueries = get_valid_keywords(search_query) + con = connection.cursor() + subresults = [] + pages_hit = {} + for q in subqueries: + query = "\ + SELECT page,max(frequency * exp(-3*head)) as score FROM wiki_keyword where keyword = %s\ + GROUP BY page ORDER BY score DESC LIMIT 20 OFFSET %s\ + " + con.execute(query , (q, start)) + for (i, row) in enumerate(con.fetchall()): + page = row[0] + if page not in pages_hit: + pages_hit[page] = {"priority":1.0 + (10.0 - i) / 10, "words":[q], "id":page} + else: + pages_hit[page]["priority"] += 1.0 + (10.0 - i) / 10 + pages_hit[page]["words"].append(q) + pages = sorted(pages_hit.values(), key=lambda k: k["priority"], reverse=True)[:10] + commit_queries = [] + for page in pages: + commits_hit = {} + lines_hit = {} + for word in page["words"]: + con.execute("SELECT \"commit\" from wiki_keyword where page = %s and keyword = %s", (page["id"], word)) + for row in con.fetchall(): + if row[0] in commits_hit: + commits_hit[row[0]]["count"] += 1 + else: + commits_hit[row[0]] = {"commit": row[0], "count": 1} + con.execute("SELECT head_line_num from wiki_keywordlocation where head = 0 and page = %s and word = %s", (page["id"], word)) + for row in con.fetchall(): + if row[0] in lines_hit: + lines_hit[row[0]]["count"] += 1 + else: + lines_hit[row[0]] = {"line": row[0], "count": 1} + page["commits"] = sorted(commits_hit.values(), key=lambda k:k["count"], reverse=True) + page["lines"] = sorted(lines_hit.values(), key=lambda k:k["count"], reverse=True)[:3] + return pages + +@transaction.commit_manually() +def index_commit(file_path, page_id, *args, **kwargs): + start = time.clock() + #indexes the head by default, but can index past commits + commit = kwargs.get("commit", "HEAD") + + conn = kwargs.get("db", connection) + file_path = file_path.strip('/') + command = "cd \"%s\" && git show --format=format:\"%%H%%n%%ct\" %s" % (file_path , commit) + lines = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE).communicate()[0].splitlines() + + + commit = None + date = None + current_hunk = None + + + words_changed = {} + words_to_add = [] + + #we don't really care about what word was deleted, all the words in the line are deleted + lines_deleted = [] + + # used to calculate the positions of words in the older commit now in the newest version, with added/removed lines + line_offset = [] + + #parse the diff to index the new commit + for (i, line) in enumerate(lines): + + #first two lines are the commit and the date respectively + if i == 0: + commit = line + if i == 1: + date = line + + #a hunk header, we're starting a hunk, parse the header + if line[0:2] == "@@": + # hunk header is in the format of + # @@ -\d,\d +\d,\d @@ line-content + hunk_pattern = re.compile(r"@@ -([\d]+),?([\d]*) \+([\d]+),?([\d]*) @@.*") + m = hunk_pattern.match(line) + current_hunk = {} + current_hunk["oldstart"] = int(m.group(1)) + + #sometimes the range of the hunk is omitted because it's 1 + if len(m.group(2)): + current_hunk["oldlen"] = int(m.group(2)) + else: + current_hunk["oldlen"] = 1 + current_hunk["newstart"] = int(m.group(3)) + if len(m.group(4)): + current_hunk["newlen"] = int(m.group(4)) + else: + current_hunk["newlen"] = 1 + + #0 lines read for this current hunk + current_hunk["oldread"] = 0 + current_hunk["newread"] = 0 + + #not inside a hunk, and not a hunk header so no need to parse + if not current_hunk or len(line) == 0: + continue + + #inserted line + if line[0] == '+': + + #line number in the current commit + line_num = current_hunk["newstart"] + current_hunk["newread"] + old_line_num = current_hunk["oldstart"] + current_hunk["oldread"] + + #a start of a + block + #sometimes a blank line is put between every line of the hunk for some reason + prev_line = lines[i - 1] if len(lines[i - 1]) else lines[i - 2] + if prev_line[0] != line[0]: + line_offset.append({"line_num":old_line_num, "len": 1}) + #middle of a + block + else: + line_offset[-1]["len"] += 1 + + keywords = get_valid_keywords(line) + if len(keywords) == 0: + current_hunk["newread"] += 1 + continue + + for (word_pos, word) in enumerate(keywords): + hash = None + if word not in words_changed: + hash = "%d-%s-%s" % (page_id, hashlib.md5(word).hexdigest()[:20], commit[:10]) + + words_changed[word] = {"count":1, "hash":hash} + + else: + hash = words_changed[word]["hash"] + words_changed[word]["count"] += 1 + id = "%d-%d-%s-%d" % (page_id, word_pos, commit[:20], line_num) + words_to_add.append({"id":id, "word":word, "hash":hash, "line":line_num, "pos":word_pos}) + + current_hunk["newread"] += 1 + + #deleted line + elif line[0] == '-': + #line number in older version + line_num = current_hunk["oldstart"] + current_hunk["oldread"] + + #a start of a - block + prev_line = lines[i - 1] if len(lines[i - 1]) else lines[i - 2] + if prev_line[0] != line[0]: + #has to add one because the line is deleted, so every line starting from the line after it will + #be shifted + line_offset.append({"line_num":line_num + 1, "len":-1}) + #middle of a - block + else: + line_offset[-1]["len"] -= 1 + line_offset[-1]["line_num"] = line_num + 1 + + keywords = get_valid_keywords(line) + if len(keywords) == 0: + current_hunk["oldread"] += 1 + continue + + for (word_pos, word) in enumerate(keywords): + hash = None + if word not in words_changed: + hash = "%d-%s-%s" % (page_id, hashlib.md5(word).hexdigest()[:20], commit[:10]) + + words_changed[word] = {"count":-1, "hash":hash} + else: + words_changed[word]["count"] -= 1 + + lines_deleted.append(line_num) + + + current_hunk["oldread"] += 1 + + #unchanged line + elif line[0] == ' ': + line_num = current_hunk["newstart"] + current_hunk["newread"] + old_line_num = current_hunk["oldstart"] + current_hunk["oldread"] + + current_hunk["newread"] += 1 + current_hunk["oldread"] += 1 + + def split_seq(iterable, size): + it = iter(iterable) + item = list(itertools.islice(it, size)) + while item: + yield item + item = list(itertools.islice(it, size)) + + cursor = conn.cursor() + + try: + #add the changes to the word base in this commit, no positional information yet + insert_commit_changes(words_changed, page_id, commit, date, cursor) + for chunk in split_seq(lines_deleted, 100): + #the words in the deleted lines are no longer in the head + unflag_deleted(chunk, page_id, cursor) + #the position of the words in the unmodified lines is pushed to different positions + shift_older_lines(line_offset, page_id, cursor) + except: + transaction.rollback() + else: + #have to commit here so the new keywords don't collide with the old ones(which are shifted from the commit) + transaction.commit() + try: + #finally add the new words to the index + add_new_words(words_to_add, page_id, cursor) + except: + transaction.rollback() + else: + transaction.commit() + +def add_new_words(words, page, cursor): + values = [] + for word in words: + value = (word["id"], word["hash"], word["word"], page, word["line"], word["pos"], word["line"]) + values.append(value) + query = "INSERT INTO wiki_keywordlocation values (%s,%s,%s,%s,%s,%s,0,%s,0)" + try: + cursor.executemany(query, values) + except: + traceback.print_exc(file=sys.stdout) + raise Exception + +def shift_older_lines(offsets, page, cursor): + highest_affected = sys.maxint + #have to use temp because we're want to modify head_line_num based on the old value of head_line_num + cursor.execute("UPDATE wiki_keywordlocation SET temp = head_line_num WHERE page = %d and head = 0" % (page)) + for offset in offsets: + loc = offset["line_num"] + if loc < highest_affected: + highest_affected = loc + len = offset["len"] + query = "UPDATE wiki_keywordlocation SET temp = temp + %d WHERE page = %d and head_line_num >= %d and head = 0" % (len, page, loc) + try: + cursor.execute(query) + except: + + traceback.print_exc(file=sys.stdout) + raise Exception + cursor.execute("UPDATE wiki_keywordlocation SET head_line_num = temp WHERE page = %d and head_line_num >= %d and head = 0" % (page, highest_affected)) + +def unflag_deleted(lines, page, cursor): + query = "UPDATE wiki_keywordlocation SET head = head +1 WHERE page = " + str(page) + " and head_line_num = %s" + try: + cursor.executemany(query, [(k,) for k in lines]) + except: + print query + traceback.print_exc(file=sys.stdout) + raise Exception + + +def insert_commit_changes(words, page, commit, date, cursor): + start = time.clock() + values = [] + query = u"UPDATE wiki_keyword SET head = head + 1 WHERE page = " + str(page) + " and keyword = %s" + try: + cursor.executemany(query, [(k,) for k in words.keys()]) + except: + print query + traceback.print_exc(file=sys.stdout) + raise Exception + for word in words.keys(): + frequency = words[word]["count"] + value = (words[word]["hash"], page, word, commit, word, page, frequency, frequency, date) + values.append(value) + + query = u"INSERT INTO wiki_keyword values(%s,%s,%s,%s,COALESCE((SELECT frequency FROM wiki_keyword WHERE keyword = %s and page = %s and head = 1)+%s,%s),%s,0)" + try: + cursor.executemany(query, values) + except: + traceback.print_exc(file=sys.stdout) + raise Exception + + +ignore_words = ["the", "be", "to", "of", "and", "a", + "in", "that", "have", "I", "it", "for", + "not", "on", "with", "he", "as", "you", "why", + "do", "at", "this", "but", "his", "by", "where", + "from", "they", "we", "say", "her", "she", "name", + "or", "an", "will", "my", "one", "all", + "would", "there", "their", "what", "so", + "up", "out", "if", "about", "who", "get", + "which", "go", "me", "when", "make", "can", + "like", "time", "no", "just", "him", "know", + "take", "person", "into", "year", "your", + "good", "some", "could", "them", "see", "number", + "other", "than", "then", "now", "look", "only", + "come", "its", "over", "think", "also", "back", "has", + "after", "use", "two", "how", "our", "work", "usually", + "first", "well", "way", "even", "new", "want", "ago", "com", + "because", "any", "these", "give", "day", "are", "dont", "don" + "most", "us", "toc", "iff", "neg", "let", "lor"] + +def get_valid_keywords(line): + line = unicode(line) + #split on delimiters + delim = re.compile(r"[\.,:;\[\]\(\)\-_\\/=\+\}\{><\|]") + line = delim.sub(" ", line) + + #strip random punctuations and quotes + punct = re.compile(r"[\*#\\?@`~!\$%\^&\"\']") + line = punct.sub(" ", line) + + + #split camel case words into individual words + #pattern = re.compile('([A-Z][A-Z][a-z])|([a-z][A-Z])') + #with_camel = pattern.sub(lambda m: m.group()[:1] + " " + m.group()[1:], with_spacing) + + keywords = line.split() + + + #eliminate words shorter than 3 letters, and frequently used words + def eliminate(word): + return word.lower() not in ignore_words and len(word) > 2 + keywords = map(lambda k:k.lower(), filter(eliminate, keywords)) + return keywords + +if len(sys.argv) > 2: + if sys.argv[1] == "index": + index_commit(sys.argv[2], sys.argv[3]) + + if sys.argv[1] == "search": + search(sys.argv[2]) + diff --git a/templates/search/results.html b/templates/search/results.html index 4e21842..a39daa6 100644 --- a/templates/search/results.html +++ b/templates/search/results.html @@ -3,8 +3,33 @@ {% block content %}
-

Search results for {{ query }}

-

The search feature is still under construction. Got a suggestion for how search should behave? Let us know!

+
+ + +
+ Search completed in {{time}} seconds +
+
+
+ {% for result in results %} +
+

{{result.page_title}}

+
+ {{result.preview_text|safe}} +
+
Mentioned in commits + {% for commit in result.commits_mentioned%} + {{commit.name}}, + {% endfor %} + {% if result.commits_hidden > 0 %} + and {{result.commits_hidden}} other{{result.commits_hidden|pluralize}} + {% endif %} +
+
+
+
+ {% endfor %} +
{% endblock %} diff --git a/views/main.py b/views/main.py index aef43cd..92eeb71 100755 --- a/views/main.py +++ b/views/main.py @@ -6,9 +6,14 @@ from wiki.utils.users import validate_username from wiki.models.pages import Page from blog.models import BlogPost +from wiki.models.searchindex import Result from django.http import Http404 from urls import static_urls import os +import re +from time import clock +from search import search as search_module +from django.utils.html import escape # welcome is only set to true when called from register() # Triggers the display of some sort of welcome message @@ -193,9 +198,51 @@ def markdown(request): def search(request): if 'query' in request.GET: + start = clock() + results = search_module(request.GET["query"], 0) + parsed_results = [] + for result in results: + presult = Result() + page = Page.objects.get(pk=result["id"]) + presult.page_url = page.get_absolute_url() + presult.page_title = presult.page_url.replace("_", " ").strip("/").replace("/", " / ") + line_nums = [k["line"] for k in result["lines"]] + content = page.load_content().splitlines() + content_length = len(content) + preview_line_nums = [] + if len(line_nums): + preview_line_nums = line_nums[:2] + else: + preview_line_nums = [0] + + # get the surrounding lines for context + preview_lines = map(lambda k: escape(content[k - 1]), preview_line_nums) + preview = " ...
... ".join(preview_lines) + + #not working perfectly, suppose to contract long paragraphs to only show the parts containing the words + """ + if len(preview) > 200: + seg_length = 200 / len(result['words']) + p = re.compile("(([^ ]* (.){0,%d})(%s)(.{0,%d} [^ ]*) +)+" % (seg_length, "|".join(result['words']), seg_length), re.IGNORECASE) + shortened = [] + for match in p.finditer(preview): + shortened.append(match.group(0)) + preview = "..." + " ... ".join(shortened) + " ..." + """ + # highlight found words + p = re.compile("|".join(result['words']), re.IGNORECASE) + preview = p.sub(lambda m: "%s" % m.group(0), preview) + + presult.preview_text = preview + + for commit in result["commits"][:3]: + presult.commits_mentioned.append({"name":commit["commit"][:10], "url":(presult.page_url + "/commit/" + commit["commit"])}) + presult.commits_hidden = len(result["commits"]) - 3 if len(result["commits"]) > 3 else 0 + parsed_results.append(presult) data = { - 'title': 'Search results', - 'query': request.GET['query'] + 'query': request.GET['query'], + 'results':parsed_results, + 'time':"%0.4f" % (clock() - start) } return render(request, 'search/results.html', data) else: diff --git a/views/pages.py b/views/pages.py index 6e74592..9317fc5 100644 --- a/views/pages.py +++ b/views/pages.py @@ -12,6 +12,7 @@ from wiki.utils.currents import current_term, current_year from views.main import register from datetime import datetime +from search import index_commit def show(request, department, number, page_type, term, year, slug): course = get_object_or_404(Course, department=department, number=int(number)) @@ -104,6 +105,7 @@ def edit(request, department, number, page_type, term, year, slug): username = request.user.username message = request.POST['message'] page.save_content(request.POST['content'], message, username) + index_commit(page.get_filepath(), page.pk) data = { 'course': course, 'page': page, @@ -173,7 +175,7 @@ def create(request, department, number, page_type): except ValueError: pass # defaults to the current year data['current_exam_type'] = request.POST['exam_type'] if 'exam_type' in request.POST else '' - data['subject'] = request.POST['subject'] if 'subject' in request.POST else '' + data['subject'] = request.POST['subject'] if 'subject' in request.POST else '' data['content'] = request.POST['content'] data['message'] = request.POST['message'] @@ -187,7 +189,7 @@ def create(request, department, number, page_type): username = request.user.username email = request.user.email new_page.save_content(request.POST['content'], commit_message, username) - + index_commit(new_page.get_filepath(), new_page.pk) # Add the history item - should be done automatically one day course.add_event(page=new_page, user=request.user, action='created', message=commit_message) data['page'] = new_page diff --git a/wiki/management/commands/buildindex.py b/wiki/management/commands/buildindex.py new file mode 100755 index 0000000..61916a7 --- /dev/null +++ b/wiki/management/commands/buildindex.py @@ -0,0 +1,23 @@ +from django.core.management.base import BaseCommand, CommandError +from django.db import connection +from search import index_entire_history +from wiki.models.pages import Page +import time +class Command(BaseCommand): + def handle(self, *args, **options): + print "Clearing indices" + cursor = connection.cursor() + cursor.execute("DELETE FROM wiki_keyword") + cursor.execute("DELETE FROM wiki_keywordlocation") + print "Starting to index the pages" + start = time.clock() + success = 0 + fail = 0 + pages = Page.objects.all() + for page in pages: + (yay, no) = index_entire_history(page.get_filepath(), page.pk) + success += yay + fail += no + print "%d failures" % fail + print "Successfully indexed %d commits in %d files in %s seconds" % (success, len(pages), time.clock() - start) + diff --git a/wiki/models/searchindex.py b/wiki/models/searchindex.py new file mode 100755 index 0000000..88d8a64 --- /dev/null +++ b/wiki/models/searchindex.py @@ -0,0 +1,39 @@ +from django.db import models +from django.db.models import fields + +class Keyword(models.Model): + class Meta: + app_label = 'wiki' + unique_together = (("keyword", "commit", "page")) + hash = models.CharField(max_length=40, primary_key=True) + page = models.IntegerField() + keyword = models.CharField(max_length=120) #reasonable length for longest word + commit = models.CharField(max_length=100) + frequency = models.IntegerField(default=0) + date = models.IntegerField() + head = models.IntegerField() + +class KeywordLocation(models.Model): + class Meta: + app_label = 'wiki' + + k_id = models.CharField(max_length=32, primary_key=True) + keyword_hash = models.CharField(max_length=40) + word = models.CharField(max_length=120) + page = models.IntegerField() + line_num = models.IntegerField() + pos = models.IntegerField() + head = models.IntegerField() + head_line_num = models.IntegerField(null=True) + temp = models.IntegerField(null=True) + +#not actually a model +class Result(): + + def __init__(self): + self.page = None + self.preview_text = "" + self.commits_mentioned = [] + self.terms = [] + self.commits_hidden = 0 + From 3177bd3ac4640a703cf42cc5012c95e3a60daa19 Mon Sep 17 00:00:00 2001 From: hc5 Date: Thu, 26 Jan 2012 14:31:06 -0500 Subject: [PATCH 2/2] minor unicode bug fix --- search.py | 5 +++-- wiki/management/__init__.py | 0 wiki/management/commands/__init__.py | 0 wiki/models/__init__.py | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) create mode 100755 wiki/management/__init__.py create mode 100755 wiki/management/commands/__init__.py diff --git a/search.py b/search.py index 67273c5..3644ace 100755 --- a/search.py +++ b/search.py @@ -1,3 +1,4 @@ +# encoding: utf-8 import sys import subprocess import re @@ -7,7 +8,7 @@ from django.db import connection, transaction import itertools import time - +from django.utils.encoding import smart_str, smart_unicode def index_entire_history(file_path, page_id, *args, **kwargs): print "indexing %s - %s" % (file_path.split('/')[2] , file_path.split("/")[5]) try: @@ -338,7 +339,7 @@ def insert_commit_changes(words, page, commit, date, cursor): "most", "us", "toc", "iff", "neg", "let", "lor"] def get_valid_keywords(line): - line = unicode(line) + line = smart_str(line) #split on delimiters delim = re.compile(r"[\.,:;\[\]\(\)\-_\\/=\+\}\{><\|]") line = delim.sub(" ", line) diff --git a/wiki/management/__init__.py b/wiki/management/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/wiki/management/commands/__init__.py b/wiki/management/commands/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/wiki/models/__init__.py b/wiki/models/__init__.py index 097f6d5..70cf0bc 100755 --- a/wiki/models/__init__.py +++ b/wiki/models/__init__.py @@ -6,3 +6,4 @@ from history import * from users import * from series import * +from searchindex import * \ No newline at end of file