tfidf/main.go at main · JacobMcKenzieSmarty/tfidf · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
package main

import (
	"fmt"
	"log"

	"tfidf/data"
	"tfidf/pipeline"
)

// main runs an end-to-end TF-IDF search demo: it loads a document corpus
// from a CSV file, builds per-document TF-IDF vectors and an inverted index,
// scores documents against a hard-coded query, and prints one line per
// returned score.
//
// If the corpus cannot be loaded, the error is logged together with the
// corpus path and the process exits with a non-zero status.
func main() {
	// Relative path to the corpus file handed to the loader.
	const corpusPath = "data/corpus.csv"

	// Document Corpus
	// A small inline corpus that was used in place of the CSV during
	// development; kept for reference.
	//docs := []model.Document{
	//	//{0, "apple orange banana"},
	//	//{1, "banana apple"},
	//	//{2, "computer science and data"},
	//	{0, "1848 N 680 W OREM"},
	//	{1, "3898 W 5535 S OREM"},
	//	{2, "3845 W 5400 S SANDY"},
	//}

	docs, err := data.LoadDocumentsFromCSV(corpusPath)
	if err != nil {
		// A missing or unreadable corpus is an environment problem, not a
		// programmer bug, so report it plainly instead of panicking.
		log.Fatalf("loading corpus %q: %v", corpusPath, err)
	}

	// Step 1: Preprocessing Data
	// Build the vocabulary, per-document term frequencies and document
	// frequencies, then derive IDF weights and the documents' TF-IDF vectors.
	vocab, docTFs, df := pipeline.BuildVocabAndTermFrequenciesAndDocumentFrequency(docs)
	idf := pipeline.ComputeIDF(df, len(docs))
	docVecs := pipeline.BuildTFIDFVectors(docTFs, idf)

	// Step 2: Indexing (Inverted-Index)
	invertedIndex := pipeline.MakeInvertedIndex(docTFs)

	// Step 3: Query Processing
	// The query is vectorized with the corpus vocabulary and IDF weights; the
	// inverted index is also passed in and a candidate set comes back
	// alongside the query vector.
	//query := "3845 S"
	query := "space shuttle orbit"
	queryVec, candidates := pipeline.BuildQueryTFIDFVector(query, vocab, idf, invertedIndex)

	// Step 4: Scoring
	scores := pipeline.ScoreDocuments(queryVec, docVecs, candidates)

	// Step 5: Returning Results
	// Scores are printed in the order ScoreDocuments returned them (presumably
	// best match first; confirm against the pipeline package).
	// NOTE(review): docs is indexed directly by DocID, which assumes each
	// document's ID equals its position in docs; verify against the loader.
	for rank, score := range scores {
		doc := docs[score.DocID]
		fmt.Printf("Rank %d: Doc %d (score: %.4f) : %s :   %s\n", rank+1, score.DocID, score.Value, doc.Text, doc.Category)
	}
}