go-reason/snippet.go at main · bbiangul/go-reason · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
package goreason

import (
	"strings"
	"unicode"
)

// snippetMaxLen is the approximate maximum character length for a snippet.
const snippetMaxLen = 300

// extractSnippet returns the 1-2 most relevant sentences from content based on
// word overlap with answerWords. Returns empty string if no good match found.
func extractSnippet(content string, answerWords map[string]bool) string {
	if len(answerWords) == 0 || content == "" {
		return ""
	}

	sentences := snippetSplitSentences(content)
	if len(sentences) == 0 {
		return ""
	}

	// Score each sentence by overlap with answer words.
	type scored struct {
		text  string
		score int
		index int
	}
	scoredSentences := make([]scored, len(sentences))
	for i, s := range sentences {
		words := significantWords(s)
		overlap := 0
		for w := range words {
			if answerWords[w] {
				overlap++
			}
		}
		scoredSentences[i] = scored{text: s, score: overlap, index: i}
	}

	// Find the best sentence.
	bestIdx := 0
	bestScore := scoredSentences[0].score
	for i, s := range scoredSentences {
		if s.score > bestScore {
			bestScore = s.score
			bestIdx = i
		}
	}

	if bestScore == 0 {
		return ""
	}

	result := scoredSentences[bestIdx].text

	// Try to add the next-best adjacent sentence if it fits within the limit.
	if len(result) < snippetMaxLen && len(scoredSentences) > 1 {
		// Prefer the adjacent sentence (next or previous) with the highest score.
		candidateIdx := -1
		candidateScore := 0
		for _, delta := range []int{1, -1} {
			adj := bestIdx + delta
			if adj >= 0 && adj < len(scoredSentences) && scoredSentences[adj].score > candidateScore {
				candidateScore = scoredSentences[adj].score
				candidateIdx = adj
			}
		}
		if candidateIdx >= 0 && candidateScore > 0 {
			combined := result + " " + scoredSentences[candidateIdx].text
			if candidateIdx < bestIdx {
				combined = scoredSentences[candidateIdx].text + " " + result
			}
			if len(combined) <= snippetMaxLen {
				result = combined
			}
		}
	}

	return result
}

// significantWords returns the set of lowercased words >= 4 characters,
// excluding common stop words.
func significantWords(text string) map[string]bool {
	words := make(map[string]bool)
	for _, w := range strings.FieldsFunc(strings.ToLower(text), func(r rune) bool {
		return !unicode.IsLetter(r) && !unicode.IsDigit(r)
	}) {
		if len(w) >= 4 && !stopWords[w] {
			words[w] = true
		}
	}
	return words
}

// snippetSplitSentences splits text into sentences at period/question/exclamation
// boundaries followed by whitespace or end of string.
func snippetSplitSentences(text string) []string {
	var sentences []string
	var cur strings.Builder

	runes := []rune(text)
	for i := 0; i < len(runes); i++ {
		cur.WriteRune(runes[i])
		if runes[i] == '.' || runes[i] == '?' || runes[i] == '!' {
			if i+1 >= len(runes) || runes[i+1] == ' ' || runes[i+1] == '\n' || runes[i+1] == '\t' {
				s := strings.TrimSpace(cur.String())
				if s != "" {
					sentences = append(sentences, s)
				}
				cur.Reset()
			}
		}
	}
	if cur.Len() > 0 {
		s := strings.TrimSpace(cur.String())
		if s != "" {
			sentences = append(sentences, s)
		}
	}
	return sentences
}

// stopWords is a set of common English stop words to exclude from matching.
var stopWords = map[string]bool{
	"that": true, "this": true, "with": true, "from": true,
	"have": true, "been": true, "were": true, "they": true,
	"their": true, "will": true, "would": true, "could": true,
	"should": true, "about": true, "which": true, "there": true,
	"these": true, "those": true, "then": true, "than": true,
	"them": true, "what": true, "when": true, "where": true,
	"your": true, "more": true, "some": true, "such": true,
	"only": true, "also": true, "very": true, "just": true,
	"into": true, "over": true, "each": true, "does": true,
	"most": true, "after": true, "before": true, "other": true,
	"being": true, "same": true, "both": true, "between": true,
}