-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsnippet.go
More file actions
138 lines (124 loc) · 3.81 KB
/
snippet.go
File metadata and controls
138 lines (124 loc) · 3.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
package goreason
import (
"strings"
"unicode"
)
// snippetMaxLen is the approximate maximum length of a snippet.
//
// extractSnippet compares it against len() of the snippet text, so the unit is
// bytes; that equals the character count only for ASCII content. The limit only
// decides whether a neighbouring sentence may be appended to the best sentence:
// a single best sentence that is longer than this is still returned in full.
const snippetMaxLen = 300
// extractSnippet picks the one or two sentences of content that share the most
// significant words with answerWords.
//
// content is split with snippetSplitSentences and every sentence is scored by
// the number of its distinct significantWords that are set in answerWords. The
// highest-scoring sentence wins (the earliest one on ties). If a directly
// adjacent sentence also scores above zero it is joined to the winner with a
// single space, in document order, provided the joined text is at most
// snippetMaxLen bytes; the following sentence is preferred unless the preceding
// one scores strictly higher.
//
// It returns "" when content or answerWords is empty, when no sentences are
// found, or when no sentence shares a word with answerWords.
func extractSnippet(content string, answerWords map[string]bool) string {
	if content == "" || len(answerWords) == 0 {
		return ""
	}

	parts := snippetSplitSentences(content)
	if len(parts) == 0 {
		return ""
	}

	// hits[i] is the overlap score of parts[i]; top tracks the first sentence
	// holding the maximum score seen so far.
	hits := make([]int, len(parts))
	top := 0
	for i, sentence := range parts {
		for word := range significantWords(sentence) {
			if answerWords[word] {
				hits[i]++
			}
		}
		if hits[i] > hits[top] {
			top = i
		}
	}
	if hits[top] == 0 {
		return ""
	}

	snippet := parts[top]
	if len(snippet) >= snippetMaxLen || len(parts) == 1 {
		return snippet
	}

	// Choose the better-scoring neighbour. The next sentence is examined first,
	// so the previous one only wins with a strictly higher score.
	neighbor, neighborHits := -1, 0
	if next := top + 1; next < len(parts) && hits[next] > neighborHits {
		neighbor, neighborHits = next, hits[next]
	}
	if prev := top - 1; prev >= 0 && hits[prev] > neighborHits {
		neighbor = prev
	}
	if neighbor < 0 {
		return snippet
	}

	// Keep the two sentences in their original order.
	var joined string
	if neighbor < top {
		joined = parts[neighbor] + " " + snippet
	} else {
		joined = snippet + " " + parts[neighbor]
	}
	if len(joined) > snippetMaxLen {
		return snippet
	}
	return joined
}
// significantWords builds the set of distinct tokens in text that are worth
// matching on.
//
// text is lowercased and split on every rune that is neither a letter nor a
// digit. A token is kept when it is at least 4 bytes long (len of the string,
// which equals the character count for ASCII words) and is not listed in
// stopWords. The returned map is never nil; every stored value is true.
func significantWords(text string) map[string]bool {
	isSeparator := func(r rune) bool {
		return !(unicode.IsLetter(r) || unicode.IsDigit(r))
	}
	tokens := strings.FieldsFunc(strings.ToLower(text), isSeparator)

	set := make(map[string]bool)
	for _, tok := range tokens {
		if len(tok) < 4 || stopWords[tok] {
			continue
		}
		set[tok] = true
	}
	return set
}
// snippetSplitSentences splits text into trimmed, non-empty sentences.
//
// A sentence ends at '.', '?' or '!' when that character is the last rune of
// text or is immediately followed by whitespace. Whitespace means any rune for
// which unicode.IsSpace reports true, so "\r\n" line endings and non-breaking
// spaces end a sentence just like a plain space does. A terminator followed by
// a non-space rune (as in "v1.2" or "example.com") does not split. Trailing
// text without a terminator is returned as the final sentence.
//
// It returns nil when text holds nothing but whitespace.
func snippetSplitSentences(text string) []string {
	var sentences []string
	runes := []rune(text)
	start := 0

	// flush records runes[start:end] as a sentence (if anything is left after
	// trimming) and moves start to the beginning of the next sentence.
	flush := func(end int) {
		if s := strings.TrimSpace(string(runes[start:end])); s != "" {
			sentences = append(sentences, s)
		}
		start = end
	}

	for i, r := range runes {
		if r != '.' && r != '?' && r != '!' {
			continue
		}
		if i+1 == len(runes) || unicode.IsSpace(runes[i+1]) {
			flush(i + 1)
		}
	}
	// Whatever follows the last boundary is an unterminated final sentence.
	flush(len(runes))
	return sentences
}
// stopWords lists common English function words that significantWords drops
// before matching. Entries are lowercase and kept in alphabetical order.
var stopWords = map[string]bool{
	"about":   true,
	"after":   true,
	"also":    true,
	"been":    true,
	"before":  true,
	"being":   true,
	"between": true,
	"both":    true,
	"could":   true,
	"does":    true,
	"each":    true,
	"from":    true,
	"have":    true,
	"into":    true,
	"just":    true,
	"more":    true,
	"most":    true,
	"only":    true,
	"other":   true,
	"over":    true,
	"same":    true,
	"should":  true,
	"some":    true,
	"such":    true,
	"than":    true,
	"that":    true,
	"their":   true,
	"them":    true,
	"then":    true,
	"there":   true,
	"these":   true,
	"they":    true,
	"this":    true,
	"those":   true,
	"very":    true,
	"were":    true,
	"what":    true,
	"when":    true,
	"where":   true,
	"which":   true,
	"will":    true,
	"with":    true,
	"would":   true,
	"your":    true,
}