From 8a049c78813bfedfe9a865918dc4900eb4889b18 Mon Sep 17 00:00:00 2001 From: mike dupont Date: Mon, 30 Oct 2023 08:03:41 -0400 Subject: [PATCH 1/3] starting of the dumper --- Makefile | 2 ++ fts-lmdb.go | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 85 insertions(+), 2 deletions(-) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..237b6d0 --- /dev/null +++ b/Makefile @@ -0,0 +1,2 @@ +build: + go build -o microfts diff --git a/fts-lmdb.go b/fts-lmdb.go index 5572cb5..6739391 100644 --- a/fts-lmdb.go +++ b/fts-lmdb.go @@ -9,11 +9,12 @@ import ( "io/ioutil" "os" "regexp" + "runtime/debug" "sort" "strconv" "strings" "time" - + "github.com/AskAlexSharov/lmdb-go/lmdb" ) @@ -246,6 +247,7 @@ func cmdInfo(cfg *lmdbConfigStruct) { cfg.open(false) defer cfg.env.Close() cfg.view(func() { + cfg.debugInfo() if len(cfg.args) == 0 { cfg.totalInfo() } else if len(cfg.args) == 1 { @@ -254,6 +256,75 @@ func cmdInfo(cfg *lmdbConfigStruct) { }) } + +func (cfg *lmdbConfigStruct) debugInfo() { + first := true + cfg.iterate(cfg.gramDb, func(cur *lmdb.Cursor, k, v []byte) { + if first { + first = false + return + } + oids := oidListFor(v) + freq := oids.totalOids() + //grm1 := k[0] >> 8 + //grm2 := k[1] & 0xFF + + //fmt.Printf("%s%s", + // strconv.FormatUint(uint64(grm>>8), 16), + // strconv.FormatUint(uint64(grm&0xFF), 16)) + g1 := gramString(gram((int(k[0])<< 8)|int(k[1]))) + + fmt.Printf("%d\t%s\tDEBUG3\n", freq,g1) + // Skip system record + //if k == systemID { + // return + //} + + // Get gram value + + + // Get OID list + //oids := oidListFor(v) + + // Get frequency + + + // Track co-occurrence with other grams + //co := make(map[gram]int) + + // Iterate over all OIDs + // for _, oidBytes := range oids { + // for len(oidBytes) > 0 { + // oid, rest := getNumOrPanic(oidBytes) + // oidBytes = rest + + // // // Load chunk + // chunk := cfg.getChunk(cfg.oidKey(oid)) + + // // // Increment co-occurrence for each 
gram + // stra := escape(string(chunk.data)) + // // //for _, gram := range chunk.data { + // // //co[stra]++ + // // //for g, _ := range chunk.data { + // // //co[stra]++ + // fmt.Printf("DEBUG:%s", stra) + // // //} + // } + // } + + // Print results + //fmt.Printf("%s: %d", gramString(grm), freq) + + //for g, c := range co { + //fmt.Printf(" %s:%d", gramString(g), c) + //} + + //fmt.Println() + + }) +} + + func (cfg *lmdbConfigStruct) totalInfo() { if cfg.groups { cfg.iterate(cfg.groupNameDb, func(cur *lmdb.Cursor, k []byte, v []byte) { @@ -344,6 +415,10 @@ func (cfg *lmdbConfigStruct) displayGrams(chunks float64) { cfg.iterate(cfg.chunkDb, func(cur *lmdb.Cursor, k, v []byte) { totalBytes += len(k) + len(v) chunkBytes += len(k) + len(v) + + chunk := decodeChunk(v) + //key := decodeChunk(k) + fmt.Printf("DEBUG CHUNK STR:%s\tVALUE:%s\tKEY:%s\n",k,v, chunk) }) first := true cfg.iterate(cfg.gramDb, func(cur *lmdb.Cursor, k, v []byte) { @@ -352,6 +427,11 @@ func (cfg *lmdbConfigStruct) displayGrams(chunks float64) { return } oids := oidListFor(v) + chunk := decodeChunk(v) + for oid := range oids { + fmt.Printf("DEBUG STR:%s\tVALUE:%s\tOID:%d\n",k,v,oid) + fmt.Printf("DEBUG2 STR:%s\tVALUE:%s\tOID:%d\n",k,chunk,oid) + } totalBytes += len(k) + len(v) gramBytes += len(k) + len(v) oidTot := oids.totalOids() @@ -494,7 +574,7 @@ func cmdChunk(cfg *lmdbConfigStruct) { _, err := hex.Decode(grams, []byte(cfg.args[1])) check(err) for i := 0; i < len(grams); i += 2 { - cfg.addGramEntry(gram((int(grams[i])<<8)|int(grams[i+1])), oid, d) + cfg.addGramEntry(gram((int(grams[i])<< 8)|int(grams[i+1])), oid, d) } } else { grams := strings.Split(cfg.args[1], cfg.delimiter) @@ -1762,6 +1842,7 @@ func getCountedBytes(bytes []byte) (result []byte, rest []byte) { func getNumOrPanic(bytes []byte) (uint64, []byte) { result, bytes, err := getNum(bytes) if err != nil { + debug.PrintStack() exitError("End of entry while reading number", ERROR) } return result, bytes From 
21f3308e7567597c4f688b98b57428509cebae57 Mon Sep 17 00:00:00 2001 From: mike dupont Date: Mon, 30 Oct 2023 09:27:04 -0400 Subject: [PATCH 2/3] now it is dumping out the context of the find --- Makefile | 8 +++++ fts-lmdb.go | 87 +++++++++++++++++++++++------------------------------ 2 files changed, 45 insertions(+), 50 deletions(-) diff --git a/Makefile b/Makefile index 237b6d0..0c4f30c 100644 --- a/Makefile +++ b/Makefile @@ -1,2 +1,10 @@ build: go build -o microfts + +mkindex: + rm index + ./microfts create index + find -type f -name \*.go |xargs ./microfts input index + +test: build + ./microfts info -grams index >debug.txt diff --git a/fts-lmdb.go b/fts-lmdb.go index 6739391..8996729 100644 --- a/fts-lmdb.go +++ b/fts-lmdb.go @@ -14,7 +14,7 @@ import ( "strconv" "strings" "time" - + "github.com/AskAlexSharov/lmdb-go/lmdb" ) @@ -266,64 +266,51 @@ func (cfg *lmdbConfigStruct) debugInfo() { } oids := oidListFor(v) freq := oids.totalOids() - //grm1 := k[0] >> 8 - //grm2 := k[1] & 0xFF - - //fmt.Printf("%s%s", - // strconv.FormatUint(uint64(grm>>8), 16), - // strconv.FormatUint(uint64(grm&0xFF), 16)) g1 := gramString(gram((int(k[0])<< 8)|int(k[1]))) - fmt.Printf("%d\t%s\tDEBUG3\n", freq,g1) - // Skip system record - //if k == systemID { - // return - //} - - // Get gram value - - // Get OID list - //oids := oidListFor(v) - - // Get frequency - - - // Track co-occurrence with other grams - //co := make(map[gram]int) - - // Iterate over all OIDs - // for _, oidBytes := range oids { - // for len(oidBytes) > 0 { - // oid, rest := getNumOrPanic(oidBytes) - // oidBytes = rest - - // // // Load chunk - // chunk := cfg.getChunk(cfg.oidKey(oid)) + for _, oidBytes := range oids { + for len(oidBytes) > 0 { + oid, rest := getNumOrPanic(oidBytes) + oidBytes = rest + chunk := cfg.getChunk(cfg.oidKey(oid)) + data := chunk.data + lineNo, data := getNumOrPanic(data) + start, data := getNumOrPanic(data) + _, data = getNumOrPanic(data) + strStart, data := getNumOrPanic(data) + strLen, 
_ := getNumOrPanic(data) + gid := chunk.gid + group := cfg.getGroupWithGid(gid) + name := group.groupName + contents := readFile(name) + + start1 := int(strStart) + end := start1 + int(strLen) + text := contents[start1:end] + + // Print + fmt.Printf("DEBUG4: freq:%d token:%s oid:%n start:%d len:%d line:%d start:%d file:%s text:%s\n", + freq,g1, + oid, strStart, strLen,int(lineNo), int(start), name, escape(text)) - // // // Increment co-occurrence for each gram - // stra := escape(string(chunk.data)) - // // //for _, gram := range chunk.data { - // // //co[stra]++ - // // //for g, _ := range chunk.data { - // // //co[stra]++ - // fmt.Printf("DEBUG:%s", stra) - // // //} - // } - // } - + /// now we want to tokenize the results + + } + } + // Print results //fmt.Printf("%s: %d", gramString(grm), freq) - + //for g, c := range co { - //fmt.Printf(" %s:%d", gramString(g), c) + //fmt.Printf(" %s:%d", gramString(g), c) //} //fmt.Println() - + }) } - + func (cfg *lmdbConfigStruct) totalInfo() { if cfg.groups { @@ -418,7 +405,7 @@ func (cfg *lmdbConfigStruct) displayGrams(chunks float64) { chunk := decodeChunk(v) //key := decodeChunk(k) - fmt.Printf("DEBUG CHUNK STR:%s\tVALUE:%s\tKEY:%s\n",k,v, chunk) + fmt.Printf("DEBUG CHUNK STR:%s\tVALUE:%s\tKEY:%s gid:%d count:%n char:%s\n",k,v, chunk.gid, chunk.gramCount, rune(chunk.data[0])) }) first := true cfg.iterate(cfg.gramDb, func(cur *lmdb.Cursor, k, v []byte) { @@ -427,11 +414,11 @@ func (cfg *lmdbConfigStruct) displayGrams(chunks float64) { return } oids := oidListFor(v) - chunk := decodeChunk(v) + chunk := decodeChunk(v) for oid := range oids { fmt.Printf("DEBUG STR:%s\tVALUE:%s\tOID:%d\n",k,v,oid) fmt.Printf("DEBUG2 STR:%s\tVALUE:%s\tOID:%d\n",k,chunk,oid) - } + } totalBytes += len(k) + len(v) gramBytes += len(k) + len(v) oidTot := oids.totalOids() From b15a53320b1d428ed43eb1504f39d9e0c52584bc Mon Sep 17 00:00:00 2001 From: mike dupont Date: Mon, 30 Oct 2023 10:21:18 -0400 Subject: [PATCH 3/3] now it is printing the context 
of grams, this is good enough to start to analyse --- fts-lmdb.go | 61 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 13 deletions(-) diff --git a/fts-lmdb.go b/fts-lmdb.go index 8996729..a0af4cd 100644 --- a/fts-lmdb.go +++ b/fts-lmdb.go @@ -267,7 +267,7 @@ func (cfg *lmdbConfigStruct) debugInfo() { oids := oidListFor(v) freq := oids.totalOids() g1 := gramString(gram((int(k[0])<< 8)|int(k[1]))) - + for _, oidBytes := range oids { for len(oidBytes) > 0 { @@ -280,7 +280,7 @@ func (cfg *lmdbConfigStruct) debugInfo() { _, data = getNumOrPanic(data) strStart, data := getNumOrPanic(data) strLen, _ := getNumOrPanic(data) - gid := chunk.gid + gid := chunk.gid group := cfg.getGroupWithGid(gid) name := group.groupName contents := readFile(name) @@ -289,11 +289,46 @@ func (cfg *lmdbConfigStruct) debugInfo() { end := start1 + int(strLen) text := contents[start1:end] - // Print - fmt.Printf("DEBUG4: freq:%d token:%s oid:%n start:%d len:%d line:%d start:%d file:%s text:%s\n", - freq,g1, - oid, strStart, strLen,int(lineNo), int(start), name, escape(text)) + // + allgrams := grams(true,text) + itemsBefore := []string{} + itemsAfter := []string{} + + // Initialize a variable to track the number of items before and after. 
+ itemsCountBefore := 0 + itemsCountAfter := 0 + found := 0 + for grm := range allgrams { + + cur := gramString(grm) + if (g1 == cur) { + found = 1 + }else { + if (found == 1 ) { + itemsAfter = append(itemsAfter, cur) + itemsCountAfter++ + } else { + itemsBefore = append(itemsBefore, cur) + itemsCountBefore++ + } + } + //oids := cfg.getGram(grm) + // Print + + } + + fmt.Printf("DEBUG4: freq:%d token:%s oid:%n start:%d len:%d line:%d start:%d file:%s text:%s gram:%s afer:%s\n", + freq,g1, + oid, + strStart, + strLen, + int(lineNo), + int(start), + name, + escape(text), + itemsBefore, itemsAfter) + /// now we want to tokenize the results } @@ -403,9 +438,9 @@ func (cfg *lmdbConfigStruct) displayGrams(chunks float64) { totalBytes += len(k) + len(v) chunkBytes += len(k) + len(v) - chunk := decodeChunk(v) + //chunk := decodeChunk(v) //key := decodeChunk(k) - fmt.Printf("DEBUG CHUNK STR:%s\tVALUE:%s\tKEY:%s gid:%d count:%n char:%s\n",k,v, chunk.gid, chunk.gramCount, rune(chunk.data[0])) + //fmt.Printf("DEBUG CHUNK STR:%s\tVALUE:%s\tKEY:%s gid:%d count:%n char:%s\n",k,v, chunk.gid, chunk.gramCount, rune(chunk.data[0])) }) first := true cfg.iterate(cfg.gramDb, func(cur *lmdb.Cursor, k, v []byte) { @@ -414,11 +449,11 @@ func (cfg *lmdbConfigStruct) displayGrams(chunks float64) { return } oids := oidListFor(v) - chunk := decodeChunk(v) - for oid := range oids { - fmt.Printf("DEBUG STR:%s\tVALUE:%s\tOID:%d\n",k,v,oid) - fmt.Printf("DEBUG2 STR:%s\tVALUE:%s\tOID:%d\n",k,chunk,oid) - } + //chunk := decodeChunk(v) + //for oid := range oids { + //fmt.Printf("DEBUG STR:%s\tVALUE:%s\tOID:%d\n",k,v,oid) + //fmt.Printf("DEBUG2 STR:%s\tVALUE:%s\tOID:%d\n",k,chunk,oid) + //} totalBytes += len(k) + len(v) gramBytes += len(k) + len(v) oidTot := oids.totalOids()