diff --git a/internal/engine/engine.go b/internal/engine/engine.go index 529d4e6..ece2c36 100644 --- a/internal/engine/engine.go +++ b/internal/engine/engine.go @@ -20,7 +20,7 @@ type Engine struct { indexMgr IndexManager // `library` is a "set" that stores documents names to avoid adding the same document multiple times. - library map[string]struct{} + library map[string]int // `processor` is used to process the tokens before adding them to the index and before querying the index, // it removes stop words and apply stemming and normalization to the tokens. @@ -34,19 +34,26 @@ func NewEngine(processor textprocessing.Processor, delimiterManager *tokenizer.D return &Engine{ docs: make([]*internal.Document, 0), indexMgr: idxMgr, - library: make(map[string]struct{}), + library: make(map[string]int), processor: processor, delimiterManager: delimiterManager, } } func (e *Engine) AddDocument(doc *internal.Document) { - if _, ok := e.library[doc.Name]; !ok { - doc.ID = e.GetNextDocID() + if _, ok := e.library[doc.GetFilePath()]; !ok { + logger.Info(EnginePrefix, "Adding document %s", doc.GetFilePath()) + + doc.ID = e.GetNextDocID() e.docs = append(e.docs, doc) - e.library[doc.DirectoryPath] = struct{}{} - e.parseDocument(doc) + e.library[doc.GetFilePath()] = doc.ID + } else { // document already exists, update the document by removing it from the index and re-adding it + logger.Warn(EnginePrefix, "Updating document %s", doc.GetFilePath()) + + doc.ID = e.library[doc.GetFilePath()] + e.indexMgr.Remove(doc.ID) } + e.parseDocument(doc) } func (e *Engine) ProcessToken(token string) string { diff --git a/internal/engine/index_manager.go b/internal/engine/index_manager.go index 6f6fab9..49a63fa 100644 --- a/internal/engine/index_manager.go +++ b/internal/engine/index_manager.go @@ -37,6 +37,20 @@ func (idx *IndexManager) PutSlice(key string, values []int) { } } +func (idx *IndexManager) Remove(docID int) { + logger.Info(IndexManagerPrefix, fmt.Sprintf("Removing document ID %d from index, A LINEAR SEARCH is performed.", docID)) + + numRemove := 0 + + for key := range idx.index { + if idx.index[key].Remove(docID) { + numRemove++ + } + } + + logger.Info(IndexManagerPrefix, fmt.Sprintf("Removed document ID %d, %d Keys are affected.", docID, numRemove)) +} + func (idx *IndexManager) Get(key string) ordered.OrderedStructure[int] { if _, ok := idx.index[key]; !ok { logger.Warn(IndexManagerPrefix, fmt.Sprintf("Key %s not found in index", key)) diff --git a/internal/structures/ordered/ordered_slice.go b/internal/structures/ordered/ordered_slice.go index 1d10904..03d6474 100644 --- a/internal/structures/ordered/ordered_slice.go +++ b/internal/structures/ordered/ordered_slice.go @@ -32,11 +32,11 @@ func NewOrderedSliceWithSlice[Entry constraints.Integer](slice []Entry) OrderedS return &OrderedSlice[Entry]{data: newSlice} } -func (o *OrderedSlice[Entry]) InsertSorted(entry Entry) { +func (o *OrderedSlice[Entry]) InsertSorted(entry Entry) bool { var idx = o.UpperBound(entry) if idx-1 >= 0 && o.data[idx-1] == entry { // neglect duplicates - return + return false } o.data = append(o.data, entry) @@ -45,6 +45,8 @@ func (o *OrderedSlice[Entry]) InsertSorted(entry Entry) { for i := idx + 1; i < len(o.data); i++ { o.data[i], swp = swp, o.data[i] } + + return true; } func (o *OrderedSlice[Entry]) BinarySearch(entry Entry) int { @@ -194,3 +196,17 @@ func (s1 *OrderedSlice[Entry]) Union(s2 OrderedStructure[Entry]) OrderedStructur return res } + +func (s *OrderedSlice[Entry]) Remove(entry Entry) bool { + if s == nil || s.IsEmpty() { + return false + } + + var idx = s.BinarySearch(entry) + if idx == -1 { + return false + } + + s.data = append(s.data[:idx], s.data[idx+1:]...) + return true +} diff --git a/internal/structures/ordered/ordered_structure.go b/internal/structures/ordered/ordered_structure.go index 9bd110c..36e186e 100644 --- a/internal/structures/ordered/ordered_structure.go +++ b/internal/structures/ordered/ordered_structure.go @@ -11,7 +11,8 @@ type SetOperations[Entry constraints.Ordered] interface { type OrderedStructure[Entry constraints.Ordered] interface { SetOperations[Entry] - InsertSorted(Entry) + InsertSorted(Entry) bool + Remove(Entry) bool GetLength() int IsEmpty() bool diff --git a/internal/structures/ordered/skip_pointer_list.go b/internal/structures/ordered/skip_pointer_list.go index 0d97c20..6f45dbd 100644 --- a/internal/structures/ordered/skip_pointer_list.go +++ b/internal/structures/ordered/skip_pointer_list.go @@ -84,23 +84,21 @@ func (s *SkipPointerList[Entry]) UpdateSkipPointers() { } } -func (s *SkipPointerList[Entry]) InsertSorted(entry Entry) { +func (s *SkipPointerList[Entry]) InsertSorted(entry Entry) bool { newNode := &SkipNode[Entry]{entry: entry} if s.head == nil { s.head = newNode s.tail = newNode s.size++ - return + return true } if s.head.entry > entry { - s.pushFront(entry) - return + return s.pushFront(entry) } if s.tail.entry < entry { - s.pushBack(entry) - return + return s.pushBack(entry) } curr := s.head @@ -114,7 +112,7 @@ func (s *SkipPointerList[Entry]) InsertSorted(entry Entry) { // neglect duplicates if curr.next != nil && curr.next.entry == entry { - return + return false } newNode.next = curr.next @@ -129,6 +127,8 @@ func (s *SkipPointerList[Entry]) InsertSorted(entry Entry) { if s.size >= (s.currBlockSize+1)*(s.currBlockSize+1) { s.UpdateSkipPointers() } + + return true } func (s *SkipPointerList[Entry]) GetLength() int { @@ -141,10 +141,9 @@ func (s *SkipPointerList[Entry]) IsEmpty() bool { // Used internally for `SetOperations` functions, // where it is guaranteed that the insertion is always sorted and added to the end -func (s *SkipPointerList[Entry]) pushBack(entry Entry) { +func (s *SkipPointerList[Entry]) pushBack(entry Entry) bool { if s.IsEmpty() { - s.InsertSorted(entry) - return + return s.InsertSorted(entry) } if s.tail.entry > entry { @@ -152,18 +151,19 @@ func (s *SkipPointerList[Entry]) pushBack(entry Entry) { } if s.tail.entry == entry { - return + return false } s.tail.next = &SkipNode[Entry]{entry: entry} s.tail = s.tail.next s.size++ + + return true } -func (s *SkipPointerList[Entry]) pushFront(entry Entry) { +func (s *SkipPointerList[Entry]) pushFront(entry Entry) bool { if s.IsEmpty() { - s.InsertSorted(entry) - return + return s.InsertSorted(entry) } if s.head.entry < entry { @@ -171,12 +171,14 @@ func (s *SkipPointerList[Entry]) pushFront(entry Entry) { } if s.head.entry == entry { - return + return false } newNode := &SkipNode[Entry]{entry: entry, next: s.head} s.head = newNode s.size++ + + return true } func (s *SkipPointerList[Entry]) At(index int) Entry { @@ -285,3 +287,37 @@ func (s1 *SkipPointerList[Entry]) Union(s2 OrderedStructure[Entry]) OrderedStruc return res } + +func (s *SkipPointerList[Entry]) Remove(entry Entry) bool { + if s.IsEmpty() { + return false + } + + if s.head.entry == entry { + s.head = s.head.next + s.size-- + return true + } + + curr := s.head + for curr.next != nil && curr.next.entry < entry { + if curr.skip != nil && curr.skip.entry < entry { + curr = curr.skip + } else { + curr = curr.next + } + } + + if curr.next == nil || curr.next.entry != entry { + return false + } + + curr.next = curr.next.next + s.size-- + + if curr.next == nil { + s.tail = curr + } + + return true +}