diff --git a/benchmark/bfs.go b/benchmark/bfs.go new file mode 100644 index 0000000..29897a7 --- /dev/null +++ b/benchmark/bfs.go @@ -0,0 +1,60 @@ +package main + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "time" +) + +type Site struct { + Title string `json:"title"` + Body string `json:"body"` + Links []int `json:"links"` + ParentSite string `json:"parent_site"` + SiteURL string `json:"site_url"` + LinkURL []string `json:"link_urls"` + Slug string `json:"slug"` +} + +func main() { + // fmt.Println("Hello, World!") + jsonFile, err := os.Open("../site/data/site_data.json") + if err != nil { + fmt.Println(err) + } + // fmt.Println("opened jsonfile") + + defer jsonFile.Close() + byteValue, _ := ioutil.ReadAll(jsonFile) + var sites []Site + json.Unmarshal(byteValue, &sites) + + var visited [1000000]int + t1 := time.Now() + for i := 0; i < len(sites); i++ { + if visited[i] == 0 { + queue := []int{i} + queueBeg := 0 + for queueBeg < len(queue) { + + for size := len(queue) - queueBeg; size > 0; size-- { + front := queue[queueBeg] + queueBeg++ + if visited[front] == 1 { + continue + } + visited[front] = 1 + // fmt.Println("visiting", front) + for j := 0; j < len(sites[front].Links); j++ { + queue = append(queue, sites[front].Links[j]) + } + } + } + } + } + t2 := time.Now() + elapsed := t2.Sub(t1) + fmt.Println(elapsed) +} diff --git a/benchmark/dfs.go b/benchmark/dfs.go new file mode 100644 index 0000000..dd8d4ea --- /dev/null +++ b/benchmark/dfs.go @@ -0,0 +1,58 @@ +package main + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "time" +) + +type Site struct { + Title string `json:"title"` + Body string `json:"body"` + Links []int `json:"links"` + ParentSite string `json:"parent_site"` + SiteURL string `json:"site_url"` + LinkURL []string `json:"link_urls"` + Slug string `json:"slug"` +} + +func main() { + // fmt.Println("Hello, World!") + jsonFile, err := os.Open("../site/data/site_data.json") + if err != nil { + fmt.Println(err) + } + // fmt.Println("opened jsonfile") + + defer jsonFile.Close() + byteValue, _ := ioutil.ReadAll(jsonFile) + var sites []Site + json.Unmarshal(byteValue, &sites) + + var visited [1000000]int + var stack []int + t1 := time.Now() + for i := 0; i < len(sites); i++ { + if visited[i] == 0 { + stack = append(stack, i) + for len(stack) > 0 { + top := stack[len(stack)-1] + // pop here + stack = stack[:len(stack)-1] + if visited[top] == 0 { + visited[top] = 1 + // fmt.Println("visiting", top) + for j := 0; j < len(sites[top].Links); j++ { + stack = append(stack, sites[top].Links[j]) + } + } + } + + } + } + t2 := time.Now() + elapsed := t2.Sub(t1) + fmt.Println(elapsed) +} diff --git a/benchmark/map.go b/benchmark/map.go new file mode 100644 index 0000000..4238f8a --- /dev/null +++ b/benchmark/map.go @@ -0,0 +1,165 @@ +package main + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "regexp" + "runtime" + "strings" + //"time" +) + +type Site struct { + Title string `json:"title"` + Body string `json:"body"` + Links []int `json:"links"` + ParentSite string `json:"parent_site"` + SiteURL string `json:"site_url"` + LinkURL []string `json:"link_urls"` + Slug string `json:"slug"` +} + +const ( + ALBHABET_SIZE = 26 +) + +/* +type trieNode struct { + childrens [ALBHABET_SIZE]*trieNode + isWordEnd bool +} + +type trie struct { + root *trieNode +} + +func initTrie() *trie { + return &trie{ + root: &trieNode{}, + } +} + +func (t *trie) insert(word string) { + wordLength := len(word) + current := t.root + for i := 0; i < wordLength; i++ { + index := word[i] - 'a' + if current.childrens[index] == nil { + current.childrens[index] = &trieNode{} + } + current = current.childrens[index] + } + current.isWordEnd = true +} + +func (t *trie) find(word string) bool { + wordLength := len(word) + current := t.root + for i := 0; i < wordLength; i++ { + index := word[i] - 'a' + if current.childrens[index] == nil { + return false + } + current = current.childrens[index] + } + if current.isWordEnd { + return true + } + return false +} +*/ +func PrintMemUsage() { + var m runtime.MemStats + runtime.ReadMemStats(&m) + // For info on each, see: https://golang.org/pkg/runtime/#MemStats + fmt.Printf("HeapAlloc = %v MiB", bToMb(m.HeapAlloc)) + fmt.Printf("\tTotalAlloc = %v MiB", bToMb(m.TotalAlloc)) + fmt.Printf("\tSys = %v MiB", bToMb(m.Sys)) + fmt.Printf("\tNumGC = %v\n", m.NumGC) +} + +func bToMb(b uint64) uint64 { + return b / 1024 / 1024 +} + +func main() { + jsonFile, err := os.Open("../site/data/site_data.json") + if err != nil { + fmt.Println(err) + } + + defer jsonFile.Close() + byteValue, _ := ioutil.ReadAll(jsonFile) + var sites []Site + json.Unmarshal(byteValue, &sites) + + myMap := make(map[string]bool) + // trie := initTrie() + //var sum time.Duration + + //no_of_words, err := strconv.Atoi(os.Args[1]) + //k := 0 + //fl := 0 + runtime.GC() + PrintMemUsage() + for i := 0; i < len(sites); i++ { + sentence := strings.Split(sites[i].Body, " ") + for j := 0; j < len(sentence); j++ { + sentence[j] = strings.ToLower(sentence[j]) + matched, _ := regexp.MatchString(`^[a-z]*$`, sentence[j]) + // fmt.Println(matched, sentence[j]) + // t1 := time.Now() + if matched { + //trie.insert(sentence[j]) + // myMap[sentence[j]] = append(myMap[sentence[j]], sites[i].SiteURL) + myMap[sentence[j]] = true + //k++ + } + /*if k >= no_of_words+100 { + fl = 1 + break + } + */ + // t2 := time.Now() + // elapsed := t2.Sub(t1) + // sum += elapsed + } + } + runtime.GC() + PrintMemUsage() + + // t1 := time.Now() + /* + k = 0 + fl = 0 + for i := 0; i < len(sites); i++ { + sentence := strings.Split(sites[i].Body, " ") + for j := 0; j < len(sentence); j++ { + sentence[j] = strings.ToLower(sentence[j]) + matched, _ := regexp.MatchString(`^[a-z]*$`, sentence[j]) + if matched { + t1 := time.Now() + if myMap[sentence[j]] { + } + t2 := time.Now() + elapsed := t2.Sub(t1) + sum += elapsed + k++ + if k >= no_of_words { + fl = 1 + break + } + } + } + if fl == 1 { + break + } + } + */ + //fmt.Println(sum) + + // t2 := time.Now() + // fmt.Println(elapsed) +} diff --git a/benchmark/trie.go b/benchmark/trie.go new file mode 100644 index 0000000..bdddf73 --- /dev/null +++ b/benchmark/trie.go @@ -0,0 +1,147 @@ +package main + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "regexp" + "runtime" + "strings" +) + +type Site struct { + Title string `json:"title"` + Body string `json:"body"` + Links []int `json:"links"` + ParentSite string `json:"parent_site"` + SiteURL string `json:"site_url"` + LinkURL []string `json:"link_urls"` + Slug string `json:"slug"` +} + +const ( + ALBHABET_SIZE = 26 +) + +type trieNode struct { + childrens [ALBHABET_SIZE]*trieNode + isWordEnd bool +} + +type trie struct { + root *trieNode +} + +func initTrie() *trie { + return &trie{ + root: &trieNode{}, + } +} + +func PrintMemUsage() { + var m runtime.MemStats + runtime.ReadMemStats(&m) + // For info on each, see: https://golang.org/pkg/runtime/#MemStats + fmt.Printf("Alloc = %v MiB", bToMb(m.Alloc)) + fmt.Printf("\tTotalAlloc = %v MiB", bToMb(m.TotalAlloc)) + fmt.Printf("\tSys = %v MiB", bToMb(m.Sys)) + fmt.Printf("\tNumGC = %v\n", m.NumGC) +} + +func bToMb(b uint64) uint64 { + return b / 1024 / 1024 +} + +func (t *trie) insert(word string) { + wordLength := len(word) + current := t.root + for i := 0; i < wordLength; i++ { + index := word[i] - 'a' + if current.childrens[index] == nil { + current.childrens[index] = &trieNode{} + } + current = current.childrens[index] + } + current.isWordEnd = true +} + +func (t *trie) find(word string) bool { + wordLength := len(word) + current := t.root + for i := 0; i < wordLength; i++ { + index := word[i] - 'a' + if current.childrens[index] == nil { + return false + } + current = current.childrens[index] + } + if current.isWordEnd { + return true + } + return false +} + +func main() { + jsonFile, err := os.Open("../site/data/site_data.json") + if err != nil { + fmt.Println(err) + } + + defer jsonFile.Close() + byteValue, _ := ioutil.ReadAll(jsonFile) + var sites []Site + json.Unmarshal(byteValue, &sites) + + trie := initTrie() +// no_of_words, err := strconv.Atoi(os.Args[1]) + PrintMemUsage() + for i := 0; i < len(sites); i++ { + sentence := strings.Split(sites[i].Body, " ") + for j := 0; j < len(sentence); j++ { + sentence[j] = strings.ToLower(sentence[j]) + matched, _ := regexp.MatchString(`^[a-z]*$`, sentence[j]) + // fmt.Println(matched, sentence[j]) + // t1 := time.Now() + if matched { + trie.insert(sentence[j]) + } + // t2 := time.Now() + // elapsed := t2.Sub(t1) + // sum += elapsed + } + } + runtime.GC() + PrintMemUsage() + +// t1 := time.Now() + +// k = 0 +// fl = 0 +// for i := 0; i < len(sites); i++ { +// sentence := strings.Split(sites[i].Body, " ") +// for j := 0; j < len(sentence); j++ { +// sentence[j] = strings.ToLower(sentence[j]) +// matched, _ := regexp.MatchString(`^[a-z]*$`, sentence[j]) + +// if matched { +// t1 := time.Now() +// trie.find(sentence[j]) +// t2 := time.Now() +// elapsed := t2.Sub(t1) +// sum += elapsed +// k++ +// if k >= no_of_words { +// fl = 1 +// break +// } +// } +// } +// if fl == 1 { +// break +// } +// } +// fmt.Println(sum) + // t2 := time.Now() + // fmt.Println(elapsed) +} \ No newline at end of file diff --git a/crawler/crawler.go b/crawler/crawler.go new file mode 100644 index 0000000..56f3c2d --- /dev/null +++ b/crawler/crawler.go @@ -0,0 +1,75 @@ +package crawler + +import ( + "fmt" + "sync" + + "github.com/siva2204/web-crawler/queue" +) + +// crawler bot type +type Crawler struct { + Wg sync.WaitGroup + Threads int + Queue *queue.Queue +} + +// run method starts crawling +func (c *Crawler) Run() { + // check if the url is already crawled + + if c.Queue.Len() == 0 { + fmt.Println("queue is empty add some seed url to crawl") + return + } + + ch := make(chan string, 10) + + for i := 0; i < c.Threads; i++ { + c.Wg.Add(1) + + go func(i int) { + for { + fmt.Println("receiving") + url, ok := <-ch + fmt.Println("enqueued", url) + fmt.Printf("crawling the %s url, now..", url) + if !ok { + c.Wg.Done() + return + } + + // crawl with the url + urls, err := uRLScrape(url) + + if err != nil { + fmt.Printf("Error crawling url %+v", err) + c.Wg.Done() + return + } + + for _, url := range urls { + c.Queue.Enqueue(url) + } + // enqueue the all the related url + } + }(i) + } + + // traversing the queue + // BFS + for { + if c.Queue.Len() != 0 { + fmt.Println("dequed", c.Queue.FrontQueue()) + + ch <- c.Queue.Dequeue() + } + + // TODO + // implementing something to stop the crawling + // may be with select and one more stop channel + } + + close(ch) + c.Wg.Wait() +} diff --git a/crawler/soup.go b/crawler/soup.go index e36552f..3f0d580 100644 --- a/crawler/soup.go +++ b/crawler/soup.go @@ -1,17 +1,18 @@ -package main +package crawler import ( - "regexp" "log" "net/http" + "regexp" + "github.com/PuerkitoBio/goquery" - "github.com/jdkato/prose/v2" "github.com/bbalet/stopwords" + "github.com/jdkato/prose/v2" ) var IsLetter = regexp.MustCompile(`^[a-z]+$`).MatchString -func URLScrape(url string) ([]string, error) { +func uRLScrape(url string) ([]string, error) { // Request the HTML page. res, err := http.Get(url) if err != nil { @@ -31,9 +32,8 @@ func URLScrape(url string) ([]string, error) { // array of url var urls []string - // Find the review items - doc.Find("a").Each(func(i int , s *goquery.Selection) { + doc.Find("a").Each(func(i int, s *goquery.Selection) { // For each item found, get the href href, _ := s.Attr("href") // push url to array @@ -63,37 +63,26 @@ func PDataScrape(url string) ( // array of url var wordArray []string - // Find the review items // doc.Find("p").Each(func(i int , s *goquery.Selection) { - // For each item found, get the text - text := doc.Text() // s.Text() - - //Return a string where HTML tags and French stop words has been removed - cleanContent := stopwords.CleanString(text, "en", true) - - data, err := prose.NewDocument(cleanContent) - if err != nil { - log.Fatal(err) - } + // For each item found, get the text + text := doc.Text() // s.Text() - // Iterate over the doc's tokens: - for _, tok := range data.Tokens() { - // log.Println(tok.Text, tok.Tag, tok.Label) - if IsLetter(tok.Text) { - wordArray = append(wordArray, tok.Text) - } - } - // }) - return wordArray, nil -} + //Return a string where HTML tags and French stop words has been removed + cleanContent := stopwords.CleanString(text, "en", true) -func main(){ - urls, err := PDataScrape("http://localhost:5000/compellingly-embrace-from-generation-x-is") + data, err := prose.NewDocument(cleanContent) if err != nil { log.Fatal(err) } - for _, url := range urls { - log.Println(url) + + // Iterate over the doc's tokens: + for _, tok := range data.Tokens() { + // log.Println(tok.Text, tok.Tag, tok.Label) + if IsLetter(tok.Text) { + wordArray = append(wordArray, tok.Text) + } } + // }) + return wordArray, nil } diff --git a/main.go b/main.go index 37fc80f..894478c 100644 --- a/main.go +++ b/main.go @@ -4,6 +4,8 @@ import ( "fmt" "github.com/siva2204/web-crawler/config" + "github.com/siva2204/web-crawler/crawler" + "github.com/siva2204/web-crawler/queue" redis_crawler "github.com/siva2204/web-crawler/redis" ) @@ -12,4 +14,14 @@ func main() { redis_crawler.CreateClient(config.Getenv("REDIS_HOST"), config.Getenv("REDIS_PORT")) redis_crawler.Client.Insert("hello", []string{"a", "b", "c"}) redis_crawler.Client.Append("world", []string{"a", "b", "c"}) + + crawler := crawler.Crawler{ + Threads: 50, + Queue: &queue.Queue{}, + } + + crawler.Queue.Enqueue("http://localhost:5000") + + crawler.Run() + } diff --git a/queue/queue.go b/queue/queue.go new file mode 100644 index 0000000..0b0cd58 --- /dev/null +++ b/queue/queue.go @@ -0,0 +1,84 @@ +package queue + +import "sync" + +// Node will be store the value and the next node as well +type Node struct { + url string + next *Node +} + +// Queue structure is tell us what our head is and what tail should be with length of the list +type Queue struct { + head *Node + tail *Node + length int + sync.RWMutex +} + +// enqueue it will be added new value into queue +func (ll *Queue) Enqueue(n string) { + ll.Lock() + defer ll.Unlock() + + var newNode Node // create new Node + newNode.url = n // set the data + + if ll.tail != nil { + ll.tail.next = &newNode + } + + ll.tail = &newNode + + if ll.head == nil { + ll.head = &newNode + } + ll.length++ +} + +// dequeue it will be removed the first value into queue (First In First Out) +func (ll *Queue) Dequeue() string { + ll.Lock() + defer ll.Unlock() + if ll.IsEmpty() { + return "" // if is empty return -1 + } + data := ll.head.url + + ll.head = ll.head.next + + if ll.head == nil { + ll.tail = nil + } + + ll.length-- + return data +} + +// isEmpty it will check our list is empty or not +func (ll *Queue) IsEmpty() bool { + ll.RLock() + defer ll.RUnlock() + return ll.length == 0 +} + +// len is return the length of queue +func (ll *Queue) Len() int { + ll.RLock() + defer ll.RUnlock() + return ll.length +} + +// frontQueue it will return the front data +func (ll *Queue) FrontQueue() string { + ll.RLock() + defer ll.RUnlock() + return ll.head.url +} + +// backQueue it will return the back data +func (ll *Queue) BackQueue() string { + ll.RLock() + defer ll.RUnlock() + return ll.tail.url +}