-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfetch_html.go
More file actions
107 lines (88 loc) · 1.8 KB
/
fetch_html.go
File metadata and controls
107 lines (88 loc) · 1.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
package main
import (
"bufio"
"crypto/md5"
"encoding/hex"
"fmt"
"io"
"net/http"
"net/url"
"os"
"time"
)
// UrlToName maps a URL to the name used for its cached file: the
// lowercase hexadecimal MD5 digest of the URL's bytes (32 characters).
func UrlToName(url string) string {
	digest := md5.Sum([]byte(url))
	return hex.EncodeToString(digest[:])
}
// Exists reports whether os.Stat succeeds for filename. Any Stat failure
// (not only "file does not exist") yields false.
func Exists(filename string) bool {
	if _, statErr := os.Stat(filename); statErr != nil {
		return false
	}
	return true
}
// downloadTimeout bounds one whole HTTP exchange (connection, headers and
// body) so that an unresponsive server cannot stall a worker indefinitely.
const downloadTimeout = 60 * time.Second

// downloadClient is shared by every Download call. http.Client is safe for
// concurrent use, and sharing it lets connections be reused.
var downloadClient = &http.Client{Timeout: downloadTimeout}

// Download fetches url and stores the response body in
// ./small_data/html/<UrlToName(url)>. If that file already exists the
// request is skipped.
//
// Download never returns an error: failures are reported on standard error
// and the URL is skipped, so one bad URL does not stop the crawl. A failed
// download leaves no file under the final name, so it is retried on a later
// run instead of being mistaken for a cached page.
func Download(url string) {
	filename := "./small_data/html/" + UrlToName(url)
	if Exists(filename) {
		fmt.Print("Cached ... " + url + " ==> " + filename + "\n")
		time.Sleep(10 * time.Millisecond)
		return
	}

	if err := fetchToFile(url, filename); err != nil {
		fmt.Fprintf(os.Stderr, "Failed ... %s: %v\n", url, err)
		return
	}

	fmt.Print("Fetched ... " + url + " ==> " + filename + "\n")
	// NOTE(review): presumably a politeness delay between requests made by
	// the same worker; the value is kept from the original code.
	time.Sleep(800 * time.Millisecond)
}

// fetchToFile GETs url and writes the body of a 2xx response to filename.
// The body is streamed to filename+".part" and renamed only after it has
// been written and closed successfully, so filename never holds a
// truncated page.
func fetchToFile(url, filename string) error {
	response, err := downloadClient.Get(url)
	if err != nil {
		return err
	}
	defer response.Body.Close()

	// Do not store error pages as if they were the requested content.
	if response.StatusCode < 200 || response.StatusCode > 299 {
		return fmt.Errorf("unexpected status %s", response.Status)
	}

	partial := filename + ".part"
	file, err := os.Create(partial)
	if err != nil {
		return err
	}

	if _, err := io.Copy(file, response.Body); err != nil {
		file.Close()
		os.Remove(partial)
		return fmt.Errorf("writing %s: %w", partial, err)
	}
	if err := file.Close(); err != nil {
		os.Remove(partial)
		return err
	}
	if err := os.Rename(partial, filename); err != nil {
		os.Remove(partial)
		return err
	}
	return nil
}
// Crawl is a worker loop. It calls Download for each URL received from c,
// one at a time, until c is closed and drained, and then sends a single
// true on quit to tell the caller it has finished.
func Crawl(c chan string, quit chan bool) {
	for {
		next, open := <-c
		if !open {
			break
		}
		Download(next)
	}
	quit <- true
}
// MyId returns the sum of the 16 bytes of the MD5 digest of url, a
// deterministic value in the range [0, 4080]. main uses it (modulo the
// worker count) to pick a queue.
func MyId(url string) int {
	total := 0
	for _, b := range md5.Sum([]byte(url)) {
		total += int(b)
	}
	return total
}
// main reads URLs from standard input, one per line, and distributes them
// over a fixed pool of Crawl workers.
//
// The queue is chosen from the URL's host, so all URLs of one host go to
// the same worker and are downloaded one after another (several hosts may
// share a worker). Lines that do not parse as URLs are reported on standard
// error and skipped. After input ends, main closes every queue and waits
// for all workers to finish. If reading standard input failed, the process
// exits with status 1 once the already-queued URLs have been processed.
func main() {
	const workerCount = 500
	// Each queue is buffered for queueDepth URLs so the reader is not
	// blocked by one slow worker until that many URLs are pending for it.
	const queueDepth = 200000

	urlq := make([]chan string, workerCount)
	quitq := make([]chan bool, workerCount)
	for i := range urlq {
		urlq[i] = make(chan string, queueDepth)
		quitq[i] = make(chan bool)
		go Crawl(urlq[i], quitq[i])
	}

	scanner := bufio.NewScanner(os.Stdin)
	for scanner.Scan() {
		inputURL := scanner.Text()
		u, err := url.Parse(inputURL)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Skipped ... %q: %v\n", inputURL, err)
			continue
		}
		urlq[MyId(u.Host)%workerCount] <- inputURL
	}
	// Scan also returns false on a read error or an over-long line; without
	// this check the remaining input would be dropped silently.
	scanErr := scanner.Err()
	if scanErr != nil {
		fmt.Fprintf(os.Stderr, "reading stdin: %v\n", scanErr)
	}

	// Closing a queue lets its worker leave the receive loop once the
	// queue is drained; each worker then signals on its quit channel.
	for _, q := range urlq {
		close(q)
	}
	for _, done := range quitq {
		<-done
	}

	if scanErr != nil {
		os.Exit(1)
	}
}