diff --git a/newsletter/go.mod b/newsletter/go.mod new file mode 100644 index 0000000..7b40d88 --- /dev/null +++ b/newsletter/go.mod @@ -0,0 +1,13 @@ +module bopbot/newsletter + +go 1.21.3 + +require ( + github.com/PuerkitoBio/goquery v1.8.1 + github.com/yosssi/gohtml v0.0.0-20201013000340-ee4748c638f4 +) + +require ( + github.com/andybalholm/cascadia v1.3.1 // indirect + golang.org/x/net v0.7.0 // indirect +) diff --git a/newsletter/go.sum b/newsletter/go.sum new file mode 100644 index 0000000..1a56422 --- /dev/null +++ b/newsletter/go.sum @@ -0,0 +1,37 @@ +github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM= +github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ= +github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= +github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= +github.com/yosssi/gohtml v0.0.0-20201013000340-ee4748c638f4 h1:0sw0nJM544SpsihWx1bkXdYLQDlzRflMgFJQ4Yih9ts= +github.com/yosssi/gohtml v0.0.0-20201013000340-ee4748c638f4/go.mod h1:+ccdNT0xMY1dtc5XBxumbYfOUhmduiGudqaDgD2rVRE= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.7.0 h1:rJrUqqhjsgNp7KqAIc25s9pZnjU7TUcSY7HcVZjdn1g= +golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/newsletter/scrape/.gitignore b/newsletter/scrape/.gitignore new file mode 100644 index 0000000..e171a20 --- /dev/null +++ b/newsletter/scrape/.gitignore @@ -0,0 +1 @@ +scrape diff --git a/newsletter/scrape/Makefile b/newsletter/scrape/Makefile new file mode 100644 index 0000000..e64989a --- /dev/null +++ b/newsletter/scrape/Makefile @@ -0,0 +1,4 @@ +all: + go get ./... + go build -o scrape . + ./scrape diff --git a/newsletter/scrape/main.go b/newsletter/scrape/main.go new file mode 100644 index 0000000..62764b9 --- /dev/null +++ b/newsletter/scrape/main.go @@ -0,0 +1,134 @@ +package main + +import ( + "fmt" + "log" + "net/http" + "net/url" + "strings" + + "github.com/PuerkitoBio/goquery" + "github.com/yosssi/gohtml" +) + +var DEBUG bool = false + +func GrabPage(url string) *goquery.Document { + // Request the HTML page. + res, err := http.Get(url) + if err != nil { + log.Fatal(err) + } + defer res.Body.Close() + if res.StatusCode != 200 { + log.Fatalf("status code error: %d %s", res.StatusCode, res.Status) + } + + // Load the HTML document + doc, err := goquery.NewDocumentFromReader(res.Body) + if err != nil { + log.Fatal(err) + } + + if DEBUG { + //print out the page + fmt.Printf(">>>status code: %d\n", res.StatusCode) + pageHtml, err := doc.Html() + if err != nil { + log.Fatal(err) + } + fmt.Printf(">>>page html: %s\n", gohtml.Format(pageHtml)) + } + + return doc +} + +func GrabPostTitleFromPage(doc *goquery.Document, titleSelector string) string { + blogTitle := doc.Find(titleSelector).Text() + if DEBUG { + fmt.Printf(">>>post title: %s\n", blogTitle) + } + + return blogTitle +} + +// return the root url domain from provided postUrl +func GetRootUrl(postUrl string) string { + url, err := url.Parse(postUrl) + if err != nil { + log.Fatal(err) + } + url.Path = "/" + + if DEBUG { + fmt.Printf(">>>root url: %s\n", url.String()) + } + + return url.String() +} + +// replace root src and hrefs with FQDN refs +func ReplaceRootRefs(html string, postUrl string) string { + rootReplacementUrl := GetRootUrl(postUrl) + html = strings.Replace(html, "=\"/", "=\"" + rootReplacementUrl, -1) + + return html +} + +func GrabPostFromPage(doc *goquery.Document, postUrl string, postSelector string) string { + // grab post html content + blogContent, err := doc.Find(postSelector).Html() + if err != nil { + log.Fatal(err) + } + + // replace `/` resource refs with FQDN + blogContent = ReplaceRootRefs(blogContent, postUrl) + + if DEBUG { + fmt.Printf(">>>blog post: %s\n", gohtml.Format(blogContent)) + } + + return blogContent +} + +func ScrapePost(postUrl string, titleSelector string, postSelector string) (string, string) { + doc := GrabPage(postUrl) + + title := GrabPostTitleFromPage(doc, titleSelector) + postHtml := GrabPostFromPage(doc, postUrl, postSelector) + return title, postHtml +} + +func main() { + // where can we find the post title and content? + titleSelector := "div#content div.container h2" + postSelector := "div#content div.container" + + + // // example usage + // postUrl := "https://blog.ruebenramirez.com/posts/2023-10-06-gear-update/" + // title, postHtml := ScrapePost(postUrl, titleSelector, postSelector) + // fmt.Printf(">>>blog post title: %s\n", gohtml.Format(title)) + // fmt.Printf(">>>blog post html: %s\n", gohtml.Format(postHtml)) + + // example usage + // postUrl := "https://blog.ruebenramirez.com/posts/2023-10-21-round-2-with-covid/" + // title, postHtml := ScrapePost(postUrl, titleSelector, postSelector) + // fmt.Printf(">>>blog post title: %s\n", gohtml.Format(title)) + // fmt.Printf(">>>blog post html: %s\n", gohtml.Format(postHtml)) + + // example usage + postUrl := "https://blog.ruebenramirez.com/posts/2023-11-05-celebratin-60-pounds-down/" + title, postHtml := ScrapePost(postUrl, titleSelector, postSelector) + fmt.Printf(">>>blog post title: %s\n", gohtml.Format(title)) + fmt.Printf(">>>blog post html: %s\n", gohtml.Format(postHtml)) + + // example usage + // postUrl := "https://blog.ruebenramirez.com/posts/2023-10-30-good-news-from-biopsy/" + // title, postHtml := ScrapePost(postUrl, titleSelector, postSelector) + // fmt.Printf(">>>blog post title: %s\n", gohtml.Format(title)) + // fmt.Printf(">>>blog post html: %s\n", gohtml.Format(postHtml)) +} + +