From 6364958fc2850b987bf64d68b7652b8af375be05 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Mon, 19 Jan 2026 01:05:12 +0000 Subject: [PATCH] Modernize project and fix URL resolution bugs - Initialized go.mod and updated outdated imports. - Fixed Link.Url() to correctly resolve absolute and relative URLs without mutating the base URL. - Improved error handling in NewCrawler and Client.Request. - Cleaned up NewClient initialization. - Added comprehensive tests for URL resolution and link clicking. --- client.go | 7 ++-- crawler.go | 11 ++++--- go.mod | 10 ++++++ go.sum | 71 ++++++++++++++++++++++++++++++++++++++++ scour_test.go | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++ selection.go | 21 +++++++++--- 6 files changed, 198 insertions(+), 11 deletions(-) create mode 100644 go.mod create mode 100644 go.sum diff --git a/client.go b/client.go index 69f399d..1fd0e87 100644 --- a/client.go +++ b/client.go @@ -16,7 +16,7 @@ type Client struct { // Create a new Client with and embedded // http.Client type func NewClient() *Client { - return &Client{&http.Client{}, &http.Request{}, &http.Response{}} + return &Client{Client: &http.Client{}} } // Request is used to initiate a client request @@ -32,7 +32,10 @@ func (c *Client) Request(method string, url string) (ret *Crawler, err error) { return nil, err } - crawler := NewCrawler(c.resp) + crawler, err := NewCrawler(c.resp) + if err != nil { + return nil, err + } return crawler, nil } diff --git a/crawler.go b/crawler.go index 7f8d502..348dd37 100644 --- a/crawler.go +++ b/crawler.go @@ -4,7 +4,7 @@ import ( "errors" "net/http" - "github.com/puerkitobio/goquery" + "github.com/PuerkitoBio/goquery" ) // The Crawler type wraps goquery's @@ -14,9 +14,12 @@ type Crawler struct { } // Create the new crawler type from the Response -func NewCrawler(resp *http.Response) *Crawler { - doc, _ := goquery.NewDocumentFromResponse(resp) - return &Crawler{doc} +func NewCrawler(resp *http.Response) (*Crawler, error) { + doc, err := goquery.NewDocumentFromResponse(resp) + if err != nil { + return nil, err + } + return &Crawler{doc}, nil } // Look for specific anchor tag within document diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..9d4cf2e --- /dev/null +++ b/go.mod @@ -0,0 +1,10 @@ +module github.com/brettdonohoo/scour + +go 1.24.3 + +require ( + github.com/PuerkitoBio/goquery v1.11.0 + golang.org/x/net v0.49.0 +) + +require github.com/andybalholm/cascadia v1.3.3 // indirect diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..a57e738 --- /dev/null +++ b/go.sum @@ -0,0 +1,71 @@ +github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw= +github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ= +github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= +github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= +golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= +golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= +golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= +golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/scour_test.go b/scour_test.go index b3606e4..5515623 100644 --- a/scour_test.go +++ b/scour_test.go @@ -1 +1,90 @@ package scour + +import ( + "fmt" + "net/http" + "net/http/httptest" + "strings" + "testing" +) + +func TestLinkUrlBrittle(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + fmt.Fprintln(w, "
About") + })) + defer ts.Close() + + client := NewClient() + crawler, err := client.Request("GET", ts.URL) + if err != nil { + t.Fatal(err) + } + + link, err := crawler.SelectLink("About") + if err != nil { + t.Fatal(err) + } + + url := link.Url() + expected := ts.URL + "/about" + if url != expected { + t.Errorf("Expected %s, got %s", expected, url) + } +} + +func TestLinkUrlAbsolute(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + fmt.Fprintln(w, "Google") + })) + defer ts.Close() + + client := NewClient() + crawler, _ := client.Request("GET", ts.URL) + link, _ := crawler.SelectLink("Google") + + url := link.Url() + expected := "http://google.com" + if url != expected { + t.Errorf("Expected %s, got %s", expected, url) + } +} + +func TestLinkUrlRelativePath(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + fmt.Fprintln(w, "About") + })) + defer ts.Close() + + client := NewClient() + crawler, _ := client.Request("GET", ts.URL+"/path/") + link, _ := crawler.SelectLink("About") + + url := link.Url() + expected := ts.URL + "/path/about" + if url != expected { + t.Errorf("Expected %s, got %s", expected, url) + } +} + +func TestClickRelative(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == "/about" { + fmt.Fprintln(w, "Welcome to About Page") + } else { + fmt.Fprintln(w, "About") + } + })) + defer ts.Close() + + client := NewClient() + crawler, _ := client.Request("GET", ts.URL) + link, _ := crawler.SelectLink("About") + crawler, err := client.Click(link) + if err != nil { + t.Fatal(err) + } + + if !strings.Contains(crawler.Find("body").Text(), "Welcome to About Page") { + t.Errorf("Expected 'Welcome to About Page', got '%s'", crawler.Find("body").Text()) + } +} diff --git a/selection.go b/selection.go index 932643c..1a85ec7 100644 --- a/selection.go +++ b/selection.go @@ -3,8 +3,8 @@ package scour import ( "net/url" - "code.google.com/p/go.net/html" - "github.com/puerkitobio/goquery" + "golang.org/x/net/html" + "github.com/PuerkitoBio/goquery" ) // Link is used for holding link data @@ -19,10 +19,21 @@ func (l *Link) Method() string { } // Create the new path for Link -// TODO: Flesh this out func (l *Link) Url() string { - l.url.Path = l.node.Attr[0].Val - return l.url.String() + var href string + for _, attr := range l.node.Attr { + if attr.Key == "href" { + href = attr.Val + break + } + } + + relative, err := url.Parse(href) + if err != nil { + return "" + } + + return l.url.ResolveReference(relative).String() } // Selection embeds goquery's Selection object