From 368dc33d90fb9a3ee05ee79469c90e8edc4ec4b0 Mon Sep 17 00:00:00 2001 From: LZH <128961083+DongfengMissile@users.noreply.github.com> Date: Sat, 21 Oct 2023 14:19:45 +0800 Subject: [PATCH 01/14] Update crawler.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit “base标签” 并不一定以标签的形式()出现在HTML代码之中,有一些时候 “base标签” 会以变量赋值的形式出现在HTML代码中,例如:(base: "../script/";baseUrl: './';BASEURL="/baseProj/";basePath = "../../";)。 我添加了一段代码,用来判断以变量赋值的形式存在的 “base标签” 。 --- crawler/crawler.go | 110 ++++++++++++++++++++++++++++++++------------- 1 file changed, 79 insertions(+), 31 deletions(-) diff --git a/crawler/crawler.go b/crawler/crawler.go index 0a50caf..16f1d56 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -2,16 +2,15 @@ package crawler import ( "compress/gzip" - "fmt" - "github.com/pingc0y/URLFinder/cmd" - "github.com/pingc0y/URLFinder/config" - "github.com/pingc0y/URLFinder/result" - "github.com/pingc0y/URLFinder/util" "io" "net/http" "net/url" "regexp" "strings" + + "github.com/pingc0y/URLFinder/cmd" + "github.com/pingc0y/URLFinder/config" + "github.com/pingc0y/URLFinder/util" ) // 蜘蛛抓取页面内容 @@ -25,7 +24,7 @@ func Spider(u string, num int) { }() config.Mux.Lock() - fmt.Printf("\rStart %d Spider...", config.Progress) + // fmt.Printf("\rStart %d Spider...", config.Progress) config.Progress++ config.Mux.Unlock() //标记完成 @@ -53,6 +52,12 @@ func Spider(u string, num int) { request.Header.Set("Accept-Encoding", "gzip") //使用gzip压缩传输数据让访问更快 request.Header.Set("User-Agent", util.GetUserAgent()) request.Header.Set("Accept", "*/*") + u_str, err := url.Parse(u) + if err != nil { + return + } + request.Header.Set("Referer", u_str.Scheme+"://"+u_str.Host) //#### + //增加header选项 if cmd.C != "" { request.Header.Set("Cookie", cmd.C) @@ -62,27 +67,6 @@ func Spider(u string, num int) { util.SetHeadersConfig(&request.Header) } - //处理返回结果 - //tr := &http.Transport{ - // TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, - //} - //client = &http.Client{Timeout: time.Duration(cmd.TI) * time.Second, - // Transport: tr, - // CheckRedirect: func(req *http.Request, via []*http.Request) error { - // if len(via) >= 10 { - // return fmt.Errorf("Too many redirects") - // } - // if len(via) > 0 { - // if via[0] != nil && via[0].URL != nil { - // result.Redirect[via[0].URL.String()] = true - // } else { - // result.Redirect[req.URL.String()] = true - // } - // - // } - // return nil - // }, - //} response, err := client.Do(request) if err != nil { return @@ -115,6 +99,7 @@ func Spider(u string, num int) { host := response.Request.URL.Host scheme := response.Request.URL.Scheme source := scheme + "://" + host + path + judge_base := false //#### //处理base标签 re := regexp.MustCompile("base.{1,5}href.{1,5}(http.+?//[^\\s]+?)[\"'‘“]") base := re.FindAllStringSubmatch(result, -1) @@ -127,13 +112,76 @@ func Spider(u string, num int) { } else { path = "/" } + } else { //#### + re := regexp.MustCompile("(?i)base.{0,5}[:=]\\s*\"(.*?)\"") + base := re.FindAllStringSubmatch(result, -1) + if len(base) > 0 { + pattern := "[^.\\/\\w]" + re, _ := regexp.Compile(pattern) + // 检查字符串是否包含匹配的字符 + result := re.MatchString(base[0][1]) + if !result { // 字符串中没有其他字符 + if len(base[0][1]) > 1 && base[0][1][:2] == "./" { // base路径从当前目录出发 + judge_base = true + path = path[:strings.LastIndex(path, "/")] + base[0][1][1:] + } else if len(base[0][1]) > 2 && base[0][1][:3] == "../" { // base路径从上一级目录出发 + judge_base = true + pattern := "^[./]+$" + matched, _ := regexp.MatchString(pattern, base[0][1]) + if matched { // 
仅处理的base路径中只有 ./ 的 + path = path[:strings.LastIndex(path, "/")+1] + base[0][1] + } else { + find_str := "" + if strings.Contains(strings.TrimPrefix(base[0][1], "../"), "/") { + find_str = base[0][1][3 : strings.Index(strings.TrimPrefix(base[0][1], "../"), "/")+3] + } else { + find_str = base[0][1][3:] + } + if strings.Contains(path, find_str) { + path = path[:strings.Index(path, find_str)] + base[0][1][3:] + } else { + path = path[:strings.LastIndex(path, "/")+1] + base[0][1] + } + } + } else if len(base[0][1]) > 4 && strings.HasPrefix(base[0][1], "http") { //目录从http + judge_base = true + path = base[0][1] + } else if len(base[0][1]) > 0 { + judge_base = true + if base[0][1][0] == 47 { //base路径从根目录出发 + path = base[0][1] + } else { //base路径未指明从哪路出发 + find_str := "" + if strings.Contains(base[0][1], "/") { + find_str = base[0][1][:strings.Index(base[0][1], "/")] + } else { + find_str = base[0][1] + } + if strings.Contains(path, find_str) { + path = path[:strings.Index(path, find_str)] + base[0][1] + } else { + path = path[:strings.LastIndex(path, "/")+1] + base[0][1] + } + } + } + if !strings.HasSuffix(path, "/") { + path += "/" + } + } + } } + is = false <-config.Ch //提取js - jsFind(result, host, scheme, path, u, num) + jsFind(result, host, scheme, path, u, num, judge_base) //提取url - urlFind(result, host, scheme, path, u, num) + urlFind(result, host, scheme, path, u, num, judge_base) + // 防止base判断错误 + if judge_base { + jsFind(result, host, scheme, path, u, num, false) + urlFind(result, host, scheme, path, u, num, false) + } //提取信息 infoFind(result, source) @@ -142,8 +190,8 @@ func Spider(u string, num int) { // 打印Validate进度 func PrintProgress() { config.Mux.Lock() - num := len(result.ResultJs) + len(result.ResultUrl) - fmt.Printf("\rValidate %.0f%%", float64(config.Progress+1)/float64(num+1)*100) + // num := len(result.ResultJs) + len(result.ResultUrl) + // fmt.Printf("\rValidate %.0f%%", float64(config.Progress+1)/float64(num+1)*100) config.Progress++ config.Mux.Unlock() } From 5bfd2ff117ccd59733623a4ed3e20622a948e2b1 Mon Sep 17 00:00:00 2001 From: LZH <128961083+DongfengMissile@users.noreply.github.com> Date: Sat, 21 Oct 2023 14:56:09 +0800 Subject: [PATCH 02/14] Update crawler.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 作者对于“base标签”的判断,仅是判断HTML代码中是否存在base标签。 但是,“base标签” 并不一定以标签的形式()出现在HTML代码之中,有一些时候 “base标签” 会以变量赋值的形式出现在HTML代码中,例如:(base: "../script/";baseUrl: './';BASEURL="/baseProj/";basePath = "../../";)。 我添加了一段代码,用来判断以变量赋值的形式存在的 “base标签” 。 --- crawler/crawler.go | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/crawler/crawler.go b/crawler/crawler.go index 16f1d56..7d7f91f 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -2,6 +2,7 @@ package crawler import ( "compress/gzip" + "fmt" "io" "net/http" "net/url" @@ -10,6 +11,7 @@ import ( "github.com/pingc0y/URLFinder/cmd" "github.com/pingc0y/URLFinder/config" + "github.com/pingc0y/URLFinder/result" "github.com/pingc0y/URLFinder/util" ) @@ -24,7 +26,7 @@ func Spider(u string, num int) { }() config.Mux.Lock() - // fmt.Printf("\rStart %d Spider...", config.Progress) + fmt.Printf("\rStart %d Spider...", config.Progress) config.Progress++ config.Mux.Unlock() //标记完成 @@ -120,15 +122,15 @@ func Spider(u string, num int) { re, _ := regexp.Compile(pattern) // 检查字符串是否包含匹配的字符 result := re.MatchString(base[0][1]) - if !result { // 字符串中没有其他字符 - if len(base[0][1]) > 1 && base[0][1][:2] == "./" { // base路径从当前目录出发 + if !result { // 字符串中没有其他特殊字符 + if 
len(base[0][1]) > 1 && base[0][1][:2] == "./" { //base 路径从当前目录出发 judge_base = true path = path[:strings.LastIndex(path, "/")] + base[0][1][1:] - } else if len(base[0][1]) > 2 && base[0][1][:3] == "../" { // base路径从上一级目录出发 + } else if len(base[0][1]) > 2 && base[0][1][:3] == "../" { //base 路径从上一级目录出发 judge_base = true pattern := "^[./]+$" matched, _ := regexp.MatchString(pattern, base[0][1]) - if matched { // 仅处理的base路径中只有 ./ 的 + if matched { // 处理的 base 路径中只有 ./的 path = path[:strings.LastIndex(path, "/")+1] + base[0][1] } else { find_str := "" @@ -143,14 +145,14 @@ func Spider(u string, num int) { path = path[:strings.LastIndex(path, "/")+1] + base[0][1] } } - } else if len(base[0][1]) > 4 && strings.HasPrefix(base[0][1], "http") { //目录从http + } else if len(base[0][1]) > 4 && strings.HasPrefix(base[0][1], "http") { // base标签包含协议 judge_base = true path = base[0][1] } else if len(base[0][1]) > 0 { judge_base = true - if base[0][1][0] == 47 { //base路径从根目录出发 + if base[0][1][0] == 47 { //base 路径从根目录出发 path = base[0][1] - } else { //base路径未指明从哪路出发 + } else { //base 路径未指明从哪出发 find_str := "" if strings.Contains(base[0][1], "/") { find_str = base[0][1][:strings.Index(base[0][1], "/")] @@ -190,8 +192,8 @@ func Spider(u string, num int) { // 打印Validate进度 func PrintProgress() { config.Mux.Lock() - // num := len(result.ResultJs) + len(result.ResultUrl) - // fmt.Printf("\rValidate %.0f%%", float64(config.Progress+1)/float64(num+1)*100) + num := len(result.ResultJs) + len(result.ResultUrl) + fmt.Printf("\rValidate %.0f%%", float64(config.Progress+1)/float64(num+1)*100) config.Progress++ config.Mux.Unlock() } From 9038dff18baff642938198705f70a6c97882407c Mon Sep 17 00:00:00 2001 From: LZH <128961083+DongfengMissile@users.noreply.github.com> Date: Sat, 21 Oct 2023 15:02:29 +0800 Subject: [PATCH 03/14] Update find.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 在 jsFind 和 urlFind 两个函数中添加了一段代码,用来将 ”base 标签“ 和 url 合并在一起。 --- crawler/find.go | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/crawler/find.go b/crawler/find.go index 3e06e2e..85cf6a8 100644 --- a/crawler/find.go +++ b/crawler/find.go @@ -1,16 +1,17 @@ package crawler import ( + "regexp" + "strings" + "github.com/pingc0y/URLFinder/cmd" "github.com/pingc0y/URLFinder/config" "github.com/pingc0y/URLFinder/mode" "github.com/pingc0y/URLFinder/result" - "regexp" - "strings" ) // 分析内容中的js -func jsFind(cont, host, scheme, path, source string, num int) { +func jsFind(cont, host, scheme, path, source string, num int, judge_base bool) { var cata string care := regexp.MustCompile("/.*/{1}|/") catae := care.FindAllString(path, -1) @@ -31,6 +32,12 @@ func jsFind(cont, host, scheme, path, source string, num int) { if js[0] == "" { continue } + + // base标签的处理 #### + if judge_base { + js[0] = path + js[0] + } + if strings.HasPrefix(js[0], "https:") || strings.HasPrefix(js[0], "http:") { switch AppendJs(js[0], source) { case 0: @@ -95,7 +102,7 @@ func jsFind(cont, host, scheme, path, source string, num int) { } // 分析内容中的url -func urlFind(cont, host, scheme, path, source string, num int) { +func urlFind(cont, host, scheme, path, source string, num int, judge_base bool) { var cata string care := regexp.MustCompile("/.*/{1}|/") catae := care.FindAllString(path, -1) @@ -104,6 +111,7 @@ func urlFind(cont, host, scheme, path, source string, num int) { } else { cata = catae[0] } + host = scheme + "://" + host //url匹配正则 @@ -111,7 +119,6 @@ func urlFind(cont, host, scheme, path, source 
string, num int) { for _, re := range config.UrlFind { reg := regexp.MustCompile(re) urls := reg.FindAllStringSubmatch(cont, -1) - //fmt.Println(urls) urls = urlFilter(urls) //循环提取url放到结果中 @@ -119,6 +126,12 @@ func urlFind(cont, host, scheme, path, source string, num int) { if url[0] == "" { continue } + + // base标签的处理 #### + if judge_base { + url[0] = path + url[0] + } + if strings.HasPrefix(url[0], "https:") || strings.HasPrefix(url[0], "http:") { switch AppendUrl(url[0], source) { case 0: From 12175603f5f3cb9ab6851e1e028dd1948c738e7b Mon Sep 17 00:00:00 2001 From: LZH <128961083+DongfengMissile@users.noreply.github.com> Date: Sat, 21 Oct 2023 15:16:17 +0800 Subject: [PATCH 04/14] Update run.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 作者仅是使用requests库获取页面的源代码,然后进行深度探测,这样获取的数据有限。 我在作者代码的基础上,做了如下修改: (1)使用 go_rod 库来动态的获取页面加载过程中所有事件的响应体(包括页面源代码) (2)保存所有响应事件的响应体和响应头(除.jpg、.svg等格式的文件) (3)去除了 探测非本ip下url 的功能,因为非本ip下url的数据冗余,绝大多数数据与本ip无关。若需要此功能,可以在AppendJs和AppendUrl下的代码中删除小段代码(我已注释)。 --- crawler/run.go | 311 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 294 insertions(+), 17 deletions(-) diff --git a/crawler/run.go b/crawler/run.go index baa0cc2..fafb5ef 100644 --- a/crawler/run.go +++ b/crawler/run.go @@ -3,25 +3,35 @@ package crawler import ( "bufio" "crypto/tls" + "encoding/json" "flag" "fmt" - "github.com/pingc0y/URLFinder/cmd" - "github.com/pingc0y/URLFinder/config" - "github.com/pingc0y/URLFinder/mode" - "github.com/pingc0y/URLFinder/result" - "github.com/pingc0y/URLFinder/util" "io" "net" "net/http" "net/url" "os" "regexp" + "strconv" "strings" "time" + + "github.com/go-rod/rod" + "github.com/go-rod/rod/lib/launcher" + "github.com/go-rod/rod/lib/proto" + "github.com/pingc0y/URLFinder/cmd" + "github.com/pingc0y/URLFinder/config" + "github.com/pingc0y/URLFinder/mode" + "github.com/pingc0y/URLFinder/result" + "github.com/pingc0y/URLFinder/util" ) var client *http.Client +// 全局变量 存储body +var ResBodyMap = make(map[string]string, 0) +var ResHeaderMap = make(map[string]proto.NetworkHeaders, 0) + func load() { if cmd.I { @@ -177,13 +187,23 @@ func ValidateFF() { for i, s := range result.ResultJs { config.Wg.Add(1) config.Jsch <- 1 - go JsState(s.Url, i, result.ResultJs[i].Source) + // 判断响应数据是否已经在页面加载过程存储 + rod_flag := false + if len(ResBodyMap[s.Url]) != 0 { + rod_flag = true + } + go JsState(s.Url, i, result.ResultJs[i].Source, rod_flag) } //验证URL状态 for i, s := range result.ResultUrl { config.Wg.Add(1) config.Urlch <- 1 - go UrlState(s.Url, i) + // 判断响应数据是否已经在页面加载过程存储 + rod_flag := false + if len(ResBodyMap[s.Url]) != 0 { + rod_flag = true + } + go UrlState(s.Url, i, rod_flag) } config.Wg.Wait() @@ -199,14 +219,245 @@ func ValidateFF() { AddSource() } +// 定义函数 url_parse,参数是一个字符串 u,返回值是三个字符串 +func url_parse(u string) (string, string, string) { + // 解析 u 为一个 URL 对象 + u_str, err := url.Parse(u) + // 如果解析出错,就返回空字符串 + if err != nil { + return "", "", "" + } + // 获取 URL 对象的 host、scheme、path 属性 + host := u_str.Host + scheme := u_str.Scheme + path := u_str.Path + // 返回这三个属性的值 + return host, scheme, path +} + +// 提取响应体中的 Base 标签信息 +func extractBase(host, scheme, path, result string) (string, string, string, bool) { + judge_base := false + //处理base标签 + re := regexp.MustCompile("base.{1,5}href.{1,5}(http.+?//[^\\s]+?)[\"'‘“]") + base := re.FindAllStringSubmatch(result, -1) + if len(base) > 0 { + host = regexp.MustCompile("http.*?//([^/]+)").FindAllStringSubmatch(base[0][1], -1)[0][1] + scheme = 
regexp.MustCompile("(http.*?)://").FindAllStringSubmatch(base[0][1], -1)[0][1] + paths := regexp.MustCompile("http.*?//.*?(/.*)").FindAllStringSubmatch(base[0][1], -1) + if len(paths) > 0 { + path = paths[0][1] + } else { + path = "/" + } + } else { //#### + re := regexp.MustCompile("(?i)base.{0,5}[:=]\\s*\"(.*?)\"") + base := re.FindAllStringSubmatch(result, -1) + if len(base) > 0 { + pattern := "[^.\\/\\w]" + re, _ := regexp.Compile(pattern) + // 检查字符串是否包含匹配的字符 + result := re.MatchString(base[0][1]) + if !result { // 字符串中没有其他特殊字符 + if len(base[0][1]) > 1 && base[0][1][:2] == "./" { // base 路径从当前目录出发 + judge_base = true + path = path[:strings.LastIndex(path, "/")] + base[0][1][1:] + } else if len(base[0][1]) > 2 && base[0][1][:3] == "../" { // base 路径从上一级目录出发 + judge_base = true + pattern := "^[./]+$" + matched, _ := regexp.MatchString(pattern, base[0][1]) + if matched { // 处理的 base 路径中只有 ./的 + path = path[:strings.LastIndex(path, "/")+1] + base[0][1] + } else { + find_str := "" + if strings.Contains(strings.TrimPrefix(base[0][1], "../"), "/") { + find_str = base[0][1][3 : strings.Index(strings.TrimPrefix(base[0][1], "../"), "/")+3] + } else { + find_str = base[0][1][3:] + } + if strings.Contains(path, find_str) { + path = path[:strings.Index(path, find_str)] + base[0][1][3:] + } else { + path = path[:strings.LastIndex(path, "/")+1] + base[0][1] + } + } + } else if len(base[0][1]) > 4 && strings.HasPrefix(base[0][1], "http") { // base 标签包含协议 + judge_base = true + path = base[0][1] + } else if len(base[0][1]) > 0 { + judge_base = true + if base[0][1][0] == 47 { //base 路径从根目录出发 + path = base[0][1] + } else { //base 路径未指明从哪路出发 + find_str := "" + if strings.Contains(base[0][1], "/") { + find_str = base[0][1][:strings.Index(base[0][1], "/")] + } else { + find_str = base[0][1] + } + if strings.Contains(path, find_str) { + path = path[:strings.Index(path, find_str)] + base[0][1] + } else { + path = path[:strings.LastIndex(path, "/")+1] + base[0][1] + } + } + } + if !strings.HasSuffix(path, "/") { + path += "/" + } + } + } + } + return host, scheme, path, judge_base +} + +// 获取网页加载的事件的响应体 +func rod_spider(u string, num int) { + //初始化浏览器 + launch := launcher.New().Headless(true).Set("test-type").Set("ignore-certificate-errors"). + NoSandbox(true).Set("disable-gpu").Set("disable-plugins").Set("incognito"). + Set("no-default-browser-check").Set("disable-dev-shm-usage"). 
+ Set("disable-plugins").MustLaunch() + browser := rod.New().ControlURL(launch).MustConnect() + + //添加关闭 + defer browser.Close() + + // 设置浏览器的证书错误处理,忽略所有证书错误 + browser.MustIgnoreCertErrors(true) + + // 设置浏览器打开的页面 + pageTarget := proto.TargetCreateTarget{URL: u} + page, err := browser.Page(pageTarget) + if err != nil { + fmt.Println(err) + } + + // 在最后关闭页面 + defer func() { + err := page.Close() + if err != nil { + // 处理错误 + fmt.Println(err) + } + }() + + // 设置页面的超时时间为 40 秒 + page = page.Timeout(40 * time.Second) + + // 创建一个空的 map,键是 proto.NetworkRequestID 类型,值是 string 类型 + requestMap := make(map[string]string, 0) + + // 使用 go 语句开启一个协程,在协程中处理页面的一些事件 + go page.EachEvent(func(e *proto.PageJavascriptDialogOpening) { + // 处理 JavaScript 对话框 + _ = proto.PageHandleJavaScriptDialog{Accept: true, PromptText: ""}.Call(page) + }, func(e *proto.NetworkResponseReceived) { + // 获取请求的 ID 和 URL + ResponseURL := e.Response.URL + // fmt.Println(e.Response.URL, e.RequestID) + ResHeaderMap[ResponseURL] = e.Response.Headers + + // 在 requestMap 中填充数据 + requestMap[ResponseURL] = "" + + })() + + // 等待页面加载完成,并处理可能出现的错误 + pageLoadErr := page.WaitLoad() + if pageLoadErr != nil { + fmt.Println(pageLoadErr) + } + + // 等待页面的 DOM 结构稳定 + page.WaitStable(2 * time.Second) + + // 打印页面源码 + htmlStr, err := page.HTML() + if err != nil { + fmt.Println(err) + } + + for url, _ := range requestMap { + // 调用 page.GetResource 方法来获取响应体 + ResponseBody, _ := page.GetResource(url) + requestMap[url] = string(ResponseBody) + } + + // 存储页面源码 + requestMap[u] = string(htmlStr) + // fmt.Println(requestMap[u]) + + // 遍历响应体,提取 Base 标签、提取 js 、提取 url 、 + for url, body := range requestMap { + // 判断响应体是否为空 + if len(body) == 0 { + continue + } + + // 遍历 BodyFiler 切片中的每个元素 + re := regexp.MustCompile("\\.jpeg\\?|\\.jpg\\?|\\.png\\?|.gif\\?|www\\.w3\\.org|example\\.com|.*,$|.*\\.jpeg$|.*\\.jpg$|.*\\.png$|.*\\.gif$|.*\\.ico$|.*\\.svg$|.*\\.vue$|.*\\.ts$") + if re.MatchString(url) { + continue + } + + // fmt.Println("目标url及响应体信息: ", url, len(body)) + + // 添加body数据 + ResBodyMap[url] = body + + // 将响应头数据转换成map存储 + Res_header := make(map[string]string, 0) + if len(ResHeaderMap[url]) != 0 { + data, err := json.Marshal(ResHeaderMap[url]) + if err != nil { + fmt.Println(err) + } + err = json.Unmarshal(data, &Res_header) + if err != nil { + fmt.Println(err) + } + } + + // 添加首页动态加载的数据 + if strings.HasSuffix(url, ".js") || strings.Contains(url, ".js?") { + result.ResultJs = append(result.ResultJs, mode.Link{Url: url, Status: strconv.Itoa(200), Size: strconv.Itoa(len(body)), ResponseHeaders: Res_header, ResponseBody: body}) + // AppendJs(url, u) + } else { + result.ResultUrl = append(result.ResultUrl, mode.Link{Url: url, Status: strconv.Itoa(200), Size: strconv.Itoa(len(body)), ResponseHeaders: Res_header, ResponseBody: body}) + // AppendUrl(url, u) + } + + host, scheme, path := url_parse(url) + + judge_base := false + host, scheme, path, judge_base = extractBase(host, scheme, path, body) + + //提取js + jsFind(body, host, scheme, path, u, num, judge_base) + //提取url + urlFind(body, host, scheme, path, u, num, judge_base) + // 防止base判断错误 + if judge_base { + jsFind(body, host, scheme, path, u, num, false) + urlFind(body, host, scheme, path, u, num, false) + } + + } + +} + func start(u string) { fmt.Println("Target URL: " + u) - config.Wg.Add(1) - config.Ch <- 1 - go Spider(u, 1) - config.Wg.Wait() - config.Progress = 1 - fmt.Printf("\r\nSpider OK \n") + + // config.Wg.Add(1) + // config.Ch <- 1 + // go Spider(u, 1) // ### + rod_spider(u, 1) + // config.Wg.Wait() + // 
config.Progress = 1 + + fmt.Printf("\r\nRod_Spider OK \n") result.ResultUrl = util.RemoveRepeatElement(result.ResultUrl) result.ResultJs = util.RemoveRepeatElement(result.ResultJs) if cmd.S != "" { @@ -217,13 +468,23 @@ func start(u string) { for i, s := range result.ResultJs { config.Wg.Add(1) config.Jsch <- 1 - go JsState(s.Url, i, result.ResultJs[i].Source) + // 判断响应数据是否已经在页面加载过程存储 + rod_flag := false + if len(ResBodyMap[s.Url]) != 0 { + rod_flag = true + } + go JsState(s.Url, i, result.ResultJs[i].Source, rod_flag) } //验证URL状态 for i, s := range result.ResultUrl { config.Wg.Add(1) config.Urlch <- 1 - go UrlState(s.Url, i) + // 判断响应数据是否已经在页面加载过程存储 + rod_flag := false + if len(ResBodyMap[s.Url]) != 0 { + rod_flag = true + } + go UrlState(s.Url, i, rod_flag) } config.Wg.Wait() @@ -241,7 +502,7 @@ func start(u string) { func Res() { if len(result.ResultJs) == 0 && len(result.ResultUrl) == 0 { - fmt.Println("未获取到数据") + fmt.Println(os.Stdout, cmd.U, "Data not captured") return } //打印还是输出 @@ -273,11 +534,20 @@ func AppendJs(ur string, urltjs string) int { if err != nil { return 2 } + + // 过滤其他ip #### + host1, _, _ := url_parse(ur) + host2, _, _ := url_parse(urltjs) + if host1 != host2 { + return 2 + } + for _, eachItem := range result.ResultJs { if eachItem.Url == ur { return 0 } } + result.ResultJs = append(result.ResultJs, mode.Link{Url: ur}) if strings.HasSuffix(urltjs, ".js") { result.Jsinurl[ur] = result.Jsinurl[urltjs] @@ -301,6 +571,14 @@ func AppendUrl(ur string, urlturl string) int { if err != nil { return 2 } + + // 过滤其他ip #### + host1, _, _ := url_parse(ur) + host2, _, _ := url_parse(urlturl) + if host1 != host2 { + return 2 + } + for _, eachItem := range result.ResultUrl { if eachItem.Url == ur { return 0 @@ -383,5 +661,4 @@ func Initialization() { result.Jstourl = make(map[string]string) result.Urltourl = make(map[string]string) result.Redirect = make(map[string]bool) - } From bdb85cd457d95b77b648e0666f10550e67d0722c Mon Sep 17 00:00:00 2001 From: LZH <128961083+DongfengMissile@users.noreply.github.com> Date: Sat, 21 Oct 2023 15:19:50 +0800 Subject: [PATCH 05/14] Update run.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 作者只是利用requests库获取页面的源代码,然后进行深度探测,这样获取的数据有限。 我在作者代码的基础上,做了如下修改: (1)使用go_rod库来动态的获取页面加载过程中所有事件的响应体(包括页面源代码) (2)保存所有响应事件的响应体和响应头(除.jpg、.svg等格式的文件) (3)去除了探测非ip下url的功能,因为非本ip下url的数据冗长冗余,且大部分数据与本ip无关。若需要此功能,可以在AppendJs和AppendUrl下的代码中删除小段代码(我已添加注释)。 --- crawler/run.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawler/run.go b/crawler/run.go index fafb5ef..49edf4e 100644 --- a/crawler/run.go +++ b/crawler/run.go @@ -238,7 +238,7 @@ func url_parse(u string) (string, string, string) { // 提取响应体中的 Base 标签信息 func extractBase(host, scheme, path, result string) (string, string, string, bool) { judge_base := false - //处理base标签 + // 处理base标签 re := regexp.MustCompile("base.{1,5}href.{1,5}(http.+?//[^\\s]+?)[\"'‘“]") base := re.FindAllStringSubmatch(result, -1) if len(base) > 0 { @@ -250,7 +250,7 @@ func extractBase(host, scheme, path, result string) (string, string, string, boo } else { path = "/" } - } else { //#### + } else { // 处理 "base 标签" re := regexp.MustCompile("(?i)base.{0,5}[:=]\\s*\"(.*?)\"") base := re.FindAllStringSubmatch(result, -1) if len(base) > 0 { From b02ec1cdec92c1041f87f2e11250e0b2b89b3463 Mon Sep 17 00:00:00 2001 From: LZH <128961083+DongfengMissile@users.noreply.github.com> Date: Sat, 21 Oct 2023 15:23:02 +0800 Subject: [PATCH 06/14] Update run.go MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 作者使用requests库,仅获取了页面的源代码,然后进行深度探测,这样获取的数据有限。 我在作者代码的基础上,做了如下修改: (1)使用go_rod库来动态的获取页面加载过程中所有事件的响应体(包括页面源代码)(2)保存所有响应事件的响应体和响应头(除.jpg、.svg等格式的文件) (3)去除_探测非ip下url_的功能,因为非本ip下url的数据大部分与本ip无关。若需要此功能,可以在AppendJs和AppendUrl下的代码中删除小段代码(我已注释)。 --- crawler/run.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawler/run.go b/crawler/run.go index 49edf4e..5340c0f 100644 --- a/crawler/run.go +++ b/crawler/run.go @@ -313,14 +313,14 @@ func extractBase(host, scheme, path, result string) (string, string, string, boo // 获取网页加载的事件的响应体 func rod_spider(u string, num int) { - //初始化浏览器 + // 初始化浏览器 launch := launcher.New().Headless(true).Set("test-type").Set("ignore-certificate-errors"). NoSandbox(true).Set("disable-gpu").Set("disable-plugins").Set("incognito"). Set("no-default-browser-check").Set("disable-dev-shm-usage"). Set("disable-plugins").MustLaunch() browser := rod.New().ControlURL(launch).MustConnect() - //添加关闭 + // 添加关闭 defer browser.Close() // 设置浏览器的证书错误处理,忽略所有证书错误 From 90d26fef8816b7b83c920d0ad1e54866ce6c9e48 Mon Sep 17 00:00:00 2001 From: LZH <128961083+DongfengMissile@users.noreply.github.com> Date: Sat, 21 Oct 2023 15:32:25 +0800 Subject: [PATCH 07/14] Update state.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 在作者代码基础上,(1)增加了请求头的选项(2)增加了存储响应头和响应体的功能 --- crawler/state.go | 68 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 56 insertions(+), 12 deletions(-) diff --git a/crawler/state.go b/crawler/state.go index 773946f..6034591 100644 --- a/crawler/state.go +++ b/crawler/state.go @@ -1,27 +1,34 @@ package crawler import ( - "github.com/pingc0y/URLFinder/cmd" - "github.com/pingc0y/URLFinder/config" - "github.com/pingc0y/URLFinder/mode" - "github.com/pingc0y/URLFinder/result" - "github.com/pingc0y/URLFinder/util" "io" "net/http" "net/url" "regexp" "strconv" "strings" + + "github.com/pingc0y/URLFinder/cmd" + "github.com/pingc0y/URLFinder/config" + "github.com/pingc0y/URLFinder/mode" + "github.com/pingc0y/URLFinder/result" + "github.com/pingc0y/URLFinder/util" ) // 检测js访问状态码 -func JsState(u string, i int, sou string) { +func JsState(u string, i int, sou string, rod_flag bool) { defer func() { config.Wg.Done() <-config.Jsch PrintProgress() }() + + // 首页动态加载的数据 已经存储 + if rod_flag { + return + } + if cmd.S == "" { result.ResultJs[i].Url = u return @@ -53,6 +60,12 @@ func JsState(u string, i int, sou string) { //增加header选项 request.Header.Set("User-Agent", util.GetUserAgent()) request.Header.Set("Accept", "*/*") + u_str, err := url.Parse(u) + if err != nil { + return + } + request.Header.Set("Referer", u_str.Scheme+"://"+u_str.Host) //#### + //加载yaml配置 if cmd.I { util.SetHeadersConfig(&request.Header) @@ -99,25 +112,41 @@ func JsState(u string, i int, sou string) { } else { length = len(dataBytes) } + + res_body := string(dataBytes) + res_headers := make(map[string]string, 0) + // 遍历响应头中的所有键值对 + for k, v := range response.Header { + // 如果值是一个切片,取第一个元素作为值,否则忽略该键值对 + if len(v) > 0 { + res_headers[k] = v[0] + } + } + config.Lock.Lock() if result.Redirect[ur.String()] { code = 302 redirect = response.Request.URL.String() } config.Lock.Unlock() - result.ResultJs[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Redirect: redirect} + result.ResultJs[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Redirect: redirect, ResponseHeaders: res_headers, ResponseBody: res_body} } else { result.ResultJs[i].Url = "" } } // 
检测url访问状态码 -func UrlState(u string, i int) { +func UrlState(u string, i int, rod_flag bool) { defer func() { config.Wg.Done() <-config.Urlch PrintProgress() }() + + // 首页动态加载的数据 已经存储 + if rod_flag { + return + } if cmd.S == "" { result.ResultUrl[i].Url = u return @@ -148,6 +177,11 @@ func UrlState(u string, i int) { //增加header选项 request.Header.Set("User-Agent", util.GetUserAgent()) request.Header.Set("Accept", "*/*") + u_str, err := url.Parse(u) + if err != nil { + return + } + request.Header.Set("Referer", u_str.Scheme+"://"+u_str.Host) //#### //加载yaml配置 if cmd.I { @@ -194,9 +228,19 @@ func UrlState(u string, i int) { } else { length = len(dataBytes) } - body := string(dataBytes) + + res_body := string(dataBytes) + res_headers := make(map[string]string, 0) + // 遍历响应头中的所有键值对 + for k, v := range response.Header { + // 如果值是一个切片,取第一个元素作为值,否则忽略该键值对 + if len(v) > 0 { + res_headers[k] = v[0] + } + } + re := regexp.MustCompile("<[tT]itle>(.*?)") - title := re.FindAllStringSubmatch(body, -1) + title := re.FindAllStringSubmatch(res_body, -1) config.Lock.Lock() if result.Redirect[ur.String()] { code = 302 @@ -205,9 +249,9 @@ func UrlState(u string, i int) { config.Lock.Unlock() if len(title) != 0 { - result.ResultUrl[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Title: title[0][1], Redirect: redirect} + result.ResultUrl[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Title: title[0][1], Redirect: redirect, ResponseHeaders: res_headers, ResponseBody: res_body} } else { - result.ResultUrl[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Redirect: redirect} + result.ResultUrl[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Redirect: redirect, ResponseHeaders: res_headers, ResponseBody: res_body} } } else { result.ResultUrl[i].Url = "" From 8cbaae1cc9d671bc58cdb2927f7d4eb987286830 Mon Sep 17 00:00:00 2001 From: LZH <128961083+DongfengMissile@users.noreply.github.com> Date: Sat, 21 Oct 2023 15:34:14 +0800 Subject: [PATCH 08/14] Update mode.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 在type Link struct中添加了ResponseHeaders和ResponseBody两个字段 --- mode/mode.go | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mode/mode.go b/mode/mode.go index 300e702..454c7b2 100644 --- a/mode/mode.go +++ b/mode/mode.go @@ -18,12 +18,14 @@ type Config struct { } type Link struct { - Url string - Status string - Size string - Title string - Redirect string - Source string + Url string + Status string + Size string + Title string + Redirect string + Source string + ResponseHeaders map[string]string + ResponseBody string } type Info struct { From 4e775e045bce63c693e3840b2e0e4d9c33dae4da Mon Sep 17 00:00:00 2001 From: LZH <128961083+Liiu04@users.noreply.github.com> Date: Sat, 21 Oct 2023 17:28:25 +0800 Subject: [PATCH 09/14] Update crawler.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 作者对于“base标签”的判断,仅是判断HTML代码中是否存在base标签。 但是,“base标签” 并不一定以标签的形式()出现在HTML代码之中,有一些时候 “base标签” 会以变量赋值的形式出现在HTML代码中,例如:(base: "../script/";baseUrl: './';BASEURL="/baseProj/";basePath = "../../";)。 我添加了一段代码,用来判断以变量赋值的形式存在的 “base标签” 。 此外,许多页面的请求头中若没有Referer选项会得不到正确的响应结果或者直接访问不了该页面,我在请求头的设置中添加了Referer选项。 --- crawler/crawler.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler/crawler.go b/crawler/crawler.go index 7d7f91f..a0113c2 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ 
-114,7 +114,7 @@ func Spider(u string, num int) { } else { path = "/" } - } else { //#### + } else { // 处理 "base 标签" re := regexp.MustCompile("(?i)base.{0,5}[:=]\\s*\"(.*?)\"") base := re.FindAllStringSubmatch(result, -1) if len(base) > 0 { From 98d1e9cdf5f60f959b98a909b0583df36e61ce8a Mon Sep 17 00:00:00 2001 From: LZH <128961083+Liiu04@users.noreply.github.com> Date: Sat, 21 Oct 2023 17:40:24 +0800 Subject: [PATCH 10/14] Update run.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 作者使用requests库,仅获取了页面的源代码,然后进行深度探测,这样获取的数据有限。 我在作者代码的基础上,做了如下修改: (1)使用go_rod库进行无界面渲染获取页面加载过程中所有事件的响应体(包括页面源代码) (2)保存所有响应事件的响应体和响应头(除.jpg、.svg等格式的文件) --- crawler/run.go | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/crawler/run.go b/crawler/run.go index 5340c0f..48c0aaa 100644 --- a/crawler/run.go +++ b/crawler/run.go @@ -28,7 +28,7 @@ import ( var client *http.Client -// 全局变量 存储body +// 用来存储响应体和响应头数据 var ResBodyMap = make(map[string]string, 0) var ResHeaderMap = make(map[string]proto.NetworkHeaders, 0) @@ -337,7 +337,6 @@ func rod_spider(u string, num int) { defer func() { err := page.Close() if err != nil { - // 处理错误 fmt.Println(err) } }() @@ -401,8 +400,6 @@ func rod_spider(u string, num int) { continue } - // fmt.Println("目标url及响应体信息: ", url, len(body)) - // 添加body数据 ResBodyMap[url] = body @@ -422,10 +419,8 @@ func rod_spider(u string, num int) { // 添加首页动态加载的数据 if strings.HasSuffix(url, ".js") || strings.Contains(url, ".js?") { result.ResultJs = append(result.ResultJs, mode.Link{Url: url, Status: strconv.Itoa(200), Size: strconv.Itoa(len(body)), ResponseHeaders: Res_header, ResponseBody: body}) - // AppendJs(url, u) } else { result.ResultUrl = append(result.ResultUrl, mode.Link{Url: url, Status: strconv.Itoa(200), Size: strconv.Itoa(len(body)), ResponseHeaders: Res_header, ResponseBody: body}) - // AppendUrl(url, u) } host, scheme, path := url_parse(url) @@ -535,13 +530,6 @@ func AppendJs(ur string, urltjs string) int { return 2 } - // 过滤其他ip #### - host1, _, _ := url_parse(ur) - host2, _, _ := url_parse(urltjs) - if host1 != host2 { - return 2 - } - for _, eachItem := range result.ResultJs { if eachItem.Url == ur { return 0 @@ -572,13 +560,6 @@ func AppendUrl(ur string, urlturl string) int { return 2 } - // 过滤其他ip #### - host1, _, _ := url_parse(ur) - host2, _, _ := url_parse(urlturl) - if host1 != host2 { - return 2 - } - for _, eachItem := range result.ResultUrl { if eachItem.Url == ur { return 0 From 1646bb9b474f0f1c8a5a1bdc48c9c38814a526cc Mon Sep 17 00:00:00 2001 From: LZH <128961083+Liiu04@users.noreply.github.com> Date: Sat, 21 Oct 2023 18:08:31 +0800 Subject: [PATCH 11/14] Update filter.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit url.QueryUnescape()这个函数的使用在某些时候会出现错误; 例如 str, _ = url.QueryUnescape("%s%s:%s/ABC/") 这样一行代码,输出结果会是空字符串,将导致许多url访问失败。 我添加了小段代码,做出了简单的判断。 --- crawler/filter.go | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/crawler/filter.go b/crawler/filter.go index e0b5ebe..e4e9f0d 100644 --- a/crawler/filter.go +++ b/crawler/filter.go @@ -1,10 +1,11 @@ package crawler import ( - "github.com/pingc0y/URLFinder/config" "net/url" "regexp" "strings" + + "github.com/pingc0y/URLFinder/config" ) // 过滤JS @@ -12,6 +13,11 @@ func jsFilter(str [][]string) [][]string { //对不需要的数据过滤 for i := range str { + if strings.Contains(str[i][1], "%s%s:%s") { + str[i][1] = strings.Replace(str[i][1], "%s%s:%s", "", -1) + str[i][0] = 
strings.Replace(str[i][0], "%s%s:%s", "", -1) + } + str[i][0], _ = url.QueryUnescape(str[i][1]) str[i][0] = strings.TrimSpace(str[i][0]) str[i][0] = strings.Replace(str[i][0], " ", "", -1) @@ -44,12 +50,18 @@ func urlFilter(str [][]string) [][]string { //对不需要的数据过滤 for i := range str { + + if strings.Contains(str[i][1], "%s%s:%s") { + str[i][1] = strings.Replace(str[i][1], "%s%s:%s", "", -1) + str[i][0] = strings.Replace(str[i][0], "%s%s:%s", "", -1) + } str[i][0], _ = url.QueryUnescape(str[i][1]) str[i][0] = strings.TrimSpace(str[i][0]) str[i][0] = strings.Replace(str[i][0], " ", "", -1) str[i][0] = strings.Replace(str[i][0], "\\/", "/", -1) str[i][0] = strings.Replace(str[i][0], "%3A", ":", -1) str[i][0] = strings.Replace(str[i][0], "%2F", "/", -1) + //去除不存在字符串和数字的url,判断为错误数据 match, _ := regexp.MatchString("[a-zA-Z]+|[0-9]+", str[i][0]) if !match { From 1b3b6c9df16956699db3a06707fe6576373df2b8 Mon Sep 17 00:00:00 2001 From: LZH <128961083+Liiu04@users.noreply.github.com> Date: Sat, 21 Oct 2023 18:13:28 +0800 Subject: [PATCH 12/14] Update filter.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit url.QueryUnescape()这个函数的使用在某些时候会出现错误; 例如 str, _ = url.QueryUnescape("%s%s:%s/ABC/") 这样一行代码,将出现错误,str是一个空字符串,这将导致许多url访问失败。 我添加了小段代码,做出了简单的判断。 --- crawler/filter.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crawler/filter.go b/crawler/filter.go index e4e9f0d..e27281d 100644 --- a/crawler/filter.go +++ b/crawler/filter.go @@ -11,8 +11,9 @@ import ( // 过滤JS func jsFilter(str [][]string) [][]string { - //对不需要的数据过滤 + // 对不需要的数据过滤 for i := range str { + // 针对QueryUnescape函数做出了简单的预先处理 if strings.Contains(str[i][1], "%s%s:%s") { str[i][1] = strings.Replace(str[i][1], "%s%s:%s", "", -1) str[i][0] = strings.Replace(str[i][0], "%s%s:%s", "", -1) @@ -50,7 +51,7 @@ func urlFilter(str [][]string) [][]string { //对不需要的数据过滤 for i := range str { - + // 针对QueryUnescape函数做出了简单的预先处理 if strings.Contains(str[i][1], "%s%s:%s") { str[i][1] = strings.Replace(str[i][1], "%s%s:%s", "", -1) str[i][0] = strings.Replace(str[i][0], "%s%s:%s", "", -1) From f8ee1af95799bd8001e4d29d21500a821e3e9677 Mon Sep 17 00:00:00 2001 From: LZH <128961083+Liiu04@users.noreply.github.com> Date: Sat, 21 Oct 2023 18:23:08 +0800 Subject: [PATCH 13/14] Update run.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 作者使用requests库,仅获取了页面的源代码,然后进行深度探测,这样获取的数据有限。 我在作者代码的基础上,做了如下修改: (1)使用go_rod库进行无界面渲染获取页面加载过程中所有事件的响应体(包括页面源代码) (2)保存所有响应事件的响应体和响应头(除.jpg、.svg等格式的文件) --- crawler/run.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler/run.go b/crawler/run.go index 48c0aaa..8e3f1ad 100644 --- a/crawler/run.go +++ b/crawler/run.go @@ -314,7 +314,7 @@ func extractBase(host, scheme, path, result string) (string, string, string, boo // 获取网页加载的事件的响应体 func rod_spider(u string, num int) { // 初始化浏览器 - launch := launcher.New().Headless(true).Set("test-type").Set("ignore-certificate-errors"). + launch := launcher.New().Headless(false).Set("test-type").Set("ignore-certificate-errors"). NoSandbox(true).Set("disable-gpu").Set("disable-plugins").Set("incognito"). Set("no-default-browser-check").Set("disable-dev-shm-usage"). 
Set("disable-plugins").MustLaunch() From e9faebe3c0f6f1ec3c40fcda1b040ceb55ac2e92 Mon Sep 17 00:00:00 2001 From: LZH <128961083+Liiu04@users.noreply.github.com> Date: Sat, 21 Oct 2023 18:46:38 +0800 Subject: [PATCH 14/14] Update run.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 作者使用requests库,仅获取了页面的源代码,然后进行深度探测,这样获取的数据有限。 我在作者代码的基础上,做了如下修改: (1)使用go_rod库进行浏览器渲染获取页面加载过程中所有事件的响应体(包括页面源代码) (2)保存所有响应事件的响应体和响应头(除.jpg、.svg等格式的文件) --- crawler/run.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler/run.go b/crawler/run.go index 8e3f1ad..d21d996 100644 --- a/crawler/run.go +++ b/crawler/run.go @@ -313,7 +313,7 @@ func extractBase(host, scheme, path, result string) (string, string, string, boo // 获取网页加载的事件的响应体 func rod_spider(u string, num int) { - // 初始化浏览器 + // 初始化浏览器,无头浏览器:Headless(true) launch := launcher.New().Headless(false).Set("test-type").Set("ignore-certificate-errors"). NoSandbox(true).Set("disable-gpu").Set("disable-plugins").Set("incognito"). Set("no-default-browser-check").Set("disable-dev-shm-usage").