From 368dc33d90fb9a3ee05ee79469c90e8edc4ec4b0 Mon Sep 17 00:00:00 2001
From: LZH <128961083+DongfengMissile@users.noreply.github.com>
Date: Sat, 21 Oct 2023 14:19:45 +0800
Subject: [PATCH 01/14] Update crawler.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The "base tag" does not always appear as an actual <base> tag in the HTML; sometimes it shows up as a variable assignment instead, for example: base: "../script/"; baseUrl: './'; BASEURL="/baseProj/"; basePath = "../../";.
I added a block of code that detects a "base tag" written as a variable assignment.
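
For reference, a minimal standalone sketch of this detection, using the same regex this patch adds below (the sample HTML fragment is made up):

    package main

    import (
        "fmt"
        "regexp"
    )

    func main() {
        // Hypothetical page fragment where the base path is set through a
        // variable assignment instead of a <base href="..."> tag.
        body := `<script>var baseUrl = "../script/";</script>`

        // Same pattern the patch adds: "base", up to five extra characters,
        // then ":" or "=" and a double-quoted value.
        re := regexp.MustCompile(`(?i)base.{0,5}[:=]\s*"(.*?)"`)
        if m := re.FindStringSubmatch(body); m != nil {
            fmt.Println("variable-assignment base found:", m[1]) // ../script/
        }
    }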
---
crawler/crawler.go | 110 ++++++++++++++++++++++++++++++++-------------
1 file changed, 79 insertions(+), 31 deletions(-)
diff --git a/crawler/crawler.go b/crawler/crawler.go
index 0a50caf..16f1d56 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -2,16 +2,15 @@ package crawler
import (
"compress/gzip"
- "fmt"
- "github.com/pingc0y/URLFinder/cmd"
- "github.com/pingc0y/URLFinder/config"
- "github.com/pingc0y/URLFinder/result"
- "github.com/pingc0y/URLFinder/util"
"io"
"net/http"
"net/url"
"regexp"
"strings"
+
+ "github.com/pingc0y/URLFinder/cmd"
+ "github.com/pingc0y/URLFinder/config"
+ "github.com/pingc0y/URLFinder/util"
)
// 蜘蛛抓取页面内容
@@ -25,7 +24,7 @@ func Spider(u string, num int) {
}()
config.Mux.Lock()
- fmt.Printf("\rStart %d Spider...", config.Progress)
+ // fmt.Printf("\rStart %d Spider...", config.Progress)
config.Progress++
config.Mux.Unlock()
//标记完成
@@ -53,6 +52,12 @@ func Spider(u string, num int) {
request.Header.Set("Accept-Encoding", "gzip") //使用gzip压缩传输数据让访问更快
request.Header.Set("User-Agent", util.GetUserAgent())
request.Header.Set("Accept", "*/*")
+ u_str, err := url.Parse(u)
+ if err != nil {
+ return
+ }
+ request.Header.Set("Referer", u_str.Scheme+"://"+u_str.Host) //####
+
//增加header选项
if cmd.C != "" {
request.Header.Set("Cookie", cmd.C)
@@ -62,27 +67,6 @@ func Spider(u string, num int) {
util.SetHeadersConfig(&request.Header)
}
- //处理返回结果
- //tr := &http.Transport{
- // TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
- //}
- //client = &http.Client{Timeout: time.Duration(cmd.TI) * time.Second,
- // Transport: tr,
- // CheckRedirect: func(req *http.Request, via []*http.Request) error {
- // if len(via) >= 10 {
- // return fmt.Errorf("Too many redirects")
- // }
- // if len(via) > 0 {
- // if via[0] != nil && via[0].URL != nil {
- // result.Redirect[via[0].URL.String()] = true
- // } else {
- // result.Redirect[req.URL.String()] = true
- // }
- //
- // }
- // return nil
- // },
- //}
response, err := client.Do(request)
if err != nil {
return
@@ -115,6 +99,7 @@ func Spider(u string, num int) {
host := response.Request.URL.Host
scheme := response.Request.URL.Scheme
source := scheme + "://" + host + path
+ judge_base := false //####
//处理base标签
re := regexp.MustCompile("base.{1,5}href.{1,5}(http.+?//[^\\s]+?)[\"'‘“]")
base := re.FindAllStringSubmatch(result, -1)
@@ -127,13 +112,76 @@ func Spider(u string, num int) {
} else {
path = "/"
}
+ } else { //####
+ re := regexp.MustCompile("(?i)base.{0,5}[:=]\\s*\"(.*?)\"")
+ base := re.FindAllStringSubmatch(result, -1)
+ if len(base) > 0 {
+ pattern := "[^.\\/\\w]"
+ re, _ := regexp.Compile(pattern)
+ // 检查字符串是否包含匹配的字符
+ result := re.MatchString(base[0][1])
+ if !result { // 字符串中没有其他字符
+ if len(base[0][1]) > 1 && base[0][1][:2] == "./" { // base路径从当前目录出发
+ judge_base = true
+ path = path[:strings.LastIndex(path, "/")] + base[0][1][1:]
+ } else if len(base[0][1]) > 2 && base[0][1][:3] == "../" { // base路径从上一级目录出发
+ judge_base = true
+ pattern := "^[./]+$"
+ matched, _ := regexp.MatchString(pattern, base[0][1])
+ if matched { // 仅处理的base路径中只有 ./ 的
+ path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+ } else {
+ find_str := ""
+ if strings.Contains(strings.TrimPrefix(base[0][1], "../"), "/") {
+ find_str = base[0][1][3 : strings.Index(strings.TrimPrefix(base[0][1], "../"), "/")+3]
+ } else {
+ find_str = base[0][1][3:]
+ }
+ if strings.Contains(path, find_str) {
+ path = path[:strings.Index(path, find_str)] + base[0][1][3:]
+ } else {
+ path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+ }
+ }
+ } else if len(base[0][1]) > 4 && strings.HasPrefix(base[0][1], "http") { //目录从http
+ judge_base = true
+ path = base[0][1]
+ } else if len(base[0][1]) > 0 {
+ judge_base = true
+ if base[0][1][0] == 47 { //base路径从根目录出发
+ path = base[0][1]
+ } else { //base路径未指明从哪路出发
+ find_str := ""
+ if strings.Contains(base[0][1], "/") {
+ find_str = base[0][1][:strings.Index(base[0][1], "/")]
+ } else {
+ find_str = base[0][1]
+ }
+ if strings.Contains(path, find_str) {
+ path = path[:strings.Index(path, find_str)] + base[0][1]
+ } else {
+ path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+ }
+ }
+ }
+ if !strings.HasSuffix(path, "/") {
+ path += "/"
+ }
+ }
+ }
}
+
is = false
<-config.Ch
//提取js
- jsFind(result, host, scheme, path, u, num)
+ jsFind(result, host, scheme, path, u, num, judge_base)
//提取url
- urlFind(result, host, scheme, path, u, num)
+ urlFind(result, host, scheme, path, u, num, judge_base)
+ // 防止base判断错误
+ if judge_base {
+ jsFind(result, host, scheme, path, u, num, false)
+ urlFind(result, host, scheme, path, u, num, false)
+ }
//提取信息
infoFind(result, source)
@@ -142,8 +190,8 @@ func Spider(u string, num int) {
// 打印Validate进度
func PrintProgress() {
config.Mux.Lock()
- num := len(result.ResultJs) + len(result.ResultUrl)
- fmt.Printf("\rValidate %.0f%%", float64(config.Progress+1)/float64(num+1)*100)
+ // num := len(result.ResultJs) + len(result.ResultUrl)
+ // fmt.Printf("\rValidate %.0f%%", float64(config.Progress+1)/float64(num+1)*100)
config.Progress++
config.Mux.Unlock()
}
From 5bfd2ff117ccd59733623a4ed3e20622a948e2b1 Mon Sep 17 00:00:00 2001
From: LZH <128961083+DongfengMissile@users.noreply.github.com>
Date: Sat, 21 Oct 2023 14:56:09 +0800
Subject: [PATCH 02/14] Update crawler.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The original code only checks whether an actual base tag exists in the HTML.
However, the "base tag" does not always appear as a tag; sometimes it shows up as a variable assignment in the HTML, for example: base: "../script/"; baseUrl: './'; BASEURL="/baseProj/"; basePath = "../../";.
I added a block of code that detects a "base tag" written as a variable assignment.
---
crawler/crawler.go | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/crawler/crawler.go b/crawler/crawler.go
index 16f1d56..7d7f91f 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -2,6 +2,7 @@ package crawler
import (
"compress/gzip"
+ "fmt"
"io"
"net/http"
"net/url"
@@ -10,6 +11,7 @@ import (
"github.com/pingc0y/URLFinder/cmd"
"github.com/pingc0y/URLFinder/config"
+ "github.com/pingc0y/URLFinder/result"
"github.com/pingc0y/URLFinder/util"
)
@@ -24,7 +26,7 @@ func Spider(u string, num int) {
}()
config.Mux.Lock()
- // fmt.Printf("\rStart %d Spider...", config.Progress)
+ fmt.Printf("\rStart %d Spider...", config.Progress)
config.Progress++
config.Mux.Unlock()
//标记完成
@@ -120,15 +122,15 @@ func Spider(u string, num int) {
re, _ := regexp.Compile(pattern)
// 检查字符串是否包含匹配的字符
result := re.MatchString(base[0][1])
- if !result { // 字符串中没有其他字符
- if len(base[0][1]) > 1 && base[0][1][:2] == "./" { // base路径从当前目录出发
+ if !result { // 字符串中没有其他特殊字符
+ if len(base[0][1]) > 1 && base[0][1][:2] == "./" { //base 路径从当前目录出发
judge_base = true
path = path[:strings.LastIndex(path, "/")] + base[0][1][1:]
- } else if len(base[0][1]) > 2 && base[0][1][:3] == "../" { // base路径从上一级目录出发
+ } else if len(base[0][1]) > 2 && base[0][1][:3] == "../" { //base 路径从上一级目录出发
judge_base = true
pattern := "^[./]+$"
matched, _ := regexp.MatchString(pattern, base[0][1])
- if matched { // 仅处理的base路径中只有 ./ 的
+ if matched { // 处理的 base 路径中只有 ./的
path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
} else {
find_str := ""
@@ -143,14 +145,14 @@ func Spider(u string, num int) {
path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
}
}
- } else if len(base[0][1]) > 4 && strings.HasPrefix(base[0][1], "http") { //目录从http
+ } else if len(base[0][1]) > 4 && strings.HasPrefix(base[0][1], "http") { // base标签包含协议
judge_base = true
path = base[0][1]
} else if len(base[0][1]) > 0 {
judge_base = true
- if base[0][1][0] == 47 { //base路径从根目录出发
+ if base[0][1][0] == 47 { //base 路径从根目录出发
path = base[0][1]
- } else { //base路径未指明从哪路出发
+ } else { //base 路径未指明从哪出发
find_str := ""
if strings.Contains(base[0][1], "/") {
find_str = base[0][1][:strings.Index(base[0][1], "/")]
@@ -190,8 +192,8 @@ func Spider(u string, num int) {
// 打印Validate进度
func PrintProgress() {
config.Mux.Lock()
- // num := len(result.ResultJs) + len(result.ResultUrl)
- // fmt.Printf("\rValidate %.0f%%", float64(config.Progress+1)/float64(num+1)*100)
+ num := len(result.ResultJs) + len(result.ResultUrl)
+ fmt.Printf("\rValidate %.0f%%", float64(config.Progress+1)/float64(num+1)*100)
config.Progress++
config.Mux.Unlock()
}
From 9038dff18baff642938198705f70a6c97882407c Mon Sep 17 00:00:00 2001
From: LZH <128961083+DongfengMissile@users.noreply.github.com>
Date: Sat, 21 Oct 2023 15:02:29 +0800
Subject: [PATCH 03/14] Update find.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Added code to the jsFind and urlFind functions that joins the detected base path with each extracted URL.
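
A rough, hypothetical illustration of the effect (the real functions take more parameters; the names below are made up):

    package main

    import "fmt"

    // joinBase sketches the branch added in this patch: when a
    // variable-assignment base path was detected (judge_base == true), the
    // extracted relative link is prefixed with that base path.
    func joinBase(basePath, link string, judgeBase bool) string {
        if judgeBase {
            return basePath + link
        }
        return link
    }

    func main() {
        fmt.Println(joinBase("/baseProj/", "static/app.js", true))  // /baseProj/static/app.js
        fmt.Println(joinBase("/baseProj/", "static/app.js", false)) // static/app.js
    }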
---
crawler/find.go | 23 ++++++++++++++++++-----
1 file changed, 18 insertions(+), 5 deletions(-)
diff --git a/crawler/find.go b/crawler/find.go
index 3e06e2e..85cf6a8 100644
--- a/crawler/find.go
+++ b/crawler/find.go
@@ -1,16 +1,17 @@
package crawler
import (
+ "regexp"
+ "strings"
+
"github.com/pingc0y/URLFinder/cmd"
"github.com/pingc0y/URLFinder/config"
"github.com/pingc0y/URLFinder/mode"
"github.com/pingc0y/URLFinder/result"
- "regexp"
- "strings"
)
// 分析内容中的js
-func jsFind(cont, host, scheme, path, source string, num int) {
+func jsFind(cont, host, scheme, path, source string, num int, judge_base bool) {
var cata string
care := regexp.MustCompile("/.*/{1}|/")
catae := care.FindAllString(path, -1)
@@ -31,6 +32,12 @@ func jsFind(cont, host, scheme, path, source string, num int) {
if js[0] == "" {
continue
}
+
+ // base标签的处理 ####
+ if judge_base {
+ js[0] = path + js[0]
+ }
+
if strings.HasPrefix(js[0], "https:") || strings.HasPrefix(js[0], "http:") {
switch AppendJs(js[0], source) {
case 0:
@@ -95,7 +102,7 @@ func jsFind(cont, host, scheme, path, source string, num int) {
}
// 分析内容中的url
-func urlFind(cont, host, scheme, path, source string, num int) {
+func urlFind(cont, host, scheme, path, source string, num int, judge_base bool) {
var cata string
care := regexp.MustCompile("/.*/{1}|/")
catae := care.FindAllString(path, -1)
@@ -104,6 +111,7 @@ func urlFind(cont, host, scheme, path, source string, num int) {
} else {
cata = catae[0]
}
+
host = scheme + "://" + host
//url匹配正则
@@ -111,7 +119,6 @@ func urlFind(cont, host, scheme, path, source string, num int) {
for _, re := range config.UrlFind {
reg := regexp.MustCompile(re)
urls := reg.FindAllStringSubmatch(cont, -1)
- //fmt.Println(urls)
urls = urlFilter(urls)
//循环提取url放到结果中
@@ -119,6 +126,12 @@ func urlFind(cont, host, scheme, path, source string, num int) {
if url[0] == "" {
continue
}
+
+ // base标签的处理 ####
+ if judge_base {
+ url[0] = path + url[0]
+ }
+
if strings.HasPrefix(url[0], "https:") || strings.HasPrefix(url[0], "http:") {
switch AppendUrl(url[0], source) {
case 0:
From 12175603f5f3cb9ab6851e1e028dd1948c738e7b Mon Sep 17 00:00:00 2001
From: LZH <128961083+DongfengMissile@users.noreply.github.com>
Date: Sat, 21 Oct 2023 15:16:17 +0800
Subject: [PATCH 04/14] Update run.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The original code only fetches the page source with a plain HTTP request and then probes deeper, which limits the data it can collect.
On top of the original code I made the following changes:
(1) Use the go_rod library to dynamically capture the response body of every event fired while the page loads (including the page source).
(2) Store the response body and response headers of every response event (except files such as .jpg and .svg).
(3) Removed the probing of URLs on other hosts, because that data is redundant and mostly unrelated to the target host. If you need that feature back, delete the small block of code (marked with a comment) under AppendJs and AppendUrl.
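
Below is a condensed, hypothetical sketch of the capture loop this patch builds (error handling, filtering, and result bookkeeping omitted; the target URL is a placeholder):

    package main

    import (
        "fmt"
        "sync"
        "time"

        "github.com/go-rod/rod"
        "github.com/go-rod/rod/lib/launcher"
        "github.com/go-rod/rod/lib/proto"
    )

    func main() {
        target := "https://example.com/" // placeholder target

        // Launch a headless browser and open the target page.
        ctl := launcher.New().Headless(true).MustLaunch()
        browser := rod.New().ControlURL(ctl).MustConnect()
        defer browser.Close()

        page := browser.MustPage(target).Timeout(40 * time.Second)

        // Record the URL and headers of every network response fired while the
        // page loads; the mutex guards against the event goroutine racing the
        // read loop below.
        var mu sync.Mutex
        seen := map[string]proto.NetworkHeaders{}
        go page.EachEvent(func(e *proto.NetworkResponseReceived) {
            mu.Lock()
            seen[e.Response.URL] = e.Response.Headers
            mu.Unlock()
        })()

        page.MustWaitLoad()
        page.WaitStable(2 * time.Second)

        // Pull each response body back out of the page cache.
        mu.Lock()
        defer mu.Unlock()
        for u := range seen {
            body, err := page.GetResource(u)
            if err != nil {
                continue
            }
            fmt.Println(u, len(body))
        }
    }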
---
crawler/run.go | 311 ++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 294 insertions(+), 17 deletions(-)
diff --git a/crawler/run.go b/crawler/run.go
index baa0cc2..fafb5ef 100644
--- a/crawler/run.go
+++ b/crawler/run.go
@@ -3,25 +3,35 @@ package crawler
import (
"bufio"
"crypto/tls"
+ "encoding/json"
"flag"
"fmt"
- "github.com/pingc0y/URLFinder/cmd"
- "github.com/pingc0y/URLFinder/config"
- "github.com/pingc0y/URLFinder/mode"
- "github.com/pingc0y/URLFinder/result"
- "github.com/pingc0y/URLFinder/util"
"io"
"net"
"net/http"
"net/url"
"os"
"regexp"
+ "strconv"
"strings"
"time"
+
+ "github.com/go-rod/rod"
+ "github.com/go-rod/rod/lib/launcher"
+ "github.com/go-rod/rod/lib/proto"
+ "github.com/pingc0y/URLFinder/cmd"
+ "github.com/pingc0y/URLFinder/config"
+ "github.com/pingc0y/URLFinder/mode"
+ "github.com/pingc0y/URLFinder/result"
+ "github.com/pingc0y/URLFinder/util"
)
var client *http.Client
+// 全局变量 存储body
+var ResBodyMap = make(map[string]string, 0)
+var ResHeaderMap = make(map[string]proto.NetworkHeaders, 0)
+
func load() {
if cmd.I {
@@ -177,13 +187,23 @@ func ValidateFF() {
for i, s := range result.ResultJs {
config.Wg.Add(1)
config.Jsch <- 1
- go JsState(s.Url, i, result.ResultJs[i].Source)
+ // 判断响应数据是否已经在页面加载过程存储
+ rod_flag := false
+ if len(ResBodyMap[s.Url]) != 0 {
+ rod_flag = true
+ }
+ go JsState(s.Url, i, result.ResultJs[i].Source, rod_flag)
}
//验证URL状态
for i, s := range result.ResultUrl {
config.Wg.Add(1)
config.Urlch <- 1
- go UrlState(s.Url, i)
+ // 判断响应数据是否已经在页面加载过程存储
+ rod_flag := false
+ if len(ResBodyMap[s.Url]) != 0 {
+ rod_flag = true
+ }
+ go UrlState(s.Url, i, rod_flag)
}
config.Wg.Wait()
@@ -199,14 +219,245 @@ func ValidateFF() {
AddSource()
}
+// 定义函数 url_parse,参数是一个字符串 u,返回值是三个字符串
+func url_parse(u string) (string, string, string) {
+ // 解析 u 为一个 URL 对象
+ u_str, err := url.Parse(u)
+ // 如果解析出错,就返回空字符串
+ if err != nil {
+ return "", "", ""
+ }
+ // 获取 URL 对象的 host、scheme、path 属性
+ host := u_str.Host
+ scheme := u_str.Scheme
+ path := u_str.Path
+ // 返回这三个属性的值
+ return host, scheme, path
+}
+
+// 提取响应体中的 Base 标签信息
+func extractBase(host, scheme, path, result string) (string, string, string, bool) {
+ judge_base := false
+ //处理base标签
+ re := regexp.MustCompile("base.{1,5}href.{1,5}(http.+?//[^\\s]+?)[\"'‘“]")
+ base := re.FindAllStringSubmatch(result, -1)
+ if len(base) > 0 {
+ host = regexp.MustCompile("http.*?//([^/]+)").FindAllStringSubmatch(base[0][1], -1)[0][1]
+ scheme = regexp.MustCompile("(http.*?)://").FindAllStringSubmatch(base[0][1], -1)[0][1]
+ paths := regexp.MustCompile("http.*?//.*?(/.*)").FindAllStringSubmatch(base[0][1], -1)
+ if len(paths) > 0 {
+ path = paths[0][1]
+ } else {
+ path = "/"
+ }
+ } else { //####
+ re := regexp.MustCompile("(?i)base.{0,5}[:=]\\s*\"(.*?)\"")
+ base := re.FindAllStringSubmatch(result, -1)
+ if len(base) > 0 {
+ pattern := "[^.\\/\\w]"
+ re, _ := regexp.Compile(pattern)
+ // 检查字符串是否包含匹配的字符
+ result := re.MatchString(base[0][1])
+ if !result { // 字符串中没有其他特殊字符
+ if len(base[0][1]) > 1 && base[0][1][:2] == "./" { // base 路径从当前目录出发
+ judge_base = true
+ path = path[:strings.LastIndex(path, "/")] + base[0][1][1:]
+ } else if len(base[0][1]) > 2 && base[0][1][:3] == "../" { // base 路径从上一级目录出发
+ judge_base = true
+ pattern := "^[./]+$"
+ matched, _ := regexp.MatchString(pattern, base[0][1])
+ if matched { // 处理的 base 路径中只有 ./的
+ path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+ } else {
+ find_str := ""
+ if strings.Contains(strings.TrimPrefix(base[0][1], "../"), "/") {
+ find_str = base[0][1][3 : strings.Index(strings.TrimPrefix(base[0][1], "../"), "/")+3]
+ } else {
+ find_str = base[0][1][3:]
+ }
+ if strings.Contains(path, find_str) {
+ path = path[:strings.Index(path, find_str)] + base[0][1][3:]
+ } else {
+ path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+ }
+ }
+ } else if len(base[0][1]) > 4 && strings.HasPrefix(base[0][1], "http") { // base 标签包含协议
+ judge_base = true
+ path = base[0][1]
+ } else if len(base[0][1]) > 0 {
+ judge_base = true
+ if base[0][1][0] == 47 { //base 路径从根目录出发
+ path = base[0][1]
+ } else { //base 路径未指明从哪路出发
+ find_str := ""
+ if strings.Contains(base[0][1], "/") {
+ find_str = base[0][1][:strings.Index(base[0][1], "/")]
+ } else {
+ find_str = base[0][1]
+ }
+ if strings.Contains(path, find_str) {
+ path = path[:strings.Index(path, find_str)] + base[0][1]
+ } else {
+ path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+ }
+ }
+ }
+ if !strings.HasSuffix(path, "/") {
+ path += "/"
+ }
+ }
+ }
+ }
+ return host, scheme, path, judge_base
+}
+
+// 获取网页加载的事件的响应体
+func rod_spider(u string, num int) {
+ //初始化浏览器
+ launch := launcher.New().Headless(true).Set("test-type").Set("ignore-certificate-errors").
+ NoSandbox(true).Set("disable-gpu").Set("disable-plugins").Set("incognito").
+ Set("no-default-browser-check").Set("disable-dev-shm-usage").
+ Set("disable-plugins").MustLaunch()
+ browser := rod.New().ControlURL(launch).MustConnect()
+
+ //添加关闭
+ defer browser.Close()
+
+ // 设置浏览器的证书错误处理,忽略所有证书错误
+ browser.MustIgnoreCertErrors(true)
+
+ // 设置浏览器打开的页面
+ pageTarget := proto.TargetCreateTarget{URL: u}
+ page, err := browser.Page(pageTarget)
+ if err != nil {
+ fmt.Println(err)
+ }
+
+ // 在最后关闭页面
+ defer func() {
+ err := page.Close()
+ if err != nil {
+ // 处理错误
+ fmt.Println(err)
+ }
+ }()
+
+ // 设置页面的超时时间为 40 秒
+ page = page.Timeout(40 * time.Second)
+
+ // 创建一个空的 map,键是 proto.NetworkRequestID 类型,值是 string 类型
+ requestMap := make(map[string]string, 0)
+
+ // 使用 go 语句开启一个协程,在协程中处理页面的一些事件
+ go page.EachEvent(func(e *proto.PageJavascriptDialogOpening) {
+ // 处理 JavaScript 对话框
+ _ = proto.PageHandleJavaScriptDialog{Accept: true, PromptText: ""}.Call(page)
+ }, func(e *proto.NetworkResponseReceived) {
+ // 获取请求的 ID 和 URL
+ ResponseURL := e.Response.URL
+ // fmt.Println(e.Response.URL, e.RequestID)
+ ResHeaderMap[ResponseURL] = e.Response.Headers
+
+ // 在 requestMap 中填充数据
+ requestMap[ResponseURL] = ""
+
+ })()
+
+ // 等待页面加载完成,并处理可能出现的错误
+ pageLoadErr := page.WaitLoad()
+ if pageLoadErr != nil {
+ fmt.Println(pageLoadErr)
+ }
+
+ // 等待页面的 DOM 结构稳定
+ page.WaitStable(2 * time.Second)
+
+ // 打印页面源码
+ htmlStr, err := page.HTML()
+ if err != nil {
+ fmt.Println(err)
+ }
+
+ for url, _ := range requestMap {
+ // 调用 page.GetResource 方法来获取响应体
+ ResponseBody, _ := page.GetResource(url)
+ requestMap[url] = string(ResponseBody)
+ }
+
+ // 存储页面源码
+ requestMap[u] = string(htmlStr)
+ // fmt.Println(requestMap[u])
+
+ // 遍历响应体,提取 Base 标签、提取 js 、提取 url 、
+ for url, body := range requestMap {
+ // 判断响应体是否为空
+ if len(body) == 0 {
+ continue
+ }
+
+ // 遍历 BodyFiler 切片中的每个元素
+ re := regexp.MustCompile("\\.jpeg\\?|\\.jpg\\?|\\.png\\?|.gif\\?|www\\.w3\\.org|example\\.com|.*,$|.*\\.jpeg$|.*\\.jpg$|.*\\.png$|.*\\.gif$|.*\\.ico$|.*\\.svg$|.*\\.vue$|.*\\.ts$")
+ if re.MatchString(url) {
+ continue
+ }
+
+ // fmt.Println("目标url及响应体信息: ", url, len(body))
+
+ // 添加body数据
+ ResBodyMap[url] = body
+
+ // 将响应头数据转换成map存储
+ Res_header := make(map[string]string, 0)
+ if len(ResHeaderMap[url]) != 0 {
+ data, err := json.Marshal(ResHeaderMap[url])
+ if err != nil {
+ fmt.Println(err)
+ }
+ err = json.Unmarshal(data, &Res_header)
+ if err != nil {
+ fmt.Println(err)
+ }
+ }
+
+ // 添加首页动态加载的数据
+ if strings.HasSuffix(url, ".js") || strings.Contains(url, ".js?") {
+ result.ResultJs = append(result.ResultJs, mode.Link{Url: url, Status: strconv.Itoa(200), Size: strconv.Itoa(len(body)), ResponseHeaders: Res_header, ResponseBody: body})
+ // AppendJs(url, u)
+ } else {
+ result.ResultUrl = append(result.ResultUrl, mode.Link{Url: url, Status: strconv.Itoa(200), Size: strconv.Itoa(len(body)), ResponseHeaders: Res_header, ResponseBody: body})
+ // AppendUrl(url, u)
+ }
+
+ host, scheme, path := url_parse(url)
+
+ judge_base := false
+ host, scheme, path, judge_base = extractBase(host, scheme, path, body)
+
+ //提取js
+ jsFind(body, host, scheme, path, u, num, judge_base)
+ //提取url
+ urlFind(body, host, scheme, path, u, num, judge_base)
+ // 防止base判断错误
+ if judge_base {
+ jsFind(body, host, scheme, path, u, num, false)
+ urlFind(body, host, scheme, path, u, num, false)
+ }
+
+ }
+
+}
+
func start(u string) {
fmt.Println("Target URL: " + u)
- config.Wg.Add(1)
- config.Ch <- 1
- go Spider(u, 1)
- config.Wg.Wait()
- config.Progress = 1
- fmt.Printf("\r\nSpider OK \n")
+
+ // config.Wg.Add(1)
+ // config.Ch <- 1
+ // go Spider(u, 1) // ###
+ rod_spider(u, 1)
+ // config.Wg.Wait()
+ // config.Progress = 1
+
+ fmt.Printf("\r\nRod_Spider OK \n")
result.ResultUrl = util.RemoveRepeatElement(result.ResultUrl)
result.ResultJs = util.RemoveRepeatElement(result.ResultJs)
if cmd.S != "" {
@@ -217,13 +468,23 @@ func start(u string) {
for i, s := range result.ResultJs {
config.Wg.Add(1)
config.Jsch <- 1
- go JsState(s.Url, i, result.ResultJs[i].Source)
+ // 判断响应数据是否已经在页面加载过程存储
+ rod_flag := false
+ if len(ResBodyMap[s.Url]) != 0 {
+ rod_flag = true
+ }
+ go JsState(s.Url, i, result.ResultJs[i].Source, rod_flag)
}
//验证URL状态
for i, s := range result.ResultUrl {
config.Wg.Add(1)
config.Urlch <- 1
- go UrlState(s.Url, i)
+ // 判断响应数据是否已经在页面加载过程存储
+ rod_flag := false
+ if len(ResBodyMap[s.Url]) != 0 {
+ rod_flag = true
+ }
+ go UrlState(s.Url, i, rod_flag)
}
config.Wg.Wait()
@@ -241,7 +502,7 @@ func start(u string) {
func Res() {
if len(result.ResultJs) == 0 && len(result.ResultUrl) == 0 {
- fmt.Println("未获取到数据")
+ fmt.Fprintln(os.Stdout, cmd.U, "Data not captured")
return
}
//打印还是输出
@@ -273,11 +534,20 @@ func AppendJs(ur string, urltjs string) int {
if err != nil {
return 2
}
+
+ // 过滤其他ip ####
+ host1, _, _ := url_parse(ur)
+ host2, _, _ := url_parse(urltjs)
+ if host1 != host2 {
+ return 2
+ }
+
for _, eachItem := range result.ResultJs {
if eachItem.Url == ur {
return 0
}
}
+
result.ResultJs = append(result.ResultJs, mode.Link{Url: ur})
if strings.HasSuffix(urltjs, ".js") {
result.Jsinurl[ur] = result.Jsinurl[urltjs]
@@ -301,6 +571,14 @@ func AppendUrl(ur string, urlturl string) int {
if err != nil {
return 2
}
+
+ // 过滤其他ip ####
+ host1, _, _ := url_parse(ur)
+ host2, _, _ := url_parse(urlturl)
+ if host1 != host2 {
+ return 2
+ }
+
for _, eachItem := range result.ResultUrl {
if eachItem.Url == ur {
return 0
@@ -383,5 +661,4 @@ func Initialization() {
result.Jstourl = make(map[string]string)
result.Urltourl = make(map[string]string)
result.Redirect = make(map[string]bool)
-
}
From bdb85cd457d95b77b648e0666f10550e67d0722c Mon Sep 17 00:00:00 2001
From: LZH <128961083+DongfengMissile@users.noreply.github.com>
Date: Sat, 21 Oct 2023 15:19:50 +0800
Subject: [PATCH 05/14] Update run.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The original code only fetches the page source with a plain HTTP request and then probes deeper, which limits the data it can collect.
On top of the original code I made the following changes: (1) use the go_rod library to dynamically capture the response body of every event fired while the page loads (including the page source); (2) store the response body and response headers of every response event (except files such as .jpg and .svg); (3) removed the probing of URLs on other hosts, because that data is redundant and mostly unrelated to the target host. If you need that feature back, delete the small block of code (marked with a comment) under AppendJs and AppendUrl.
---
crawler/run.go | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/crawler/run.go b/crawler/run.go
index fafb5ef..49edf4e 100644
--- a/crawler/run.go
+++ b/crawler/run.go
@@ -238,7 +238,7 @@ func url_parse(u string) (string, string, string) {
// 提取响应体中的 Base 标签信息
func extractBase(host, scheme, path, result string) (string, string, string, bool) {
judge_base := false
- //处理base标签
+ // 处理base标签
re := regexp.MustCompile("base.{1,5}href.{1,5}(http.+?//[^\\s]+?)[\"'‘“]")
base := re.FindAllStringSubmatch(result, -1)
if len(base) > 0 {
@@ -250,7 +250,7 @@ func extractBase(host, scheme, path, result string) (string, string, string, boo
} else {
path = "/"
}
- } else { //####
+ } else { // 处理 "base 标签"
re := regexp.MustCompile("(?i)base.{0,5}[:=]\\s*\"(.*?)\"")
base := re.FindAllStringSubmatch(result, -1)
if len(base) > 0 {
From b02ec1cdec92c1041f87f2e11250e0b2b89b3463 Mon Sep 17 00:00:00 2001
From: LZH <128961083+DongfengMissile@users.noreply.github.com>
Date: Sat, 21 Oct 2023 15:23:02 +0800
Subject: [PATCH 06/14] Update run.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The original code only fetches the page source with a plain HTTP request and then probes deeper, which limits the data it can collect.
On top of the original code I made the following changes: (1) use the go_rod library to dynamically capture the response body of every event fired while the page loads (including the page source); (2) store the response body and response headers of every response event (except files such as .jpg and .svg); (3) removed the probing of URLs on other hosts, since most of that data is unrelated to the target host. If you need that feature back, delete the small block of code (marked with a comment) under AppendJs and AppendUrl.
---
crawler/run.go | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/crawler/run.go b/crawler/run.go
index 49edf4e..5340c0f 100644
--- a/crawler/run.go
+++ b/crawler/run.go
@@ -313,14 +313,14 @@ func extractBase(host, scheme, path, result string) (string, string, string, boo
// 获取网页加载的事件的响应体
func rod_spider(u string, num int) {
- //初始化浏览器
+ // 初始化浏览器
launch := launcher.New().Headless(true).Set("test-type").Set("ignore-certificate-errors").
NoSandbox(true).Set("disable-gpu").Set("disable-plugins").Set("incognito").
Set("no-default-browser-check").Set("disable-dev-shm-usage").
Set("disable-plugins").MustLaunch()
browser := rod.New().ControlURL(launch).MustConnect()
- //添加关闭
+ // 添加关闭
defer browser.Close()
// 设置浏览器的证书错误处理,忽略所有证书错误
From 90d26fef8816b7b83c920d0ad1e54866ce6c9e48 Mon Sep 17 00:00:00 2001
From: LZH <128961083+DongfengMissile@users.noreply.github.com>
Date: Sat, 21 Oct 2023 15:32:25 +0800
Subject: [PATCH 07/14] Update state.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
On top of the original code: (1) added a Referer option to the request headers; (2) added storage of the response headers and response body.
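
For instance, a small sketch of how the response headers get flattened into the map[string]string that is stored on the result (the helper name here is made up):

    package main

    import (
        "fmt"
        "net/http"
    )

    // flattenHeader mirrors what this patch does with response.Header: keep
    // only the first value of each header so it fits a map[string]string field.
    func flattenHeader(h http.Header) map[string]string {
        out := make(map[string]string, len(h))
        for k, v := range h {
            if len(v) > 0 {
                out[k] = v[0]
            }
        }
        return out
    }

    func main() {
        h := http.Header{}
        h.Add("Content-Type", "application/javascript")
        h.Add("Set-Cookie", "a=1")
        h.Add("Set-Cookie", "b=2") // only "a=1" survives the flattening
        fmt.Println(flattenHeader(h))
    }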
---
crawler/state.go | 68 +++++++++++++++++++++++++++++++++++++++---------
1 file changed, 56 insertions(+), 12 deletions(-)
diff --git a/crawler/state.go b/crawler/state.go
index 773946f..6034591 100644
--- a/crawler/state.go
+++ b/crawler/state.go
@@ -1,27 +1,34 @@
package crawler
import (
- "github.com/pingc0y/URLFinder/cmd"
- "github.com/pingc0y/URLFinder/config"
- "github.com/pingc0y/URLFinder/mode"
- "github.com/pingc0y/URLFinder/result"
- "github.com/pingc0y/URLFinder/util"
"io"
"net/http"
"net/url"
"regexp"
"strconv"
"strings"
+
+ "github.com/pingc0y/URLFinder/cmd"
+ "github.com/pingc0y/URLFinder/config"
+ "github.com/pingc0y/URLFinder/mode"
+ "github.com/pingc0y/URLFinder/result"
+ "github.com/pingc0y/URLFinder/util"
)
// 检测js访问状态码
-func JsState(u string, i int, sou string) {
+func JsState(u string, i int, sou string, rod_flag bool) {
defer func() {
config.Wg.Done()
<-config.Jsch
PrintProgress()
}()
+
+ // 首页动态加载的数据 已经存储
+ if rod_flag {
+ return
+ }
+
if cmd.S == "" {
result.ResultJs[i].Url = u
return
@@ -53,6 +60,12 @@ func JsState(u string, i int, sou string) {
//增加header选项
request.Header.Set("User-Agent", util.GetUserAgent())
request.Header.Set("Accept", "*/*")
+ u_str, err := url.Parse(u)
+ if err != nil {
+ return
+ }
+ request.Header.Set("Referer", u_str.Scheme+"://"+u_str.Host) //####
+
//加载yaml配置
if cmd.I {
util.SetHeadersConfig(&request.Header)
@@ -99,25 +112,41 @@ func JsState(u string, i int, sou string) {
} else {
length = len(dataBytes)
}
+
+ res_body := string(dataBytes)
+ res_headers := make(map[string]string, 0)
+ // 遍历响应头中的所有键值对
+ for k, v := range response.Header {
+ // 如果值是一个切片,取第一个元素作为值,否则忽略该键值对
+ if len(v) > 0 {
+ res_headers[k] = v[0]
+ }
+ }
+
config.Lock.Lock()
if result.Redirect[ur.String()] {
code = 302
redirect = response.Request.URL.String()
}
config.Lock.Unlock()
- result.ResultJs[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Redirect: redirect}
+ result.ResultJs[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Redirect: redirect, ResponseHeaders: res_headers, ResponseBody: res_body}
} else {
result.ResultJs[i].Url = ""
}
}
// 检测url访问状态码
-func UrlState(u string, i int) {
+func UrlState(u string, i int, rod_flag bool) {
defer func() {
config.Wg.Done()
<-config.Urlch
PrintProgress()
}()
+
+ // 首页动态加载的数据 已经存储
+ if rod_flag {
+ return
+ }
if cmd.S == "" {
result.ResultUrl[i].Url = u
return
@@ -148,6 +177,11 @@ func UrlState(u string, i int) {
//增加header选项
request.Header.Set("User-Agent", util.GetUserAgent())
request.Header.Set("Accept", "*/*")
+ u_str, err := url.Parse(u)
+ if err != nil {
+ return
+ }
+ request.Header.Set("Referer", u_str.Scheme+"://"+u_str.Host) //####
//加载yaml配置
if cmd.I {
@@ -194,9 +228,19 @@ func UrlState(u string, i int) {
} else {
length = len(dataBytes)
}
- body := string(dataBytes)
+
+ res_body := string(dataBytes)
+ res_headers := make(map[string]string, 0)
+ // 遍历响应头中的所有键值对
+ for k, v := range response.Header {
+ // 如果值是一个切片,取第一个元素作为值,否则忽略该键值对
+ if len(v) > 0 {
+ res_headers[k] = v[0]
+ }
+ }
+
re := regexp.MustCompile("<[tT]itle>(.*?)[tT]itle>")
- title := re.FindAllStringSubmatch(body, -1)
+ title := re.FindAllStringSubmatch(res_body, -1)
config.Lock.Lock()
if result.Redirect[ur.String()] {
code = 302
@@ -205,9 +249,9 @@ func UrlState(u string, i int) {
config.Lock.Unlock()
if len(title) != 0 {
- result.ResultUrl[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Title: title[0][1], Redirect: redirect}
+ result.ResultUrl[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Title: title[0][1], Redirect: redirect, ResponseHeaders: res_headers, ResponseBody: res_body}
} else {
- result.ResultUrl[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Redirect: redirect}
+ result.ResultUrl[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Redirect: redirect, ResponseHeaders: res_headers, ResponseBody: res_body}
}
} else {
result.ResultUrl[i].Url = ""
From 8cbaae1cc9d671bc58cdb2927f7d4eb987286830 Mon Sep 17 00:00:00 2001
From: LZH <128961083+DongfengMissile@users.noreply.github.com>
Date: Sat, 21 Oct 2023 15:34:14 +0800
Subject: [PATCH 08/14] Update mode.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Added two fields, ResponseHeaders and ResponseBody, to the Link struct.
---
mode/mode.go | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/mode/mode.go b/mode/mode.go
index 300e702..454c7b2 100644
--- a/mode/mode.go
+++ b/mode/mode.go
@@ -18,12 +18,14 @@ type Config struct {
}
type Link struct {
- Url string
- Status string
- Size string
- Title string
- Redirect string
- Source string
+ Url string
+ Status string
+ Size string
+ Title string
+ Redirect string
+ Source string
+ ResponseHeaders map[string]string
+ ResponseBody string
}
type Info struct {
From 4e775e045bce63c693e3840b2e0e4d9c33dae4da Mon Sep 17 00:00:00 2001
From: LZH <128961083+Liiu04@users.noreply.github.com>
Date: Sat, 21 Oct 2023 17:28:25 +0800
Subject: [PATCH 09/14] Update crawler.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The original code only checks whether an actual base tag exists in the HTML.
However, the "base tag" does not always appear as a tag; sometimes it shows up as a variable assignment in the HTML, for example: base: "../script/"; baseUrl: './'; BASEURL="/baseProj/"; basePath = "../../";.
I added a block of code that detects a "base tag" written as a variable assignment.
In addition, many pages return an incorrect response, or cannot be reached at all, when the request lacks a Referer header, so I added a Referer option to the request headers.
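
A minimal sketch of the Referer handling this patch introduces, assuming a placeholder target URL:

    package main

    import (
        "fmt"
        "net/http"
        "net/url"
    )

    func main() {
        target := "https://example.com/app/index.html" // placeholder URL

        req, err := http.NewRequest("GET", target, nil)
        if err != nil {
            return
        }

        // Derive the Referer from the target itself, as the patch does:
        // scheme://host of the URL being requested.
        u, err := url.Parse(target)
        if err != nil {
            return
        }
        req.Header.Set("Referer", u.Scheme+"://"+u.Host)

        fmt.Println(req.Header.Get("Referer")) // https://example.com
    }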
---
crawler/crawler.go | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/crawler/crawler.go b/crawler/crawler.go
index 7d7f91f..a0113c2 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -114,7 +114,7 @@ func Spider(u string, num int) {
} else {
path = "/"
}
- } else { //####
+ } else { // 处理 "base 标签"
re := regexp.MustCompile("(?i)base.{0,5}[:=]\\s*\"(.*?)\"")
base := re.FindAllStringSubmatch(result, -1)
if len(base) > 0 {
From 98d1e9cdf5f60f959b98a909b0583df36e61ce8a Mon Sep 17 00:00:00 2001
From: LZH <128961083+Liiu04@users.noreply.github.com>
Date: Sat, 21 Oct 2023 17:40:24 +0800
Subject: [PATCH 10/14] Update run.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The original code only fetches the page source with a plain HTTP request and then probes deeper, which limits the data it can collect.
On top of the original code I made the following changes:
(1) Use the go_rod library for headless rendering to capture the response body of every event fired while the page loads (including the page source).
(2) Store the response body and response headers of every response event (except files such as .jpg and .svg).
---
crawler/run.go | 21 +--------------------
1 file changed, 1 insertion(+), 20 deletions(-)
diff --git a/crawler/run.go b/crawler/run.go
index 5340c0f..48c0aaa 100644
--- a/crawler/run.go
+++ b/crawler/run.go
@@ -28,7 +28,7 @@ import (
var client *http.Client
-// 全局变量 存储body
+// 用来存储响应体和响应头数据
var ResBodyMap = make(map[string]string, 0)
var ResHeaderMap = make(map[string]proto.NetworkHeaders, 0)
@@ -337,7 +337,6 @@ func rod_spider(u string, num int) {
defer func() {
err := page.Close()
if err != nil {
- // 处理错误
fmt.Println(err)
}
}()
@@ -401,8 +400,6 @@ func rod_spider(u string, num int) {
continue
}
- // fmt.Println("目标url及响应体信息: ", url, len(body))
-
// 添加body数据
ResBodyMap[url] = body
@@ -422,10 +419,8 @@ func rod_spider(u string, num int) {
// 添加首页动态加载的数据
if strings.HasSuffix(url, ".js") || strings.Contains(url, ".js?") {
result.ResultJs = append(result.ResultJs, mode.Link{Url: url, Status: strconv.Itoa(200), Size: strconv.Itoa(len(body)), ResponseHeaders: Res_header, ResponseBody: body})
- // AppendJs(url, u)
} else {
result.ResultUrl = append(result.ResultUrl, mode.Link{Url: url, Status: strconv.Itoa(200), Size: strconv.Itoa(len(body)), ResponseHeaders: Res_header, ResponseBody: body})
- // AppendUrl(url, u)
}
host, scheme, path := url_parse(url)
@@ -535,13 +530,6 @@ func AppendJs(ur string, urltjs string) int {
return 2
}
- // 过滤其他ip ####
- host1, _, _ := url_parse(ur)
- host2, _, _ := url_parse(urltjs)
- if host1 != host2 {
- return 2
- }
-
for _, eachItem := range result.ResultJs {
if eachItem.Url == ur {
return 0
@@ -572,13 +560,6 @@ func AppendUrl(ur string, urlturl string) int {
return 2
}
- // 过滤其他ip ####
- host1, _, _ := url_parse(ur)
- host2, _, _ := url_parse(urlturl)
- if host1 != host2 {
- return 2
- }
-
for _, eachItem := range result.ResultUrl {
if eachItem.Url == ur {
return 0
From 1646bb9b474f0f1c8a5a1bdc48c9c38814a526cc Mon Sep 17 00:00:00 2001
From: LZH <128961083+Liiu04@users.noreply.github.com>
Date: Sat, 21 Oct 2023 18:08:31 +0800
Subject: [PATCH 11/14] Update filter.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Calling url.QueryUnescape() can fail in some cases;
for example, str, _ = url.QueryUnescape("%s%s:%s/ABC/") yields an empty string, which causes many URLs to become unreachable.
I added a small check to handle this.
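
A small demonstration of the failure and of the pre-processing this patch adds:

    package main

    import (
        "fmt"
        "net/url"
        "strings"
    )

    func main() {
        raw := `%s%s:%s/ABC/` // format placeholders left over in scraped JS

        // QueryUnescape fails here: "%s" is not a valid percent-escape, so the
        // function returns an error and an empty string.
        s, err := url.QueryUnescape(raw)
        fmt.Printf("%q %v\n", s, err) // "" invalid URL escape "%s%"

        // Stripping the placeholder sequence first, as the patch does, lets
        // the unescape succeed.
        cleaned := strings.Replace(raw, "%s%s:%s", "", -1)
        s, err = url.QueryUnescape(cleaned)
        fmt.Printf("%q %v\n", s, err) // "/ABC/" <nil>
    }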
---
crawler/filter.go | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/crawler/filter.go b/crawler/filter.go
index e0b5ebe..e4e9f0d 100644
--- a/crawler/filter.go
+++ b/crawler/filter.go
@@ -1,10 +1,11 @@
package crawler
import (
- "github.com/pingc0y/URLFinder/config"
"net/url"
"regexp"
"strings"
+
+ "github.com/pingc0y/URLFinder/config"
)
// 过滤JS
@@ -12,6 +13,11 @@ func jsFilter(str [][]string) [][]string {
//对不需要的数据过滤
for i := range str {
+ if strings.Contains(str[i][1], "%s%s:%s") {
+ str[i][1] = strings.Replace(str[i][1], "%s%s:%s", "", -1)
+ str[i][0] = strings.Replace(str[i][0], "%s%s:%s", "", -1)
+ }
+
str[i][0], _ = url.QueryUnescape(str[i][1])
str[i][0] = strings.TrimSpace(str[i][0])
str[i][0] = strings.Replace(str[i][0], " ", "", -1)
@@ -44,12 +50,18 @@ func urlFilter(str [][]string) [][]string {
//对不需要的数据过滤
for i := range str {
+
+ if strings.Contains(str[i][1], "%s%s:%s") {
+ str[i][1] = strings.Replace(str[i][1], "%s%s:%s", "", -1)
+ str[i][0] = strings.Replace(str[i][0], "%s%s:%s", "", -1)
+ }
str[i][0], _ = url.QueryUnescape(str[i][1])
str[i][0] = strings.TrimSpace(str[i][0])
str[i][0] = strings.Replace(str[i][0], " ", "", -1)
str[i][0] = strings.Replace(str[i][0], "\\/", "/", -1)
str[i][0] = strings.Replace(str[i][0], "%3A", ":", -1)
str[i][0] = strings.Replace(str[i][0], "%2F", "/", -1)
+
//去除不存在字符串和数字的url,判断为错误数据
match, _ := regexp.MatchString("[a-zA-Z]+|[0-9]+", str[i][0])
if !match {
From 1b3b6c9df16956699db3a06707fe6576373df2b8 Mon Sep 17 00:00:00 2001
From: LZH <128961083+Liiu04@users.noreply.github.com>
Date: Sat, 21 Oct 2023 18:13:28 +0800
Subject: [PATCH 12/14] Update filter.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Calling url.QueryUnescape() can fail in some cases;
for example, str, _ = url.QueryUnescape("%s%s:%s/ABC/") produces an error and str is an empty string, which causes many URLs to become unreachable.
I added a small check to handle this.
---
crawler/filter.go | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/crawler/filter.go b/crawler/filter.go
index e4e9f0d..e27281d 100644
--- a/crawler/filter.go
+++ b/crawler/filter.go
@@ -11,8 +11,9 @@ import (
// 过滤JS
func jsFilter(str [][]string) [][]string {
- //对不需要的数据过滤
+ // 对不需要的数据过滤
for i := range str {
+ // 针对QueryUnescape函数做出了简单的预先处理
if strings.Contains(str[i][1], "%s%s:%s") {
str[i][1] = strings.Replace(str[i][1], "%s%s:%s", "", -1)
str[i][0] = strings.Replace(str[i][0], "%s%s:%s", "", -1)
@@ -50,7 +51,7 @@ func urlFilter(str [][]string) [][]string {
//对不需要的数据过滤
for i := range str {
-
+ // 针对QueryUnescape函数做出了简单的预先处理
if strings.Contains(str[i][1], "%s%s:%s") {
str[i][1] = strings.Replace(str[i][1], "%s%s:%s", "", -1)
str[i][0] = strings.Replace(str[i][0], "%s%s:%s", "", -1)
From f8ee1af95799bd8001e4d29d21500a821e3e9677 Mon Sep 17 00:00:00 2001
From: LZH <128961083+Liiu04@users.noreply.github.com>
Date: Sat, 21 Oct 2023 18:23:08 +0800
Subject: [PATCH 13/14] Update run.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The original code only fetches the page source with a plain HTTP request and then probes deeper, which limits the data it can collect.
On top of the original code I made the following changes:
(1) Use the go_rod library for headless rendering to capture the response body of every event fired while the page loads (including the page source).
(2) Store the response body and response headers of every response event (except files such as .jpg and .svg).
---
crawler/run.go | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/crawler/run.go b/crawler/run.go
index 48c0aaa..8e3f1ad 100644
--- a/crawler/run.go
+++ b/crawler/run.go
@@ -314,7 +314,7 @@ func extractBase(host, scheme, path, result string) (string, string, string, boo
// 获取网页加载的事件的响应体
func rod_spider(u string, num int) {
// 初始化浏览器
- launch := launcher.New().Headless(true).Set("test-type").Set("ignore-certificate-errors").
+ launch := launcher.New().Headless(false).Set("test-type").Set("ignore-certificate-errors").
NoSandbox(true).Set("disable-gpu").Set("disable-plugins").Set("incognito").
Set("no-default-browser-check").Set("disable-dev-shm-usage").
Set("disable-plugins").MustLaunch()
From e9faebe3c0f6f1ec3c40fcda1b040ceb55ac2e92 Mon Sep 17 00:00:00 2001
From: LZH <128961083+Liiu04@users.noreply.github.com>
Date: Sat, 21 Oct 2023 18:46:38 +0800
Subject: [PATCH 14/14] Update run.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The original code only fetches the page source with a plain HTTP request and then probes deeper, which limits the data it can collect.
On top of the original code I made the following changes:
(1) Use the go_rod library to render the page in a browser and capture the response body of every event fired while the page loads (including the page source).
(2) Store the response body and response headers of every response event (except files such as .jpg and .svg).
---
crawler/run.go | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/crawler/run.go b/crawler/run.go
index 8e3f1ad..d21d996 100644
--- a/crawler/run.go
+++ b/crawler/run.go
@@ -313,7 +313,7 @@ func extractBase(host, scheme, path, result string) (string, string, string, boo
// 获取网页加载的事件的响应体
func rod_spider(u string, num int) {
- // 初始化浏览器
+ // 初始化浏览器,无头浏览器:Headless(true)
launch := launcher.New().Headless(false).Set("test-type").Set("ignore-certificate-errors").
NoSandbox(true).Set("disable-gpu").Set("disable-plugins").Set("incognito").
Set("no-default-browser-check").Set("disable-dev-shm-usage").