From 368dc33d90fb9a3ee05ee79469c90e8edc4ec4b0 Mon Sep 17 00:00:00 2001
From: LZH <128961083+DongfengMissile@users.noreply.github.com>
Date: Sat, 21 Oct 2023 14:19:45 +0800
Subject: [PATCH 01/14] Update crawler.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The "base tag" does not always appear as an actual <base> tag in the HTML; sometimes it shows up as a variable assignment instead, for example: base: "../script/"; baseUrl: './'; BASEURL="/baseProj/"; basePath = "../../";.
I added a block of code that detects a "base tag" written as a variable assignment.
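
For reference, a minimal standalone sketch of this detection, using the same regex this patch adds below (the sample HTML fragment is made up):

    package main

    import (
        "fmt"
        "regexp"
    )

    func main() {
        // Hypothetical page fragment where the base path is set through a
        // variable assignment instead of a <base href="..."> tag.
        body := `<script>var baseUrl = "../script/";</script>`

        // Same pattern the patch adds: "base", up to five extra characters,
        // then ":" or "=" and a double-quoted value.
        re := regexp.MustCompile(`(?i)base.{0,5}[:=]\s*"(.*?)"`)
        if m := re.FindStringSubmatch(body); m != nil {
            fmt.Println("variable-assignment base found:", m[1]) // ../script/
        }
    }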
---
crawler/crawler.go | 110 ++++++++++++++++++++++++++++++++-------------
1 file changed, 79 insertions(+), 31 deletions(-)
diff --git a/crawler/crawler.go b/crawler/crawler.go
index 0a50caf..16f1d56 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -2,16 +2,15 @@ package crawler
import (
"compress/gzip"
- "fmt"
- "github.com/pingc0y/URLFinder/cmd"
- "github.com/pingc0y/URLFinder/config"
- "github.com/pingc0y/URLFinder/result"
- "github.com/pingc0y/URLFinder/util"
"io"
"net/http"
"net/url"
"regexp"
"strings"
+
+ "github.com/pingc0y/URLFinder/cmd"
+ "github.com/pingc0y/URLFinder/config"
+ "github.com/pingc0y/URLFinder/util"
)
// 蜘蛛抓取页面内容
@@ -25,7 +24,7 @@ func Spider(u string, num int) {
}()
config.Mux.Lock()
- fmt.Printf("\rStart %d Spider...", config.Progress)
+ // fmt.Printf("\rStart %d Spider...", config.Progress)
config.Progress++
config.Mux.Unlock()
//标记完成
@@ -53,6 +52,12 @@ func Spider(u string, num int) {
request.Header.Set("Accept-Encoding", "gzip") //使用gzip压缩传输数据让访问更快
request.Header.Set("User-Agent", util.GetUserAgent())
request.Header.Set("Accept", "*/*")
+ u_str, err := url.Parse(u)
+ if err != nil {
+ return
+ }
+ request.Header.Set("Referer", u_str.Scheme+"://"+u_str.Host) //####
+
//增加header选项
if cmd.C != "" {
request.Header.Set("Cookie", cmd.C)
@@ -62,27 +67,6 @@ func Spider(u string, num int) {
util.SetHeadersConfig(&request.Header)
}
- //处理返回结果
- //tr := &http.Transport{
- // TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
- //}
- //client = &http.Client{Timeout: time.Duration(cmd.TI) * time.Second,
- // Transport: tr,
- // CheckRedirect: func(req *http.Request, via []*http.Request) error {
- // if len(via) >= 10 {
- // return fmt.Errorf("Too many redirects")
- // }
- // if len(via) > 0 {
- // if via[0] != nil && via[0].URL != nil {
- // result.Redirect[via[0].URL.String()] = true
- // } else {
- // result.Redirect[req.URL.String()] = true
- // }
- //
- // }
- // return nil
- // },
- //}
response, err := client.Do(request)
if err != nil {
return
@@ -115,6 +99,7 @@ func Spider(u string, num int) {
host := response.Request.URL.Host
scheme := response.Request.URL.Scheme
source := scheme + "://" + host + path
+ judge_base := false //####
//处理base标签
re := regexp.MustCompile("base.{1,5}href.{1,5}(http.+?//[^\\s]+?)[\"'‘“]")
base := re.FindAllStringSubmatch(result, -1)
@@ -127,13 +112,76 @@ func Spider(u string, num int) {
} else {
path = "/"
}
+ } else { //####
+ re := regexp.MustCompile("(?i)base.{0,5}[:=]\\s*\"(.*?)\"")
+ base := re.FindAllStringSubmatch(result, -1)
+ if len(base) > 0 {
+ pattern := "[^.\\/\\w]"
+ re, _ := regexp.Compile(pattern)
+ // 检查字符串是否包含匹配的字符
+ result := re.MatchString(base[0][1])
+ if !result { // 字符串中没有其他字符
+ if len(base[0][1]) > 1 && base[0][1][:2] == "./" { // base路径从当前目录出发
+ judge_base = true
+ path = path[:strings.LastIndex(path, "/")] + base[0][1][1:]
+ } else if len(base[0][1]) > 2 && base[0][1][:3] == "../" { // base路径从上一级目录出发
+ judge_base = true
+ pattern := "^[./]+$"
+ matched, _ := regexp.MatchString(pattern, base[0][1])
+ if matched { // 仅处理的base路径中只有 ./ 的
+ path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+ } else {
+ find_str := ""
+ if strings.Contains(strings.TrimPrefix(base[0][1], "../"), "/") {
+ find_str = base[0][1][3 : strings.Index(strings.TrimPrefix(base[0][1], "../"), "/")+3]
+ } else {
+ find_str = base[0][1][3:]
+ }
+ if strings.Contains(path, find_str) {
+ path = path[:strings.Index(path, find_str)] + base[0][1][3:]
+ } else {
+ path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+ }
+ }
+ } else if len(base[0][1]) > 4 && strings.HasPrefix(base[0][1], "http") { //目录从http
+ judge_base = true
+ path = base[0][1]
+ } else if len(base[0][1]) > 0 {
+ judge_base = true
+ if base[0][1][0] == 47 { //base路径从根目录出发
+ path = base[0][1]
+ } else { //base路径未指明从哪路出发
+ find_str := ""
+ if strings.Contains(base[0][1], "/") {
+ find_str = base[0][1][:strings.Index(base[0][1], "/")]
+ } else {
+ find_str = base[0][1]
+ }
+ if strings.Contains(path, find_str) {
+ path = path[:strings.Index(path, find_str)] + base[0][1]
+ } else {
+ path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+ }
+ }
+ }
+ if !strings.HasSuffix(path, "/") {
+ path += "/"
+ }
+ }
+ }
}
+
is = false
<-config.Ch
//提取js
- jsFind(result, host, scheme, path, u, num)
+ jsFind(result, host, scheme, path, u, num, judge_base)
//提取url
- urlFind(result, host, scheme, path, u, num)
+ urlFind(result, host, scheme, path, u, num, judge_base)
+ // 防止base判断错误
+ if judge_base {
+ jsFind(result, host, scheme, path, u, num, false)
+ urlFind(result, host, scheme, path, u, num, false)
+ }
//提取信息
infoFind(result, source)
@@ -142,8 +190,8 @@ func Spider(u string, num int) {
// 打印Validate进度
func PrintProgress() {
config.Mux.Lock()
- num := len(result.ResultJs) + len(result.ResultUrl)
- fmt.Printf("\rValidate %.0f%%", float64(config.Progress+1)/float64(num+1)*100)
+ // num := len(result.ResultJs) + len(result.ResultUrl)
+ // fmt.Printf("\rValidate %.0f%%", float64(config.Progress+1)/float64(num+1)*100)
config.Progress++
config.Mux.Unlock()
}
From 5bfd2ff117ccd59733623a4ed3e20622a948e2b1 Mon Sep 17 00:00:00 2001
From: LZH <128961083+DongfengMissile@users.noreply.github.com>
Date: Sat, 21 Oct 2023 14:56:09 +0800
Subject: [PATCH 02/14] Update crawler.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The original code only checks whether an actual base tag exists in the HTML.
However, the "base tag" does not always appear as a tag; sometimes it shows up as a variable assignment in the HTML, for example: base: "../script/"; baseUrl: './'; BASEURL="/baseProj/"; basePath = "../../";.
I added a block of code that detects a "base tag" written as a variable assignment.
---
crawler/crawler.go | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/crawler/crawler.go b/crawler/crawler.go
index 16f1d56..7d7f91f 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -2,6 +2,7 @@ package crawler
import (
"compress/gzip"
+ "fmt"
"io"
"net/http"
"net/url"
@@ -10,6 +11,7 @@ import (
"github.com/pingc0y/URLFinder/cmd"
"github.com/pingc0y/URLFinder/config"
+ "github.com/pingc0y/URLFinder/result"
"github.com/pingc0y/URLFinder/util"
)
@@ -24,7 +26,7 @@ func Spider(u string, num int) {
}()
config.Mux.Lock()
- // fmt.Printf("\rStart %d Spider...", config.Progress)
+ fmt.Printf("\rStart %d Spider...", config.Progress)
config.Progress++
config.Mux.Unlock()
//标记完成
@@ -120,15 +122,15 @@ func Spider(u string, num int) {
re, _ := regexp.Compile(pattern)
// 检查字符串是否包含匹配的字符
result := re.MatchString(base[0][1])
- if !result { // 字符串中没有其他字符
- if len(base[0][1]) > 1 && base[0][1][:2] == "./" { // base路径从当前目录出发
+ if !result { // 字符串中没有其他特殊字符
+ if len(base[0][1]) > 1 && base[0][1][:2] == "./" { //base 路径从当前目录出发
judge_base = true
path = path[:strings.LastIndex(path, "/")] + base[0][1][1:]
- } else if len(base[0][1]) > 2 && base[0][1][:3] == "../" { // base路径从上一级目录出发
+ } else if len(base[0][1]) > 2 && base[0][1][:3] == "../" { //base 路径从上一级目录出发
judge_base = true
pattern := "^[./]+$"
matched, _ := regexp.MatchString(pattern, base[0][1])
- if matched { // 仅处理的base路径中只有 ./ 的
+ if matched { // 处理的 base 路径中只有 ./的
path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
} else {
find_str := ""
@@ -143,14 +145,14 @@ func Spider(u string, num int) {
path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
}
}
- } else if len(base[0][1]) > 4 && strings.HasPrefix(base[0][1], "http") { //目录从http
+ } else if len(base[0][1]) > 4 && strings.HasPrefix(base[0][1], "http") { // base标签包含协议
judge_base = true
path = base[0][1]
} else if len(base[0][1]) > 0 {
judge_base = true
- if base[0][1][0] == 47 { //base路径从根目录出发
+ if base[0][1][0] == 47 { //base 路径从根目录出发
path = base[0][1]
- } else { //base路径未指明从哪路出发
+ } else { //base 路径未指明从哪出发
find_str := ""
if strings.Contains(base[0][1], "/") {
find_str = base[0][1][:strings.Index(base[0][1], "/")]
@@ -190,8 +192,8 @@ func Spider(u string, num int) {
// 打印Validate进度
func PrintProgress() {
config.Mux.Lock()
- // num := len(result.ResultJs) + len(result.ResultUrl)
- // fmt.Printf("\rValidate %.0f%%", float64(config.Progress+1)/float64(num+1)*100)
+ num := len(result.ResultJs) + len(result.ResultUrl)
+ fmt.Printf("\rValidate %.0f%%", float64(config.Progress+1)/float64(num+1)*100)
config.Progress++
config.Mux.Unlock()
}
From 9038dff18baff642938198705f70a6c97882407c Mon Sep 17 00:00:00 2001
From: LZH <128961083+DongfengMissile@users.noreply.github.com>
Date: Sat, 21 Oct 2023 15:02:29 +0800
Subject: [PATCH 03/14] Update find.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Added code to the jsFind and urlFind functions that joins the detected base path with each extracted URL.
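
A rough, hypothetical illustration of the effect (the real functions take more parameters; the names below are made up):

    package main

    import "fmt"

    // joinBase sketches the branch added in this patch: when a
    // variable-assignment base path was detected (judge_base == true), the
    // extracted relative link is prefixed with that base path.
    func joinBase(basePath, link string, judgeBase bool) string {
        if judgeBase {
            return basePath + link
        }
        return link
    }

    func main() {
        fmt.Println(joinBase("/baseProj/", "static/app.js", true))  // /baseProj/static/app.js
        fmt.Println(joinBase("/baseProj/", "static/app.js", false)) // static/app.js
    }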
---
crawler/find.go | 23 ++++++++++++++++++-----
1 file changed, 18 insertions(+), 5 deletions(-)
diff --git a/crawler/find.go b/crawler/find.go
index 3e06e2e..85cf6a8 100644
--- a/crawler/find.go
+++ b/crawler/find.go
@@ -1,16 +1,17 @@
package crawler
import (
+ "regexp"
+ "strings"
+
"github.com/pingc0y/URLFinder/cmd"
"github.com/pingc0y/URLFinder/config"
"github.com/pingc0y/URLFinder/mode"
"github.com/pingc0y/URLFinder/result"
- "regexp"
- "strings"
)
// 分析内容中的js
-func jsFind(cont, host, scheme, path, source string, num int) {
+func jsFind(cont, host, scheme, path, source string, num int, judge_base bool) {
var cata string
care := regexp.MustCompile("/.*/{1}|/")
catae := care.FindAllString(path, -1)
@@ -31,6 +32,12 @@ func jsFind(cont, host, scheme, path, source string, num int) {
if js[0] == "" {
continue
}
+
+ // base标签的处理 ####
+ if judge_base {
+ js[0] = path + js[0]
+ }
+
if strings.HasPrefix(js[0], "https:") || strings.HasPrefix(js[0], "http:") {
switch AppendJs(js[0], source) {
case 0:
@@ -95,7 +102,7 @@ func jsFind(cont, host, scheme, path, source string, num int) {
}
// 分析内容中的url
-func urlFind(cont, host, scheme, path, source string, num int) {
+func urlFind(cont, host, scheme, path, source string, num int, judge_base bool) {
var cata string
care := regexp.MustCompile("/.*/{1}|/")
catae := care.FindAllString(path, -1)
@@ -104,6 +111,7 @@ func urlFind(cont, host, scheme, path, source string, num int) {
} else {
cata = catae[0]
}
+
host = scheme + "://" + host
//url匹配正则
@@ -111,7 +119,6 @@ func urlFind(cont, host, scheme, path, source string, num int) {
for _, re := range config.UrlFind {
reg := regexp.MustCompile(re)
urls := reg.FindAllStringSubmatch(cont, -1)
- //fmt.Println(urls)
urls = urlFilter(urls)
//循环提取url放到结果中
@@ -119,6 +126,12 @@ func urlFind(cont, host, scheme, path, source string, num int) {
if url[0] == "" {
continue
}
+
+ // base标签的处理 ####
+ if judge_base {
+ url[0] = path + url[0]
+ }
+
if strings.HasPrefix(url[0], "https:") || strings.HasPrefix(url[0], "http:") {
switch AppendUrl(url[0], source) {
case 0:
From 12175603f5f3cb9ab6851e1e028dd1948c738e7b Mon Sep 17 00:00:00 2001
From: LZH <128961083+DongfengMissile@users.noreply.github.com>
Date: Sat, 21 Oct 2023 15:16:17 +0800
Subject: [PATCH 04/14] Update run.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The original code only fetches the page source with a plain HTTP request and then probes deeper, which limits the data it can collect.
On top of the original code I made the following changes:
(1) Use the go_rod library to dynamically capture the response body of every event fired while the page loads (including the page source).
(2) Store the response body and response headers of every response event (except files such as .jpg and .svg).
(3) Removed the probing of URLs on other hosts, because that data is redundant and mostly unrelated to the target host. If you need that feature back, delete the small block of code (marked with a comment) under AppendJs and AppendUrl.
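
Below is a condensed, hypothetical sketch of the capture loop this patch builds (error handling, filtering, and result bookkeeping omitted; the target URL is a placeholder):

    package main

    import (
        "fmt"
        "sync"
        "time"

        "github.com/go-rod/rod"
        "github.com/go-rod/rod/lib/launcher"
        "github.com/go-rod/rod/lib/proto"
    )

    func main() {
        target := "https://example.com/" // placeholder target

        // Launch a headless browser and open the target page.
        ctl := launcher.New().Headless(true).MustLaunch()
        browser := rod.New().ControlURL(ctl).MustConnect()
        defer browser.Close()

        page := browser.MustPage(target).Timeout(40 * time.Second)

        // Record the URL and headers of every network response fired while the
        // page loads; the mutex guards against the event goroutine racing the
        // read loop below.
        var mu sync.Mutex
        seen := map[string]proto.NetworkHeaders{}
        go page.EachEvent(func(e *proto.NetworkResponseReceived) {
            mu.Lock()
            seen[e.Response.URL] = e.Response.Headers
            mu.Unlock()
        })()

        page.MustWaitLoad()
        page.WaitStable(2 * time.Second)

        // Pull each response body back out of the page cache.
        mu.Lock()
        defer mu.Unlock()
        for u := range seen {
            body, err := page.GetResource(u)
            if err != nil {
                continue
            }
            fmt.Println(u, len(body))
        }
    }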
---
crawler/run.go | 311 ++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 294 insertions(+), 17 deletions(-)
diff --git a/crawler/run.go b/crawler/run.go
index baa0cc2..fafb5ef 100644
--- a/crawler/run.go
+++ b/crawler/run.go
@@ -3,25 +3,35 @@ package crawler
import (
"bufio"
"crypto/tls"
+ "encoding/json"
"flag"
"fmt"
- "github.com/pingc0y/URLFinder/cmd"
- "github.com/pingc0y/URLFinder/config"
- "github.com/pingc0y/URLFinder/mode"
- "github.com/pingc0y/URLFinder/result"
- "github.com/pingc0y/URLFinder/util"
"io"
"net"
"net/http"
"net/url"
"os"
"regexp"
+ "strconv"
"strings"
"time"
+
+ "github.com/go-rod/rod"
+ "github.com/go-rod/rod/lib/launcher"
+ "github.com/go-rod/rod/lib/proto"
+ "github.com/pingc0y/URLFinder/cmd"
+ "github.com/pingc0y/URLFinder/config"
+ "github.com/pingc0y/URLFinder/mode"
+ "github.com/pingc0y/URLFinder/result"
+ "github.com/pingc0y/URLFinder/util"
)
var client *http.Client
+// 全局变量 存储body
+var ResBodyMap = make(map[string]string, 0)
+var ResHeaderMap = make(map[string]proto.NetworkHeaders, 0)
+
func load() {
if cmd.I {
@@ -177,13 +187,23 @@ func ValidateFF() {
for i, s := range result.ResultJs {
config.Wg.Add(1)
config.Jsch <- 1
- go JsState(s.Url, i, result.ResultJs[i].Source)
+ // 判断响应数据是否已经在页面加载过程存储
+ rod_flag := false
+ if len(ResBodyMap[s.Url]) != 0 {
+ rod_flag = true
+ }
+ go JsState(s.Url, i, result.ResultJs[i].Source, rod_flag)
}
//验证URL状态
for i, s := range result.ResultUrl {
config.Wg.Add(1)
config.Urlch <- 1
- go UrlState(s.Url, i)
+ // 判断响应数据是否已经在页面加载过程存储
+ rod_flag := false
+ if len(ResBodyMap[s.Url]) != 0 {
+ rod_flag = true
+ }
+ go UrlState(s.Url, i, rod_flag)
}
config.Wg.Wait()
@@ -199,14 +219,245 @@ func ValidateFF() {
AddSource()
}
+// 定义函数 url_parse,参数是一个字符串 u,返回值是三个字符串
+func url_parse(u string) (string, string, string) {
+ // 解析 u 为一个 URL 对象
+ u_str, err := url.Parse(u)
+ // 如果解析出错,就返回空字符串
+ if err != nil {
+ return "", "", ""
+ }
+ // 获取 URL 对象的 host、scheme、path 属性
+ host := u_str.Host
+ scheme := u_str.Scheme
+ path := u_str.Path
+ // 返回这三个属性的值
+ return host, scheme, path
+}
+
+// 提取响应体中的 Base 标签信息
+func extractBase(host, scheme, path, result string) (string, string, string, bool) {
+ judge_base := false
+ //处理base标签
+ re := regexp.MustCompile("base.{1,5}href.{1,5}(http.+?//[^\\s]+?)[\"'‘“]")
+ base := re.FindAllStringSubmatch(result, -1)
+ if len(base) > 0 {
+ host = regexp.MustCompile("http.*?//([^/]+)").FindAllStringSubmatch(base[0][1], -1)[0][1]
+ scheme = regexp.MustCompile("(http.*?)://").FindAllStringSubmatch(base[0][1], -1)[0][1]
+ paths := regexp.MustCompile("http.*?//.*?(/.*)").FindAllStringSubmatch(base[0][1], -1)
+ if len(paths) > 0 {
+ path = paths[0][1]
+ } else {
+ path = "/"
+ }
+ } else { //####
+ re := regexp.MustCompile("(?i)base.{0,5}[:=]\\s*\"(.*?)\"")
+ base := re.FindAllStringSubmatch(result, -1)
+ if len(base) > 0 {
+ pattern := "[^.\\/\\w]"
+ re, _ := regexp.Compile(pattern)
+ // 检查字符串是否包含匹配的字符
+ result := re.MatchString(base[0][1])
+ if !result { // 字符串中没有其他特殊字符
+ if len(base[0][1]) > 1 && base[0][1][:2] == "./" { // base 路径从当前目录出发
+ judge_base = true
+ path = path[:strings.LastIndex(path, "/")] + base[0][1][1:]
+ } else if len(base[0][1]) > 2 && base[0][1][:3] == "../" { // base 路径从上一级目录出发
+ judge_base = true
+ pattern := "^[./]+$"
+ matched, _ := regexp.MatchString(pattern, base[0][1])
+ if matched { // 处理的 base 路径中只有 ./的
+ path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+ } else {
+ find_str := ""
+ if strings.Contains(strings.TrimPrefix(base[0][1], "../"), "/") {
+ find_str = base[0][1][3 : strings.Index(strings.TrimPrefix(base[0][1], "../"), "/")+3]
+ } else {
+ find_str = base[0][1][3:]
+ }
+ if strings.Contains(path, find_str) {
+ path = path[:strings.Index(path, find_str)] + base[0][1][3:]
+ } else {
+ path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+ }
+ }
+ } else if len(base[0][1]) > 4 && strings.HasPrefix(base[0][1], "http") { // base 标签包含协议
+ judge_base = true
+ path = base[0][1]
+ } else if len(base[0][1]) > 0 {
+ judge_base = true
+ if base[0][1][0] == 47 { //base 路径从根目录出发
+ path = base[0][1]
+ } else { //base 路径未指明从哪路出发
+ find_str := ""
+ if strings.Contains(base[0][1], "/") {
+ find_str = base[0][1][:strings.Index(base[0][1], "/")]
+ } else {
+ find_str = base[0][1]
+ }
+ if strings.Contains(path, find_str) {
+ path = path[:strings.Index(path, find_str)] + base[0][1]
+ } else {
+ path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+ }
+ }
+ }
+ if !strings.HasSuffix(path, "/") {
+ path += "/"
+ }
+ }
+ }
+ }
+ return host, scheme, path, judge_base
+}
+
+// 获取网页加载的事件的响应体
+func rod_spider(u string, num int) {
+ //初始化浏览器
+ launch := launcher.New().Headless(true).Set("test-type").Set("ignore-certificate-errors").
+ NoSandbox(true).Set("disable-gpu").Set("disable-plugins").Set("incognito").
+ Set("no-default-browser-check").Set("disable-dev-shm-usage").
+ Set("disable-plugins").MustLaunch()
+ browser := rod.New().ControlURL(launch).MustConnect()
+
+ //添加关闭
+ defer browser.Close()
+
+ // 设置浏览器的证书错误处理,忽略所有证书错误
+ browser.MustIgnoreCertErrors(true)
+
+ // 设置浏览器打开的页面
+ pageTarget := proto.TargetCreateTarget{URL: u}
+ page, err := browser.Page(pageTarget)
+ if err != nil {
+ fmt.Println(err)
+ }
+
+ // 在最后关闭页面
+ defer func() {
+ err := page.Close()
+ if err != nil {
+ // 处理错误
+ fmt.Println(err)
+ }
+ }()
+
+ // 设置页面的超时时间为 40 秒
+ page = page.Timeout(40 * time.Second)
+
+ // 创建一个空的 map,键是 proto.NetworkRequestID 类型,值是 string 类型
+ requestMap := make(map[string]string, 0)
+
+ // 使用 go 语句开启一个协程,在协程中处理页面的一些事件
+ go page.EachEvent(func(e *proto.PageJavascriptDialogOpening) {
+ // 处理 JavaScript 对话框
+ _ = proto.PageHandleJavaScriptDialog{Accept: true, PromptText: ""}.Call(page)
+ }, func(e *proto.NetworkResponseReceived) {
+ // 获取请求的 ID 和 URL
+ ResponseURL := e.Response.URL
+ // fmt.Println(e.Response.URL, e.RequestID)
+ ResHeaderMap[ResponseURL] = e.Response.Headers
+
+ // 在 requestMap 中填充数据
+ requestMap[ResponseURL] = ""
+
+ })()
+
+ // 等待页面加载完成,并处理可能出现的错误
+ pageLoadErr := page.WaitLoad()
+ if pageLoadErr != nil {
+ fmt.Println(pageLoadErr)
+ }
+
+ // 等待页面的 DOM 结构稳定
+ page.WaitStable(2 * time.Second)
+
+ // 打印页面源码
+ htmlStr, err := page.HTML()
+ if err != nil {
+ fmt.Println(err)
+ }
+
+ for url, _ := range requestMap {
+ // 调用 page.GetResource 方法来获取响应体
+ ResponseBody, _ := page.GetResource(url)
+ requestMap[url] = string(ResponseBody)
+ }
+
+ // 存储页面源码
+ requestMap[u] = string(htmlStr)
+ // fmt.Println(requestMap[u])
+
+ // 遍历响应体,提取 Base 标签、提取 js 、提取 url 、
+ for url, body := range requestMap {
+ // 判断响应体是否为空
+ if len(body) == 0 {
+ continue
+ }
+
+ // 遍历 BodyFiler 切片中的每个元素
+ re := regexp.MustCompile("\\.jpeg\\?|\\.jpg\\?|\\.png\\?|.gif\\?|www\\.w3\\.org|example\\.com|.*,$|.*\\.jpeg$|.*\\.jpg$|.*\\.png$|.*\\.gif$|.*\\.ico$|.*\\.svg$|.*\\.vue$|.*\\.ts$")
+ if re.MatchString(url) {
+ continue
+ }
+
+ // fmt.Println("目标url及响应体信息: ", url, len(body))
+
+ // 添加body数据
+ ResBodyMap[url] = body
+
+ // 将响应头数据转换成map存储
+ Res_header := make(map[string]string, 0)
+ if len(ResHeaderMap[url]) != 0 {
+ data, err := json.Marshal(ResHeaderMap[url])
+ if err != nil {
+ fmt.Println(err)
+ }
+ err = json.Unmarshal(data, &Res_header)
+ if err != nil {
+ fmt.Println(err)
+ }
+ }
+
+ // 添加首页动态加载的数据
+ if strings.HasSuffix(url, ".js") || strings.Contains(url, ".js?") {
+ result.ResultJs = append(result.ResultJs, mode.Link{Url: url, Status: strconv.Itoa(200), Size: strconv.Itoa(len(body)), ResponseHeaders: Res_header, ResponseBody: body})
+ // AppendJs(url, u)
+ } else {
+ result.ResultUrl = append(result.ResultUrl, mode.Link{Url: url, Status: strconv.Itoa(200), Size: strconv.Itoa(len(body)), ResponseHeaders: Res_header, ResponseBody: body})
+ // AppendUrl(url, u)
+ }
+
+ host, scheme, path := url_parse(url)
+
+ judge_base := false
+ host, scheme, path, judge_base = extractBase(host, scheme, path, body)
+
+ //提取js
+ jsFind(body, host, scheme, path, u, num, judge_base)
+ //提取url
+ urlFind(body, host, scheme, path, u, num, judge_base)
+ // 防止base判断错误
+ if judge_base {
+ jsFind(body, host, scheme, path, u, num, false)
+ urlFind(body, host, scheme, path, u, num, false)
+ }
+
+ }
+
+}
+
func start(u string) {
fmt.Println("Target URL: " + u)
- config.Wg.Add(1)
- config.Ch <- 1
- go Spider(u, 1)
- config.Wg.Wait()
- config.Progress = 1
- fmt.Printf("\r\nSpider OK \n")
+
+ // config.Wg.Add(1)
+ // config.Ch <- 1
+ // go Spider(u, 1) // ###
+ rod_spider(u, 1)
+ // config.Wg.Wait()
+ // config.Progress = 1
+
+ fmt.Printf("\r\nRod_Spider OK \n")
result.ResultUrl = util.RemoveRepeatElement(result.ResultUrl)
result.ResultJs = util.RemoveRepeatElement(result.ResultJs)
if cmd.S != "" {
@@ -217,13 +468,23 @@ func start(u string) {
for i, s := range result.ResultJs {
config.Wg.Add(1)
config.Jsch <- 1
- go JsState(s.Url, i, result.ResultJs[i].Source)
+ // 判断响应数据是否已经在页面加载过程存储
+ rod_flag := false
+ if len(ResBodyMap[s.Url]) != 0 {
+ rod_flag = true
+ }
+ go JsState(s.Url, i, result.ResultJs[i].Source, rod_flag)
}
//验证URL状态
for i, s := range result.ResultUrl {
config.Wg.Add(1)
config.Urlch <- 1
- go UrlState(s.Url, i)
+ // 判断响应数据是否已经在页面加载过程存储
+ rod_flag := false
+ if len(ResBodyMap[s.Url]) != 0 {
+ rod_flag = true
+ }
+ go UrlState(s.Url, i, rod_flag)
}
config.Wg.Wait()
@@ -241,7 +502,7 @@ func start(u string) {
func Res() {
if len(result.ResultJs) == 0 && len(result.ResultUrl) == 0 {
- fmt.Println("未获取到数据")
+ fmt.Fprintln(os.Stdout, cmd.U, "Data not captured")
return
}
//打印还是输出
@@ -273,11 +534,20 @@ func AppendJs(ur string, urltjs string) int {
if err != nil {
return 2
}
+
+ // 过滤其他ip ####
+ host1, _, _ := url_parse(ur)
+ host2, _, _ := url_parse(urltjs)
+ if host1 != host2 {
+ return 2
+ }
+
for _, eachItem := range result.ResultJs {
if eachItem.Url == ur {
return 0
}
}
+
result.ResultJs = append(result.ResultJs, mode.Link{Url: ur})
if strings.HasSuffix(urltjs, ".js") {
result.Jsinurl[ur] = result.Jsinurl[urltjs]
@@ -301,6 +571,14 @@ func AppendUrl(ur string, urlturl string) int {
if err != nil {
return 2
}
+
+ // 过滤其他ip ####
+ host1, _, _ := url_parse(ur)
+ host2, _, _ := url_parse(urlturl)
+ if host1 != host2 {
+ return 2
+ }
+
for _, eachItem := range result.ResultUrl {
if eachItem.Url == ur {
return 0
@@ -383,5 +661,4 @@ func Initialization() {
result.Jstourl = make(map[string]string)
result.Urltourl = make(map[string]string)
result.Redirect = make(map[string]bool)
-
}
From bdb85cd457d95b77b648e0666f10550e67d0722c Mon Sep 17 00:00:00 2001
From: LZH <128961083+DongfengMissile@users.noreply.github.com>
Date: Sat, 21 Oct 2023 15:19:50 +0800
Subject: [PATCH 05/14] Update run.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The original code only fetches the page source with a plain HTTP request and then probes deeper, which limits the data it can collect.
On top of the original code I made the following changes: (1) use the go_rod library to dynamically capture the response body of every event fired while the page loads (including the page source); (2) store the response body and response headers of every response event (except files such as .jpg and .svg); (3) removed the probing of URLs on other hosts, because that data is redundant and mostly unrelated to the target host. If you need that feature back, delete the small block of code (marked with a comment) under AppendJs and AppendUrl.
---
crawler/run.go | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/crawler/run.go b/crawler/run.go
index fafb5ef..49edf4e 100644
--- a/crawler/run.go
+++ b/crawler/run.go
@@ -238,7 +238,7 @@ func url_parse(u string) (string, string, string) {
// 提取响应体中的 Base 标签信息
func extractBase(host, scheme, path, result string) (string, string, string, bool) {
judge_base := false
- //处理base标签
+ // 处理base标签
re := regexp.MustCompile("base.{1,5}href.{1,5}(http.+?//[^\\s]+?)[\"'‘“]")
base := re.FindAllStringSubmatch(result, -1)
if len(base) > 0 {
@@ -250,7 +250,7 @@ func extractBase(host, scheme, path, result string) (string, string, string, boo
} else {
path = "/"
}
- } else { //####
+ } else { // 处理 "base 标签"
re := regexp.MustCompile("(?i)base.{0,5}[:=]\\s*\"(.*?)\"")
base := re.FindAllStringSubmatch(result, -1)
if len(base) > 0 {
From b02ec1cdec92c1041f87f2e11250e0b2b89b3463 Mon Sep 17 00:00:00 2001
From: LZH <128961083+DongfengMissile@users.noreply.github.com>
Date: Sat, 21 Oct 2023 15:23:02 +0800
Subject: [PATCH 06/14] Update run.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The original code only fetches the page source with a plain HTTP request and then probes deeper, which limits the data it can collect.
On top of the original code I made the following changes: (1) use the go_rod library to dynamically capture the response body of every event fired while the page loads (including the page source); (2) store the response body and response headers of every response event (except files such as .jpg and .svg); (3) removed the probing of URLs on other hosts, since most of that data is unrelated to the target host. If you need that feature back, delete the small block of code (marked with a comment) under AppendJs and AppendUrl.
---
crawler/run.go | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/crawler/run.go b/crawler/run.go
index 49edf4e..5340c0f 100644
--- a/crawler/run.go
+++ b/crawler/run.go
@@ -313,14 +313,14 @@ func extractBase(host, scheme, path, result string) (string, string, string, boo
// 获取网页加载的事件的响应体
func rod_spider(u string, num int) {
- //初始化浏览器
+ // 初始化浏览器
launch := launcher.New().Headless(true).Set("test-type").Set("ignore-certificate-errors").
NoSandbox(true).Set("disable-gpu").Set("disable-plugins").Set("incognito").
Set("no-default-browser-check").Set("disable-dev-shm-usage").
Set("disable-plugins").MustLaunch()
browser := rod.New().ControlURL(launch).MustConnect()
- //添加关闭
+ // 添加关闭
defer browser.Close()
// 设置浏览器的证书错误处理,忽略所有证书错误
From 90d26fef8816b7b83c920d0ad1e54866ce6c9e48 Mon Sep 17 00:00:00 2001
From: LZH <128961083+DongfengMissile@users.noreply.github.com>
Date: Sat, 21 Oct 2023 15:32:25 +0800
Subject: [PATCH 07/14] Update state.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
On top of the original code: (1) added a Referer option to the request headers; (2) added storage of the response headers and response body.
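
For instance, a small sketch of how the response headers get flattened into the map[string]string that is stored on the result (the helper name here is made up):

    package main

    import (
        "fmt"
        "net/http"
    )

    // flattenHeader mirrors what this patch does with response.Header: keep
    // only the first value of each header so it fits a map[string]string field.
    func flattenHeader(h http.Header) map[string]string {
        out := make(map[string]string, len(h))
        for k, v := range h {
            if len(v) > 0 {
                out[k] = v[0]
            }
        }
        return out
    }

    func main() {
        h := http.Header{}
        h.Add("Content-Type", "application/javascript")
        h.Add("Set-Cookie", "a=1")
        h.Add("Set-Cookie", "b=2") // only "a=1" survives the flattening
        fmt.Println(flattenHeader(h))
    }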
---
crawler/state.go | 68 +++++++++++++++++++++++++++++++++++++++---------
1 file changed, 56 insertions(+), 12 deletions(-)
diff --git a/crawler/state.go b/crawler/state.go
index 773946f..6034591 100644
--- a/crawler/state.go
+++ b/crawler/state.go
@@ -1,27 +1,34 @@
package crawler
import (
- "github.com/pingc0y/URLFinder/cmd"
- "github.com/pingc0y/URLFinder/config"
- "github.com/pingc0y/URLFinder/mode"
- "github.com/pingc0y/URLFinder/result"
- "github.com/pingc0y/URLFinder/util"
"io"
"net/http"
"net/url"
"regexp"
"strconv"
"strings"
+
+ "github.com/pingc0y/URLFinder/cmd"
+ "github.com/pingc0y/URLFinder/config"
+ "github.com/pingc0y/URLFinder/mode"
+ "github.com/pingc0y/URLFinder/result"
+ "github.com/pingc0y/URLFinder/util"
)
// 检测js访问状态码
-func JsState(u string, i int, sou string) {
+func JsState(u string, i int, sou string, rod_flag bool) {
defer func() {
config.Wg.Done()
<-config.Jsch
PrintProgress()
}()
+
+ // 首页动态加载的数据 已经存储
+ if rod_flag {
+ return
+ }
+
if cmd.S == "" {
result.ResultJs[i].Url = u
return
@@ -53,6 +60,12 @@ func JsState(u string, i int, sou string) {
//增加header选项
request.Header.Set("User-Agent", util.GetUserAgent())
request.Header.Set("Accept", "*/*")
+ u_str, err := url.Parse(u)
+ if err != nil {
+ return
+ }
+ request.Header.Set("Referer", u_str.Scheme+"://"+u_str.Host) //####
+
//加载yaml配置
if cmd.I {
util.SetHeadersConfig(&request.Header)
@@ -99,25 +112,41 @@ func JsState(u string, i int, sou string) {
} else {
length = len(dataBytes)
}
+
+ res_body := string(dataBytes)
+ res_headers := make(map[string]string, 0)
+ // 遍历响应头中的所有键值对
+ for k, v := range response.Header {
+ // 如果值是一个切片,取第一个元素作为值,否则忽略该键值对
+ if len(v) > 0 {
+ res_headers[k] = v[0]
+ }
+ }
+
config.Lock.Lock()
if result.Redirect[ur.String()] {
code = 302
redirect = response.Request.URL.String()
}
config.Lock.Unlock()
- result.ResultJs[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Redirect: redirect}
+ result.ResultJs[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Redirect: redirect, ResponseHeaders: res_headers, ResponseBody: res_body}
} else {
result.ResultJs[i].Url = ""
}
}
// 检测url访问状态码
-func UrlState(u string, i int) {
+func UrlState(u string, i int, rod_flag bool) {
defer func() {
config.Wg.Done()
<-config.Urlch
PrintProgress()
}()
+
+ // 首页动态加载的数据 已经存储
+ if rod_flag {
+ return
+ }
if cmd.S == "" {
result.ResultUrl[i].Url = u
return
@@ -148,6 +177,11 @@ func UrlState(u string, i int) {
//增加header选项
request.Header.Set("User-Agent", util.GetUserAgent())
request.Header.Set("Accept", "*/*")
+ u_str, err := url.Parse(u)
+ if err != nil {
+ return
+ }
+ request.Header.Set("Referer", u_str.Scheme+"://"+u_str.Host) //####
//加载yaml配置
if cmd.I {
@@ -194,9 +228,19 @@ func UrlState(u string, i int) {
} else {
length = len(dataBytes)
}
- body := string(dataBytes)
+
+ res_body := string(dataBytes)
+ res_headers := make(map[string]string, 0)
+ // 遍历响应头中的所有键值对
+ for k, v := range response.Header {
+ // 如果值是一个切片,取第一个元素作为值,否则忽略该键值对
+ if len(v) > 0 {
+ res_headers[k] = v[0]
+ }
+ }
+
re := regexp.MustCompile("<[tT]itle>(.*?)[tT]itle>")
- title := re.FindAllStringSubmatch(body, -1)
+ title := re.FindAllStringSubmatch(res_body, -1)
config.Lock.Lock()
if result.Redirect[ur.String()] {
code = 302
@@ -205,9 +249,9 @@ func UrlState(u string, i int) {
config.Lock.Unlock()
if len(title) != 0 {
- result.ResultUrl[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Title: title[0][1], Redirect: redirect}
+ result.ResultUrl[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Title: title[0][1], Redirect: redirect, ResponseHeaders: res_headers, ResponseBody: res_body}
} else {
- result.ResultUrl[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Redirect: redirect}
+ result.ResultUrl[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Redirect: redirect, ResponseHeaders: res_headers, ResponseBody: res_body}
}
} else {
result.ResultUrl[i].Url = ""
From 8cbaae1cc9d671bc58cdb2927f7d4eb987286830 Mon Sep 17 00:00:00 2001
From: LZH <128961083+DongfengMissile@users.noreply.github.com>
Date: Sat, 21 Oct 2023 15:34:14 +0800
Subject: [PATCH 08/14] Update mode.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Added two fields, ResponseHeaders and ResponseBody, to the Link struct.
---
mode/mode.go | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/mode/mode.go b/mode/mode.go
index 300e702..454c7b2 100644
--- a/mode/mode.go
+++ b/mode/mode.go
@@ -18,12 +18,14 @@ type Config struct {
}
type Link struct {
- Url string
- Status string
- Size string
- Title string
- Redirect string
- Source string
+ Url string
+ Status string
+ Size string
+ Title string
+ Redirect string
+ Source string
+ ResponseHeaders map[string]string
+ ResponseBody string
}
type Info struct {
From 4e775e045bce63c693e3840b2e0e4d9c33dae4da Mon Sep 17 00:00:00 2001
From: LZH <128961083+Liiu04@users.noreply.github.com>
Date: Sat, 21 Oct 2023 17:28:25 +0800
Subject: [PATCH 09/14] Update crawler.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The original code only checks whether an actual base tag exists in the HTML.
However, the "base tag" does not always appear as a tag; sometimes it shows up as a variable assignment in the HTML, for example: base: "../script/"; baseUrl: './'; BASEURL="/baseProj/"; basePath = "../../";.
I added a block of code that detects a "base tag" written as a variable assignment.
In addition, many pages return an incorrect response, or cannot be reached at all, when the request lacks a Referer header, so I added a Referer option to the request headers.
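
A minimal sketch of the Referer handling this patch introduces, assuming a placeholder target URL:

    package main

    import (
        "fmt"
        "net/http"
        "net/url"
    )

    func main() {
        target := "https://example.com/app/index.html" // placeholder URL

        req, err := http.NewRequest("GET", target, nil)
        if err != nil {
            return
        }

        // Derive the Referer from the target itself, as the patch does:
        // scheme://host of the URL being requested.
        u, err := url.Parse(target)
        if err != nil {
            return
        }
        req.Header.Set("Referer", u.Scheme+"://"+u.Host)

        fmt.Println(req.Header.Get("Referer")) // https://example.com
    }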
---
crawler/crawler.go | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/crawler/crawler.go b/crawler/crawler.go
index 7d7f91f..a0113c2 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -114,7 +114,7 @@ func Spider(u string, num int) {
} else {
path = "/"
}
- } else { //####
+ } else { // 处理 "base 标签"
re := regexp.MustCompile("(?i)base.{0,5}[:=]\\s*\"(.*?)\"")
base := re.FindAllStringSubmatch(result, -1)
if len(base) > 0 {
From 98d1e9cdf5f60f959b98a909b0583df36e61ce8a Mon Sep 17 00:00:00 2001
From: LZH <128961083+Liiu04@users.noreply.github.com>
Date: Sat, 21 Oct 2023 17:40:24 +0800
Subject: [PATCH 10/14] Update run.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The original code only fetches the page source with a plain HTTP request and then probes deeper, which limits the data it can collect.
On top of the original code I made the following changes:
(1) Use the go_rod library for headless rendering to capture the response body of every event fired while the page loads (including the page source).
(2) Store the response body and response headers of every response event (except files such as .jpg and .svg).
---
crawler/run.go | 21 +--------------------
1 file changed, 1 insertion(+), 20 deletions(-)
diff --git a/crawler/run.go b/crawler/run.go
index 5340c0f..48c0aaa 100644
--- a/crawler/run.go
+++ b/crawler/run.go
@@ -28,7 +28,7 @@ import (
var client *http.Client
-// 全局变量 存储body
+// 用来存储响应体和响应头数据
var ResBodyMap = make(map[string]string, 0)
var ResHeaderMap = make(map[string]proto.NetworkHeaders, 0)
@@ -337,7 +337,6 @@ func rod_spider(u string, num int) {
defer func() {
err := page.Close()
if err != nil {
- // 处理错误
fmt.Println(err)
}
}()
@@ -401,8 +400,6 @@ func rod_spider(u string, num int) {
continue
}
- // fmt.Println("目标url及响应体信息: ", url, len(body))
-
// 添加body数据
ResBodyMap[url] = body
@@ -422,10 +419,8 @@ func rod_spider(u string, num int) {
// 添加首页动态加载的数据
if strings.HasSuffix(url, ".js") || strings.Contains(url, ".js?") {
result.ResultJs = append(result.ResultJs, mode.Link{Url: url, Status: strconv.Itoa(200), Size: strconv.Itoa(len(body)), ResponseHeaders: Res_header, ResponseBody: body})
- // AppendJs(url, u)
} else {
result.ResultUrl = append(result.ResultUrl, mode.Link{Url: url, Status: strconv.Itoa(200), Size: strconv.Itoa(len(body)), ResponseHeaders: Res_header, ResponseBody: body})
- // AppendUrl(url, u)
}
host, scheme, path := url_parse(url)
@@ -535,13 +530,6 @@ func AppendJs(ur string, urltjs string) int {
return 2
}
- // 过滤其他ip ####
- host1, _, _ := url_parse(ur)
- host2, _, _ := url_parse(urltjs)
- if host1 != host2 {
- return 2
- }
-
for _, eachItem := range result.ResultJs {
if eachItem.Url == ur {
return 0
@@ -572,13 +560,6 @@ func AppendUrl(ur string, urlturl string) int {
return 2
}
- // 过滤其他ip ####
- host1, _, _ := url_parse(ur)
- host2, _, _ := url_parse(urlturl)
- if host1 != host2 {
- return 2
- }
-
for _, eachItem := range result.ResultUrl {
if eachItem.Url == ur {
return 0
From 1646bb9b474f0f1c8a5a1bdc48c9c38814a526cc Mon Sep 17 00:00:00 2001
From: LZH <128961083+Liiu04@users.noreply.github.com>
Date: Sat, 21 Oct 2023 18:08:31 +0800
Subject: [PATCH 11/14] Update filter.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Calling url.QueryUnescape() can fail in some cases;
for example, str, _ = url.QueryUnescape("%s%s:%s/ABC/") yields an empty string, which causes many URLs to become unreachable.
I added a small check to handle this.
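
A small demonstration of the failure and of the pre-processing this patch adds:

    package main

    import (
        "fmt"
        "net/url"
        "strings"
    )

    func main() {
        raw := `%s%s:%s/ABC/` // format placeholders left over in scraped JS

        // QueryUnescape fails here: "%s" is not a valid percent-escape, so the
        // function returns an error and an empty string.
        s, err := url.QueryUnescape(raw)
        fmt.Printf("%q %v\n", s, err) // "" invalid URL escape "%s%"

        // Stripping the placeholder sequence first, as the patch does, lets
        // the unescape succeed.
        cleaned := strings.Replace(raw, "%s%s:%s", "", -1)
        s, err = url.QueryUnescape(cleaned)
        fmt.Printf("%q %v\n", s, err) // "/ABC/" <nil>
    }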
---
crawler/filter.go | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/crawler/filter.go b/crawler/filter.go
index e0b5ebe..e4e9f0d 100644
--- a/crawler/filter.go
+++ b/crawler/filter.go
@@ -1,10 +1,11 @@
package crawler
import (
- "github.com/pingc0y/URLFinder/config"
"net/url"
"regexp"
"strings"
+
+ "github.com/pingc0y/URLFinder/config"
)
// 过滤JS
@@ -12,6 +13,11 @@ func jsFilter(str [][]string) [][]string {
//对不需要的数据过滤
for i := range str {
+ if strings.Contains(str[i][1], "%s%s:%s") {
+ str[i][1] = strings.Replace(str[i][1], "%s%s:%s", "", -1)
+ str[i][0] = strings.Replace(str[i][0], "%s%s:%s", "", -1)
+ }
+
str[i][0], _ = url.QueryUnescape(str[i][1])
str[i][0] = strings.TrimSpace(str[i][0])
str[i][0] = strings.Replace(str[i][0], " ", "", -1)
@@ -44,12 +50,18 @@ func urlFilter(str [][]string) [][]string {
//对不需要的数据过滤
for i := range str {
+
+ if strings.Contains(str[i][1], "%s%s:%s") {
+ str[i][1] = strings.Replace(str[i][1], "%s%s:%s", "", -1)
+ str[i][0] = strings.Replace(str[i][0], "%s%s:%s", "", -1)
+ }
str[i][0], _ = url.QueryUnescape(str[i][1])
str[i][0] = strings.TrimSpace(str[i][0])
str[i][0] = strings.Replace(str[i][0], " ", "", -1)
str[i][0] = strings.Replace(str[i][0], "\\/", "/", -1)
str[i][0] = strings.Replace(str[i][0], "%3A", ":", -1)
str[i][0] = strings.Replace(str[i][0], "%2F", "/", -1)
+
//去除不存在字符串和数字的url,判断为错误数据
match, _ := regexp.MatchString("[a-zA-Z]+|[0-9]+", str[i][0])
if !match {
From 1b3b6c9df16956699db3a06707fe6576373df2b8 Mon Sep 17 00:00:00 2001
From: LZH <128961083+Liiu04@users.noreply.github.com>
Date: Sat, 21 Oct 2023 18:13:28 +0800
Subject: [PATCH 12/14] Update filter.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Calling url.QueryUnescape() can fail in some cases;
for example, str, _ = url.QueryUnescape("%s%s:%s/ABC/") produces an error and str is an empty string, which causes many URLs to become unreachable.
I added a small check to handle this.
---
crawler/filter.go | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/crawler/filter.go b/crawler/filter.go
index e4e9f0d..e27281d 100644
--- a/crawler/filter.go
+++ b/crawler/filter.go
@@ -11,8 +11,9 @@ import (
// 过滤JS
func jsFilter(str [][]string) [][]string {
- //对不需要的数据过滤
+ // 对不需要的数据过滤
for i := range str {
+ // 针对QueryUnescape函数做出了简单的预先处理
if strings.Contains(str[i][1], "%s%s:%s") {
str[i][1] = strings.Replace(str[i][1], "%s%s:%s", "", -1)
str[i][0] = strings.Replace(str[i][0], "%s%s:%s", "", -1)
@@ -50,7 +51,7 @@ func urlFilter(str [][]string) [][]string {
//对不需要的数据过滤
for i := range str {
-
+ // 针对QueryUnescape函数做出了简单的预先处理
if strings.Contains(str[i][1], "%s%s:%s") {
str[i][1] = strings.Replace(str[i][1], "%s%s:%s", "", -1)
str[i][0] = strings.Replace(str[i][0], "%s%s:%s", "", -1)
From f8ee1af95799bd8001e4d29d21500a821e3e9677 Mon Sep 17 00:00:00 2001
From: LZH <128961083+Liiu04@users.noreply.github.com>
Date: Sat, 21 Oct 2023 18:23:08 +0800
Subject: [PATCH 13/14] Update run.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The original code only fetches the page source with a plain HTTP request and then probes deeper, which limits the data it can collect.
On top of the original code I made the following changes:
(1) Use the go_rod library for headless rendering to capture the response body of every event fired while the page loads (including the page source).
(2) Store the response body and response headers of every response event (except files such as .jpg and .svg).
---
crawler/run.go | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/crawler/run.go b/crawler/run.go
index 48c0aaa..8e3f1ad 100644
--- a/crawler/run.go
+++ b/crawler/run.go
@@ -314,7 +314,7 @@ func extractBase(host, scheme, path, result string) (string, string, string, boo
// 获取网页加载的事件的响应体
func rod_spider(u string, num int) {
// 初始化浏览器
- launch := launcher.New().Headless(true).Set("test-type").Set("ignore-certificate-errors").
+ launch := launcher.New().Headless(false).Set("test-type").Set("ignore-certificate-errors").
NoSandbox(true).Set("disable-gpu").Set("disable-plugins").Set("incognito").
Set("no-default-browser-check").Set("disable-dev-shm-usage").
Set("disable-plugins").MustLaunch()
From e9faebe3c0f6f1ec3c40fcda1b040ceb55ac2e92 Mon Sep 17 00:00:00 2001
From: LZH <128961083+Liiu04@users.noreply.github.com>
Date: Sat, 21 Oct 2023 18:46:38 +0800
Subject: [PATCH 14/14] Update run.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The original code only fetches the page source with a plain HTTP request and then probes deeper, which limits the data it can collect.
On top of the original code I made the following changes:
(1) Use the go_rod library to render the page in a browser and capture the response body of every event fired while the page loads (including the page source).
(2) Store the response body and response headers of every response event (except files such as .jpg and .svg).
---
crawler/run.go | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/crawler/run.go b/crawler/run.go
index 8e3f1ad..d21d996 100644
--- a/crawler/run.go
+++ b/crawler/run.go
@@ -313,7 +313,7 @@ func extractBase(host, scheme, path, result string) (string, string, string, boo
// 获取网页加载的事件的响应体
func rod_spider(u string, num int) {
- // 初始化浏览器
+ // 初始化浏览器,无头浏览器:Headless(true)
launch := launcher.New().Headless(false).Set("test-type").Set("ignore-certificate-errors").
NoSandbox(true).Set("disable-gpu").Set("disable-plugins").Set("incognito").
Set("no-default-browser-check").Set("disable-dev-shm-usage").