pingc0y · Liiu04 · Oct 21, 2023 · Oct 21, 2023 · Oct 21, 2023 · Oct 21, 2023
diff --git a/crawler/crawler.go b/crawler/crawler.go
@@ -3,15 +3,16 @@ package crawler
 import (
 	"compress/gzip"
 	"fmt"
-	"github.com/pingc0y/URLFinder/cmd"
-	"github.com/pingc0y/URLFinder/config"
-	"github.com/pingc0y/URLFinder/result"
-	"github.com/pingc0y/URLFinder/util"
 	"io"
 	"net/http"
 	"net/url"
 	"regexp"
 	"strings"
+
+	"github.com/pingc0y/URLFinder/cmd"
+	"github.com/pingc0y/URLFinder/config"
+	"github.com/pingc0y/URLFinder/result"
+	"github.com/pingc0y/URLFinder/util"
 )
 
 // 蜘蛛抓取页面内容
@@ -53,6 +54,12 @@ func Spider(u string, num int) {
 	request.Header.Set("Accept-Encoding", "gzip") //使用gzip压缩传输数据让访问更快
 	request.Header.Set("User-Agent", util.GetUserAgent())
 	request.Header.Set("Accept", "*/*")
+	u_str, err := url.Parse(u)
+	if err != nil {
+		return
+	}
+	request.Header.Set("Referer", u_str.Scheme+"://"+u_str.Host) //####
+
 	//增加header选项
 	if cmd.C != "" {
 		request.Header.Set("Cookie", cmd.C)
@@ -62,27 +69,6 @@ func Spider(u string, num int) {
 		util.SetHeadersConfig(&request.Header)
 	}
 
-	//处理返回结果
-	//tr := &http.Transport{
-	//	TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
-	//}
-	//client = &http.Client{Timeout: time.Duration(cmd.TI) * time.Second,
-	//	Transport: tr,
-	//	CheckRedirect: func(req *http.Request, via []*http.Request) error {
-	//		if len(via) >= 10 {
-	//			return fmt.Errorf("Too many redirects")
-	//		}
-	//		if len(via) > 0 {
-	//			if via[0] != nil && via[0].URL != nil {
-	//				result.Redirect[via[0].URL.String()] = true
-	//			} else {
-	//				result.Redirect[req.URL.String()] = true
-	//			}
-	//
-	//		}
-	//		return nil
-	//	},
-	//}
 	response, err := client.Do(request)
 	if err != nil {
 		return
@@ -115,6 +101,7 @@ func Spider(u string, num int) {
 	host := response.Request.URL.Host
 	scheme := response.Request.URL.Scheme
 	source := scheme + "://" + host + path
+	judge_base := false //####
 	//处理base标签
 	re := regexp.MustCompile("base.{1,5}href.{1,5}(http.+?//[^\\s]+?)[\"'‘“]")
 	base := re.FindAllStringSubmatch(result, -1)
@@ -127,13 +114,76 @@ func Spider(u string, num int) {
 		} else {
 			path = "/"
 		}
+	} else { // 处理 "base 标签"
+		re := regexp.MustCompile("(?i)base.{0,5}[:=]\\s*\"(.*?)\"")
+		base := re.FindAllStringSubmatch(result, -1)
+		if len(base) > 0 {
+			pattern := "[^.\\/\\w]"
+			re, _ := regexp.Compile(pattern)
+			// Check whether the base value contains any character other than '.', '/' or a word character.
+			result := re.MatchString(base[0][1])
+			if !result { // 字符串中没有其他特殊字符
+				if len(base[0][1]) > 1 && base[0][1][:2] == "./" { //base 路径从当前目录出发
+					judge_base = true
+					path = path[:strings.LastIndex(path, "/")] + base[0][1][1:]
+				} else if len(base[0][1]) > 2 && base[0][1][:3] == "../" { //base 路径从上一级目录出发
+					judge_base = true
+					pattern := "^[./]+$"
+					matched, _ := regexp.MatchString(pattern, base[0][1])
+					if matched { // 处理的 base 路径中只有 ./的
+						path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+					} else {
+						find_str := ""
+						if strings.Contains(strings.TrimPrefix(base[0][1], "../"), "/") {
+							find_str = base[0][1][3 : strings.Index(strings.TrimPrefix(base[0][1], "../"), "/")+3]
+						} else {
+							find_str = base[0][1][3:]
+						}
+						if strings.Contains(path, find_str) {
+							path = path[:strings.Index(path, find_str)] + base[0][1][3:]
+						} else {
+							path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+						}
+					}
+				} else if len(base[0][1]) > 4 && strings.HasPrefix(base[0][1], "http") { // base标签包含协议
+					judge_base = true
+					path = base[0][1]
+				} else if len(base[0][1]) > 0 {
+					judge_base = true
+					if base[0][1][0] == 47 { //base 路径从根目录出发
+						path = base[0][1]
+					} else { //base 路径未指明从哪出发
+						find_str := ""
+						if strings.Contains(base[0][1], "/") {
+							find_str = base[0][1][:strings.Index(base[0][1], "/")]
+						} else {
+							find_str = base[0][1]
+						}
+						if strings.Contains(path, find_str) {
+							path = path[:strings.Index(path, find_str)] + base[0][1]
+						} else {
+							path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+						}
+					}
+				}
+				if !strings.HasSuffix(path, "/") {
+					path += "/"
+				}
+			}
+		}
 	}
+
 	is = false
 	<-config.Ch
 	//提取js
-	jsFind(result, host, scheme, path, u, num)
+	jsFind(result, host, scheme, path, u, num, judge_base)
 	//提取url
-	urlFind(result, host, scheme, path, u, num)
+	urlFind(result, host, scheme, path, u, num, judge_base)
+	// Guard against a wrong base guess: also extract links without applying the base path.
+	if judge_base {
+		jsFind(result, host, scheme, path, u, num, false)
+		urlFind(result, host, scheme, path, u, num, false)
+	}
 	//提取信息
 	infoFind(result, source)
 

diff --git a/crawler/filter.go b/crawler/filter.go
@@ -1,17 +1,24 @@
 package crawler
 
 import (
-	"github.com/pingc0y/URLFinder/config"
 	"net/url"
 	"regexp"
 	"strings"
+
+	"github.com/pingc0y/URLFinder/config"
 )
 
 // 过滤JS
 func jsFilter(str [][]string) [][]string {
 
-	//对不需要的数据过滤
+	// 对不需要的数据过滤
 	for i := range str {
+		// Strip the literal "%s%s:%s" placeholder first: url.QueryUnescape errors on a '%' not followed by two hex digits.
+		if strings.Contains(str[i][1], "%s%s:%s") {
+			str[i][1] = strings.Replace(str[i][1], "%s%s:%s", "", -1)
+			str[i][0] = strings.Replace(str[i][0], "%s%s:%s", "", -1)
+		}
+
 		str[i][0], _ = url.QueryUnescape(str[i][1])
 		str[i][0] = strings.TrimSpace(str[i][0])
 		str[i][0] = strings.Replace(str[i][0], " ", "", -1)
@@ -44,12 +51,18 @@ func urlFilter(str [][]string) [][]string {
 
 	//对不需要的数据过滤
 	for i := range str {
+		// Strip the literal "%s%s:%s" placeholder first: url.QueryUnescape errors on a '%' not followed by two hex digits.
+		if strings.Contains(str[i][1], "%s%s:%s") {
+			str[i][1] = strings.Replace(str[i][1], "%s%s:%s", "", -1)
+			str[i][0] = strings.Replace(str[i][0], "%s%s:%s", "", -1)
+		}
 		str[i][0], _ = url.QueryUnescape(str[i][1])
 		str[i][0] = strings.TrimSpace(str[i][0])
 		str[i][0] = strings.Replace(str[i][0], " ", "", -1)
 		str[i][0] = strings.Replace(str[i][0], "\\/", "/", -1)
 		str[i][0] = strings.Replace(str[i][0], "%3A", ":", -1)
 		str[i][0] = strings.Replace(str[i][0], "%2F", "/", -1)
+
 		//去除不存在字符串和数字的url,判断为错误数据
 		match, _ := regexp.MatchString("[a-zA-Z]+|[0-9]+", str[i][0])
 		if !match {

diff --git a/crawler/find.go b/crawler/find.go
@@ -1,16 +1,17 @@
 package crawler
 
 import (
+	"regexp"
+	"strings"
+
 	"github.com/pingc0y/URLFinder/cmd"
 	"github.com/pingc0y/URLFinder/config"
 	"github.com/pingc0y/URLFinder/mode"
 	"github.com/pingc0y/URLFinder/result"
-	"regexp"
-	"strings"
 )
 
 // 分析内容中的js
-func jsFind(cont, host, scheme, path, source string, num int) {
+func jsFind(cont, host, scheme, path, source string, num int, judge_base bool) {
 	var cata string
 	care := regexp.MustCompile("/.*/{1}|/")
 	catae := care.FindAllString(path, -1)
@@ -31,6 +32,12 @@ func jsFind(cont, host, scheme, path, source string, num int) {
 			if js[0] == "" {
 				continue
 			}
+
+			// Base handling: prefix the extracted JS link with the resolved base path.
+			if judge_base {
+				js[0] = path + js[0]
+			}
+
 			if strings.HasPrefix(js[0], "https:") || strings.HasPrefix(js[0], "http:") {
 				switch AppendJs(js[0], source) {
 				case 0:
@@ -95,7 +102,7 @@ func jsFind(cont, host, scheme, path, source string, num int) {
 }
 
 // 分析内容中的url
-func urlFind(cont, host, scheme, path, source string, num int) {
+func urlFind(cont, host, scheme, path, source string, num int, judge_base bool) {
 	var cata string
 	care := regexp.MustCompile("/.*/{1}|/")
 	catae := care.FindAllString(path, -1)
@@ -104,21 +111,27 @@ func urlFind(cont, host, scheme, path, source string, num int) {
 	} else {
 		cata = catae[0]
 	}
+
 	host = scheme + "://" + host
 
 	//url匹配正则
 
 	for _, re := range config.UrlFind {
 		reg := regexp.MustCompile(re)
 		urls := reg.FindAllStringSubmatch(cont, -1)
-		//fmt.Println(urls)
 		urls = urlFilter(urls)
 
 		//循环提取url放到结果中
 		for _, url := range urls {
 			if url[0] == "" {
 				continue
 			}
+
+			// Base handling: prefix the extracted URL with the resolved base path.
+			if judge_base {
+				url[0] = path + url[0]
+			}
+
 			if strings.HasPrefix(url[0], "https:") || strings.HasPrefix(url[0], "http:") {
 				switch AppendUrl(url[0], source) {
 				case 0: