diff --git a/crawler/crawler.go b/crawler/crawler.go
index 0a50caf..a0113c2 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -3,15 +3,16 @@ package crawler
 import (
     "compress/gzip"
    "fmt"
-   "github.com/pingc0y/URLFinder/cmd"
-   "github.com/pingc0y/URLFinder/config"
-   "github.com/pingc0y/URLFinder/result"
-   "github.com/pingc0y/URLFinder/util"
    "io"
    "net/http"
    "net/url"
    "regexp"
    "strings"
+
+   "github.com/pingc0y/URLFinder/cmd"
+   "github.com/pingc0y/URLFinder/config"
+   "github.com/pingc0y/URLFinder/result"
+   "github.com/pingc0y/URLFinder/util"
 )
 
 // Spider crawls the page content
@@ -53,6 +54,12 @@ func Spider(u string, num int) {
    request.Header.Set("Accept-Encoding", "gzip") // gzip-compress transfers for faster access
    request.Header.Set("User-Agent", util.GetUserAgent())
    request.Header.Set("Accept", "*/*")
+   u_str, err := url.Parse(u)
+   if err != nil {
+       return
+   }
+   request.Header.Set("Referer", u_str.Scheme+"://"+u_str.Host) //####
+
    // extra header options
    if cmd.C != "" {
        request.Header.Set("Cookie", cmd.C)
@@ -62,27 +69,6 @@ func Spider(u string, num int) {
        util.SetHeadersConfig(&request.Header)
    }
 
-   //处理返回结果
-   //tr := &http.Transport{
-   //  TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
-   //}
-   //client = &http.Client{Timeout: time.Duration(cmd.TI) * time.Second,
-   //  Transport: tr,
-   //  CheckRedirect: func(req *http.Request, via []*http.Request) error {
-   //      if len(via) >= 10 {
-   //          return fmt.Errorf("Too many redirects")
-   //      }
-   //      if len(via) > 0 {
-   //          if via[0] != nil && via[0].URL != nil {
-   //              result.Redirect[via[0].URL.String()] = true
-   //          } else {
-   //              result.Redirect[req.URL.String()] = true
-   //          }
-   //
-   //      }
-   //      return nil
-   //  },
-   //}
    response, err := client.Do(request)
    if err != nil {
        return
    }
@@ -115,6 +101,7 @@ func Spider(u string, num int) {
    host := response.Request.URL.Host
    scheme := response.Request.URL.Scheme
    source := scheme + "://" + host + path
+   judge_base := false //####
    // handle the base tag
    re := regexp.MustCompile("base.{1,5}href.{1,5}(http.+?//[^\\s]+?)[\"'‘“]")
    base := re.FindAllStringSubmatch(result, -1)
@@ -127,13 +114,76 @@ func Spider(u string, num int) {
        } else {
            path = "/"
        }
+   } else { // handle base "tags" written without an absolute href
+       re := regexp.MustCompile("(?i)base.{0,5}[:=]\\s*\"(.*?)\"")
+       base := re.FindAllStringSubmatch(result, -1)
+       if len(base) > 0 {
+           pattern := "[^.\\/\\w]"
+           re, _ := regexp.Compile(pattern)
+           // check whether the string contains disallowed characters
+           result := re.MatchString(base[0][1])
+           if !result { // the string has no other special characters
+               if len(base[0][1]) > 1 && base[0][1][:2] == "./" { // base path starts from the current directory
+                   judge_base = true
+                   path = path[:strings.LastIndex(path, "/")] + base[0][1][1:]
+               } else if len(base[0][1]) > 2 && base[0][1][:3] == "../" { // base path starts from the parent directory
+                   judge_base = true
+                   pattern := "^[./]+$"
+                   matched, _ := regexp.MatchString(pattern, base[0][1])
+                   if matched { // the base path contains only "." and "/" characters
+                       path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+                   } else {
+                       find_str := ""
+                       if strings.Contains(strings.TrimPrefix(base[0][1], "../"), "/") {
+                           find_str = base[0][1][3 : strings.Index(strings.TrimPrefix(base[0][1], "../"), "/")+3]
+                       } else {
+                           find_str = base[0][1][3:]
+                       }
+                       if strings.Contains(path, find_str) {
+                           path = path[:strings.Index(path, find_str)] + base[0][1][3:]
+                       } else {
+                           path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+                       }
+                   }
+               } else if len(base[0][1]) > 4 && strings.HasPrefix(base[0][1], "http") { // the base value carries a scheme
+                   judge_base = true
+                   path = base[0][1]
+               } else if len(base[0][1]) > 0 {
+                   judge_base = true
+                   if base[0][1][0] == 47 { // base path starts at the root directory (47 == '/')
+                       path = base[0][1]
+                   } else { // base path does not say where it starts from
+                       find_str := ""
+                       if strings.Contains(base[0][1], "/") {
+                           find_str = base[0][1][:strings.Index(base[0][1], "/")]
+                       } else {
+                           find_str = base[0][1]
+                       }
+                       if strings.Contains(path, find_str) {
+                           path = path[:strings.Index(path, find_str)] + base[0][1]
+                       } else {
+                           path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+                       }
+                   }
+               }
+               if !strings.HasSuffix(path, "/") {
+                   path += "/"
+               }
+           }
+       }
    }
+   is = false
    <-config.Ch
    // extract js
-   jsFind(result, host, scheme, path, u, num)
+   jsFind(result, host, scheme, path, u, num, judge_base)
    // extract urls
-   urlFind(result, host, scheme, path, u, num)
+   urlFind(result, host, scheme, path, u, num, judge_base)
+   // guard against a wrong base-tag guess
+   if judge_base {
+       jsFind(result, host, scheme, path, u, num, false)
+       urlFind(result, host, scheme, path, u, num, false)
+   }
    // extract info
    infoFind(result, source)
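
The new else-branch above only rewrites path; the actual prefixing happens later in jsFind/urlFind once judge_base is set. A minimal standalone sketch of the "./" case, using the same string surgery with hypothetical values:

    package main

    import (
        "fmt"
        "strings"
    )

    func main() {
        path := "/app/pages/index.html" // path of the crawled page
        base := "./static/"             // value captured from the base tag

        // same operation as the "./" branch: drop the file name
        // from path, then splice in the base value
        resolved := path[:strings.LastIndex(path, "/")] + base[1:]
        fmt.Println(resolved) // /app/pages/static/
    }
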
diff --git a/crawler/filter.go b/crawler/filter.go
index e0b5ebe..e27281d 100644
--- a/crawler/filter.go
+++ b/crawler/filter.go
@@ -1,17 +1,24 @@
 package crawler
 
 import (
-   "github.com/pingc0y/URLFinder/config"
    "net/url"
    "regexp"
    "strings"
+
+   "github.com/pingc0y/URLFinder/config"
 )
 
 // filter JS
 func jsFilter(str [][]string) [][]string {
 
-   //对不需要的数据过滤
+   // filter out unwanted data
    for i := range str {
+       // simple pre-processing for url.QueryUnescape
+       if strings.Contains(str[i][1], "%s%s:%s") {
+           str[i][1] = strings.Replace(str[i][1], "%s%s:%s", "", -1)
+           str[i][0] = strings.Replace(str[i][0], "%s%s:%s", "", -1)
+       }
+
        str[i][0], _ = url.QueryUnescape(str[i][1])
        str[i][0] = strings.TrimSpace(str[i][0])
        str[i][0] = strings.Replace(str[i][0], " ", "", -1)
@@ -44,12 +51,18 @@ func urlFilter(str [][]string) [][]string {
 
    // filter out unwanted data
    for i := range str {
+       // simple pre-processing for url.QueryUnescape
+       if strings.Contains(str[i][1], "%s%s:%s") {
+           str[i][1] = strings.Replace(str[i][1], "%s%s:%s", "", -1)
+           str[i][0] = strings.Replace(str[i][0], "%s%s:%s", "", -1)
+       }
        str[i][0], _ = url.QueryUnescape(str[i][1])
        str[i][0] = strings.TrimSpace(str[i][0])
        str[i][0] = strings.Replace(str[i][0], " ", "", -1)
        str[i][0] = strings.Replace(str[i][0], "\\/", "/", -1)
        str[i][0] = strings.Replace(str[i][0], "%3A", ":", -1)
        str[i][0] = strings.Replace(str[i][0], "%2F", "/", -1)
+
        // drop urls containing no letters or digits; treat them as bad data
        match, _ := regexp.MatchString("[a-zA-Z]+|[0-9]+", str[i][0])
        if !match {
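
The "%s%s:%s" pre-processing exists because url.QueryUnescape rejects malformed percent-escapes such as the printf-style "%s": on error it returns an empty string, so str[i][0] would be blanked and the candidate lost. A small demonstration of the failure and the fix (the input string is made up):

    package main

    import (
        "fmt"
        "net/url"
        "strings"
    )

    func main() {
        raw := "/api/%s%s:%s/user"

        // "%s" is not a valid percent-escape, so unescaping fails outright
        if _, err := url.QueryUnescape(raw); err != nil {
            fmt.Println("unescape failed:", err)
        }

        // stripping the placeholder first, as the filter does, lets it succeed
        cleaned := strings.Replace(raw, "%s%s:%s", "", -1)
        out, _ := url.QueryUnescape(cleaned)
        fmt.Println(out) // /api//user
    }
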
diff --git a/crawler/find.go b/crawler/find.go
index 3e06e2e..85cf6a8 100644
--- a/crawler/find.go
+++ b/crawler/find.go
@@ -1,16 +1,17 @@
 package crawler
 
 import (
+   "regexp"
+   "strings"
+
    "github.com/pingc0y/URLFinder/cmd"
    "github.com/pingc0y/URLFinder/config"
    "github.com/pingc0y/URLFinder/mode"
    "github.com/pingc0y/URLFinder/result"
-   "regexp"
-   "strings"
 )
 
 // find js references in the content
-func jsFind(cont, host, scheme, path, source string, num int) {
+func jsFind(cont, host, scheme, path, source string, num int, judge_base bool) {
    var cata string
    care := regexp.MustCompile("/.*/{1}|/")
    catae := care.FindAllString(path, -1)
@@ -31,6 +32,12 @@ func jsFind(cont, host, scheme, path, source string, num int) {
        if js[0] == "" {
            continue
        }
+
+       // base-tag handling ####
+       if judge_base {
+           js[0] = path + js[0]
+       }
+
        if strings.HasPrefix(js[0], "https:") || strings.HasPrefix(js[0], "http:") {
            switch AppendJs(js[0], source) {
            case 0:
@@ -95,7 +102,7 @@ func jsFind(cont, host, scheme, path, source string, num int) {
 }
 
 // find urls in the content
-func urlFind(cont, host, scheme, path, source string, num int) {
+func urlFind(cont, host, scheme, path, source string, num int, judge_base bool) {
    var cata string
    care := regexp.MustCompile("/.*/{1}|/")
    catae := care.FindAllString(path, -1)
@@ -104,6 +111,7 @@ func urlFind(cont, host, scheme, path, source string, num int) {
    } else {
        cata = catae[0]
    }
+
    host = scheme + "://" + host
 
    // url-matching regexes
@@ -111,7 +119,6 @@ func urlFind(cont, host, scheme, path, source string, num int) {
    for _, re := range config.UrlFind {
        reg := regexp.MustCompile(re)
        urls := reg.FindAllStringSubmatch(cont, -1)
-       //fmt.Println(urls)
        urls = urlFilter(urls)
 
        // loop over the extracted urls and add them to the results
@@ -119,6 +126,12 @@ func urlFind(cont, host, scheme, path, source string, num int) {
            if url[0] == "" {
                continue
            }
+
+           // base-tag handling ####
+           if judge_base {
+               url[0] = path + url[0]
+           }
+
            if strings.HasPrefix(url[0], "https:") || strings.HasPrefix(url[0], "http:") {
                switch AppendUrl(url[0], source) {
                case 0:
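
When judge_base is set, jsFind and urlFind prefix every relative candidate with the resolved base path; the callers then run a second pass with judge_base false, so a wrong base guess can only add extra candidates rather than hide real ones (util.RemoveRepeatElement later collapses duplicates, and the state checks weed out dead candidates). A toy sketch of that two-pass emission, with a hypothetical emit helper standing in for the two jsFind/urlFind calls:

    package main

    import "fmt"

    // emit mimics the pair of calls: one pass with the base prefix
    // applied, one pass with the raw value
    func emit(link, path string, judgeBase bool) []string {
        if judgeBase {
            return []string{path + link, link}
        }
        return []string{link}
    }

    func main() {
        fmt.Println(emit("js/app.js", "/static/", true))
        // [/static/js/app.js js/app.js]
    }
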
diff --git a/crawler/run.go b/crawler/run.go
index baa0cc2..d21d996 100644
--- a/crawler/run.go
+++ b/crawler/run.go
@@ -3,25 +3,35 @@ package crawler
 import (
    "bufio"
    "crypto/tls"
+   "encoding/json"
    "flag"
    "fmt"
-   "github.com/pingc0y/URLFinder/cmd"
-   "github.com/pingc0y/URLFinder/config"
-   "github.com/pingc0y/URLFinder/mode"
-   "github.com/pingc0y/URLFinder/result"
-   "github.com/pingc0y/URLFinder/util"
    "io"
    "net"
    "net/http"
    "net/url"
    "os"
    "regexp"
+   "strconv"
    "strings"
    "time"
+
+   "github.com/go-rod/rod"
+   "github.com/go-rod/rod/lib/launcher"
+   "github.com/go-rod/rod/lib/proto"
+   "github.com/pingc0y/URLFinder/cmd"
+   "github.com/pingc0y/URLFinder/config"
+   "github.com/pingc0y/URLFinder/mode"
+   "github.com/pingc0y/URLFinder/result"
+   "github.com/pingc0y/URLFinder/util"
 )
 
 var client *http.Client
 
+// response bodies and headers captured while the entry page loads
+var ResBodyMap = make(map[string]string)
+var ResHeaderMap = make(map[string]proto.NetworkHeaders)
+
 func load() {
    if cmd.I {
@@ -177,13 +187,23 @@ func ValidateFF() {
    for i, s := range result.ResultJs {
        config.Wg.Add(1)
        config.Jsch <- 1
-       go JsState(s.Url, i, result.ResultJs[i].Source)
+       // check whether the response was already stored during page load
+       rod_flag := false
+       if len(ResBodyMap[s.Url]) != 0 {
+           rod_flag = true
+       }
+       go JsState(s.Url, i, result.ResultJs[i].Source, rod_flag)
    }
    // verify URL status
    for i, s := range result.ResultUrl {
        config.Wg.Add(1)
        config.Urlch <- 1
-       go UrlState(s.Url, i)
+       // check whether the response was already stored during page load
+       rod_flag := false
+       if len(ResBodyMap[s.Url]) != 0 {
+           rod_flag = true
+       }
+       go UrlState(s.Url, i, rod_flag)
    }
 
    config.Wg.Wait()
@@ -199,14 +219,240 @@ func ValidateFF() {
    AddSource()
 }
 
+// url_parse takes a URL string and returns its host, scheme and path
+func url_parse(u string) (string, string, string) {
+   // parse u into a URL object
+   u_str, err := url.Parse(u)
+   // on a parse error, return empty strings
+   if err != nil {
+       return "", "", ""
+   }
+   // read the host, scheme and path of the URL
+   host := u_str.Host
+   scheme := u_str.Scheme
+   path := u_str.Path
+   // return the three values
+   return host, scheme, path
+}
+
+// extractBase pulls base-tag information out of a response body
+func extractBase(host, scheme, path, result string) (string, string, string, bool) {
+   judge_base := false
+   // handle the base tag
+   re := regexp.MustCompile("base.{1,5}href.{1,5}(http.+?//[^\\s]+?)[\"'‘“]")
+   base := re.FindAllStringSubmatch(result, -1)
+   if len(base) > 0 {
+       host = regexp.MustCompile("http.*?//([^/]+)").FindAllStringSubmatch(base[0][1], -1)[0][1]
+       scheme = regexp.MustCompile("(http.*?)://").FindAllStringSubmatch(base[0][1], -1)[0][1]
+       paths := regexp.MustCompile("http.*?//.*?(/.*)").FindAllStringSubmatch(base[0][1], -1)
+       if len(paths) > 0 {
+           path = paths[0][1]
+       } else {
+           path = "/"
+       }
+   } else { // handle base "tags" written without an absolute href
+       re := regexp.MustCompile("(?i)base.{0,5}[:=]\\s*\"(.*?)\"")
+       base := re.FindAllStringSubmatch(result, -1)
+       if len(base) > 0 {
+           pattern := "[^.\\/\\w]"
+           re, _ := regexp.Compile(pattern)
+           // check whether the string contains disallowed characters
+           result := re.MatchString(base[0][1])
+           if !result { // the string has no other special characters
+               if len(base[0][1]) > 1 && base[0][1][:2] == "./" { // base path starts from the current directory
+                   judge_base = true
+                   path = path[:strings.LastIndex(path, "/")] + base[0][1][1:]
+               } else if len(base[0][1]) > 2 && base[0][1][:3] == "../" { // base path starts from the parent directory
+                   judge_base = true
+                   pattern := "^[./]+$"
+                   matched, _ := regexp.MatchString(pattern, base[0][1])
+                   if matched { // the base path contains only "." and "/" characters
+                       path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+                   } else {
+                       find_str := ""
+                       if strings.Contains(strings.TrimPrefix(base[0][1], "../"), "/") {
+                           find_str = base[0][1][3 : strings.Index(strings.TrimPrefix(base[0][1], "../"), "/")+3]
+                       } else {
+                           find_str = base[0][1][3:]
+                       }
+                       if strings.Contains(path, find_str) {
+                           path = path[:strings.Index(path, find_str)] + base[0][1][3:]
+                       } else {
+                           path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+                       }
+                   }
+               } else if len(base[0][1]) > 4 && strings.HasPrefix(base[0][1], "http") { // the base value carries a scheme
+                   judge_base = true
+                   path = base[0][1]
+               } else if len(base[0][1]) > 0 {
+                   judge_base = true
+                   if base[0][1][0] == 47 { // base path starts at the root directory (47 == '/')
+                       path = base[0][1]
+                   } else { // base path does not say where it starts from
+                       find_str := ""
+                       if strings.Contains(base[0][1], "/") {
+                           find_str = base[0][1][:strings.Index(base[0][1], "/")]
+                       } else {
+                           find_str = base[0][1]
+                       }
+                       if strings.Contains(path, find_str) {
+                           path = path[:strings.Index(path, find_str)] + base[0][1]
+                       } else {
+                           path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
+                       }
+                   }
+               }
+               if !strings.HasSuffix(path, "/") {
+                   path += "/"
+               }
+           }
+       }
+   }
+   return host, scheme, path, judge_base
+}
+
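
The absolute-href branch of extractBase derives host, scheme and path with three small regexps. A standalone check of those exact expressions against a hypothetical base URL:

    package main

    import (
        "fmt"
        "regexp"
    )

    func main() {
        base := "https://cdn.example.com/assets/"

        // the same three expressions extractBase applies to an absolute href
        host := regexp.MustCompile("http.*?//([^/]+)").FindAllStringSubmatch(base, -1)[0][1]
        scheme := regexp.MustCompile("(http.*?)://").FindAllStringSubmatch(base, -1)[0][1]
        paths := regexp.MustCompile("http.*?//.*?(/.*)").FindAllStringSubmatch(base, -1)

        path := "/"
        if len(paths) > 0 {
            path = paths[0][1]
        }
        fmt.Println(host, scheme, path) // cdn.example.com https /assets/
    }
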
+ Set("disable-plugins").MustLaunch() + browser := rod.New().ControlURL(launch).MustConnect() + + // 添加关闭 + defer browser.Close() + + // 设置浏览器的证书错误处理,忽略所有证书错误 + browser.MustIgnoreCertErrors(true) + + // 设置浏览器打开的页面 + pageTarget := proto.TargetCreateTarget{URL: u} + page, err := browser.Page(pageTarget) + if err != nil { + fmt.Println(err) + } + + // 在最后关闭页面 + defer func() { + err := page.Close() + if err != nil { + fmt.Println(err) + } + }() + + // 设置页面的超时时间为 40 秒 + page = page.Timeout(40 * time.Second) + + // 创建一个空的 map,键是 proto.NetworkRequestID 类型,值是 string 类型 + requestMap := make(map[string]string, 0) + + // 使用 go 语句开启一个协程,在协程中处理页面的一些事件 + go page.EachEvent(func(e *proto.PageJavascriptDialogOpening) { + // 处理 JavaScript 对话框 + _ = proto.PageHandleJavaScriptDialog{Accept: true, PromptText: ""}.Call(page) + }, func(e *proto.NetworkResponseReceived) { + // 获取请求的 ID 和 URL + ResponseURL := e.Response.URL + // fmt.Println(e.Response.URL, e.RequestID) + ResHeaderMap[ResponseURL] = e.Response.Headers + + // 在 requestMap 中填充数据 + requestMap[ResponseURL] = "" + + })() + + // 等待页面加载完成,并处理可能出现的错误 + pageLoadErr := page.WaitLoad() + if pageLoadErr != nil { + fmt.Println(pageLoadErr) + } + + // 等待页面的 DOM 结构稳定 + page.WaitStable(2 * time.Second) + + // 打印页面源码 + htmlStr, err := page.HTML() + if err != nil { + fmt.Println(err) + } + + for url, _ := range requestMap { + // 调用 page.GetResource 方法来获取响应体 + ResponseBody, _ := page.GetResource(url) + requestMap[url] = string(ResponseBody) + } + + // 存储页面源码 + requestMap[u] = string(htmlStr) + // fmt.Println(requestMap[u]) + + // 遍历响应体,提取 Base 标签、提取 js 、提取 url 、 + for url, body := range requestMap { + // 判断响应体是否为空 + if len(body) == 0 { + continue + } + + // 遍历 BodyFiler 切片中的每个元素 + re := regexp.MustCompile("\\.jpeg\\?|\\.jpg\\?|\\.png\\?|.gif\\?|www\\.w3\\.org|example\\.com|.*,$|.*\\.jpeg$|.*\\.jpg$|.*\\.png$|.*\\.gif$|.*\\.ico$|.*\\.svg$|.*\\.vue$|.*\\.ts$") + if re.MatchString(url) { + continue + } + + // 添加body数据 + ResBodyMap[url] = body + + // 将响应头数据转换成map存储 + Res_header := make(map[string]string, 0) + if len(ResHeaderMap[url]) != 0 { + data, err := json.Marshal(ResHeaderMap[url]) + if err != nil { + fmt.Println(err) + } + err = json.Unmarshal(data, &Res_header) + if err != nil { + fmt.Println(err) + } + } + + // 添加首页动态加载的数据 + if strings.HasSuffix(url, ".js") || strings.Contains(url, ".js?") { + result.ResultJs = append(result.ResultJs, mode.Link{Url: url, Status: strconv.Itoa(200), Size: strconv.Itoa(len(body)), ResponseHeaders: Res_header, ResponseBody: body}) + } else { + result.ResultUrl = append(result.ResultUrl, mode.Link{Url: url, Status: strconv.Itoa(200), Size: strconv.Itoa(len(body)), ResponseHeaders: Res_header, ResponseBody: body}) + } + + host, scheme, path := url_parse(url) + + judge_base := false + host, scheme, path, judge_base = extractBase(host, scheme, path, body) + + //提取js + jsFind(body, host, scheme, path, u, num, judge_base) + //提取url + urlFind(body, host, scheme, path, u, num, judge_base) + // 防止base判断错误 + if judge_base { + jsFind(body, host, scheme, path, u, num, false) + urlFind(body, host, scheme, path, u, num, false) + } + + } + +} + func start(u string) { fmt.Println("Target URL: " + u) - config.Wg.Add(1) - config.Ch <- 1 - go Spider(u, 1) - config.Wg.Wait() - config.Progress = 1 - fmt.Printf("\r\nSpider OK \n") + + // config.Wg.Add(1) + // config.Ch <- 1 + // go Spider(u, 1) // ### + rod_spider(u, 1) + // config.Wg.Wait() + // config.Progress = 1 + + fmt.Printf("\r\nRod_Spider OK \n") result.ResultUrl = 
diff --git a/crawler/state.go b/crawler/state.go
index 773946f..6034591 100644
--- a/crawler/state.go
+++ b/crawler/state.go
@@ -1,27 +1,34 @@
 package crawler
 
 import (
-   "github.com/pingc0y/URLFinder/cmd"
-   "github.com/pingc0y/URLFinder/config"
-   "github.com/pingc0y/URLFinder/mode"
-   "github.com/pingc0y/URLFinder/result"
-   "github.com/pingc0y/URLFinder/util"
    "io"
    "net/http"
    "net/url"
    "regexp"
    "strconv"
    "strings"
+
+   "github.com/pingc0y/URLFinder/cmd"
+   "github.com/pingc0y/URLFinder/config"
+   "github.com/pingc0y/URLFinder/mode"
+   "github.com/pingc0y/URLFinder/result"
+   "github.com/pingc0y/URLFinder/util"
 )
 
 // check the status code of a js link
-func JsState(u string, i int, sou string) {
+func JsState(u string, i int, sou string, rod_flag bool) {
    defer func() {
        config.Wg.Done()
        <-config.Jsch
        PrintProgress()
    }()
+
+   // the dynamically loaded response was already stored
+   if rod_flag {
+       return
+   }
+
    if cmd.S == "" {
        result.ResultJs[i].Url = u
        return
@@ -53,6 +60,12 @@ func JsState(u string, i int, sou string) {
    // extra header options
    request.Header.Set("User-Agent", util.GetUserAgent())
    request.Header.Set("Accept", "*/*")
+   u_str, err := url.Parse(u)
+   if err != nil {
+       return
+   }
+   request.Header.Set("Referer", u_str.Scheme+"://"+u_str.Host) //####
+
    // load the yaml config
    if cmd.I {
        util.SetHeadersConfig(&request.Header)
@@ -99,25 +112,41 @@ func JsState(u string, i int, sou string) {
    } else {
        length = len(dataBytes)
    }
+
+   res_body := string(dataBytes)
+   res_headers := make(map[string]string)
+   // walk every key/value pair in the response headers
+   for k, v := range response.Header {
+       // keep the first value of each header; skip empty ones
+       if len(v) > 0 {
+           res_headers[k] = v[0]
+       }
+   }
+
    config.Lock.Lock()
    if result.Redirect[ur.String()] {
        code = 302
        redirect = response.Request.URL.String()
    }
    config.Lock.Unlock()
-   result.ResultJs[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Redirect: redirect}
+   result.ResultJs[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Redirect: redirect, ResponseHeaders: res_headers, ResponseBody: res_body}
    } else {
        result.ResultJs[i].Url = ""
    }
 }
 
 // check the status code of a url
-func UrlState(u string, i int) {
+func UrlState(u string, i int, rod_flag bool) {
    defer func() {
        config.Wg.Done()
        <-config.Urlch
        PrintProgress()
    }()
+
+   // the dynamically loaded response was already stored
+   if rod_flag {
+       return
+   }
    if cmd.S == "" {
        result.ResultUrl[i].Url = u
        return
@@ -148,6 +177,11 @@ func UrlState(u string, i int) {
    // extra header options
    request.Header.Set("User-Agent", util.GetUserAgent())
    request.Header.Set("Accept", "*/*")
+   u_str, err := url.Parse(u)
+   if err != nil {
+       return
+   }
+   request.Header.Set("Referer", u_str.Scheme+"://"+u_str.Host) //####
    // load the yaml config
    if cmd.I {
        util.SetHeadersConfig(&request.Header)
@@ -194,9 +228,19 @@ func UrlState(u string, i int) {
    } else {
        length = len(dataBytes)
    }
-   body := string(dataBytes)
+
+   res_body := string(dataBytes)
+   res_headers := make(map[string]string)
+   // walk every key/value pair in the response headers
+   for k, v := range response.Header {
+       // keep the first value of each header; skip empty ones
+       if len(v) > 0 {
+           res_headers[k] = v[0]
+       }
+   }
+
    re := regexp.MustCompile("<[tT]itle>(.*?)</[tT]itle>")
-   title := re.FindAllStringSubmatch(body, -1)
+   title := re.FindAllStringSubmatch(res_body, -1)
    config.Lock.Lock()
    if result.Redirect[ur.String()] {
        code = 302
@@ -205,9 +249,9 @@ func UrlState(u string, i int) {
    }
    config.Lock.Unlock()
    if len(title) != 0 {
-       result.ResultUrl[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Title: title[0][1], Redirect: redirect}
+       result.ResultUrl[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Title: title[0][1], Redirect: redirect, ResponseHeaders: res_headers, ResponseBody: res_body}
    } else {
-       result.ResultUrl[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Redirect: redirect}
+       result.ResultUrl[i] = mode.Link{Url: u, Status: strconv.Itoa(code), Size: strconv.Itoa(length), Redirect: redirect, ResponseHeaders: res_headers, ResponseBody: res_body}
    }
    } else {
        result.ResultUrl[i].Url = ""
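
The header loop added to JsState and UrlState flattens http.Header (a map[string][]string) into the new map[string]string field, keeping only the first value of each header; repeated headers such as multiple Set-Cookie lines lose everything after the first. The same transformation in isolation:

    package main

    import (
        "fmt"
        "net/http"
    )

    // flatten keeps the first value of each header, as JsState/UrlState do
    func flatten(h http.Header) map[string]string {
        out := make(map[string]string, len(h))
        for k, v := range h {
            if len(v) > 0 {
                out[k] = v[0]
            }
        }
        return out
    }

    func main() {
        h := http.Header{
            "Content-Type": {"text/html"},
            "Set-Cookie":   {"a=1", "b=2"}, // the second value is dropped
        }
        fmt.Println(flatten(h))
    }
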
diff --git a/mode/mode.go b/mode/mode.go
index 300e702..454c7b2 100644
--- a/mode/mode.go
+++ b/mode/mode.go
@@ -18,12 +18,14 @@ type Config struct {
 }
 
 type Link struct {
-   Url      string
-   Status   string
-   Size     string
-   Title    string
-   Redirect string
-   Source   string
+   Url             string
+   Status          string
+   Size            string
+   Title           string
+   Redirect        string
+   Source          string
+   ResponseHeaders map[string]string
+   ResponseBody    string
 }
 
 type Info struct {