Skip to content
104 changes: 77 additions & 27 deletions crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,16 @@ package crawler
import (
"compress/gzip"
"fmt"
"github.com/pingc0y/URLFinder/cmd"
"github.com/pingc0y/URLFinder/config"
"github.com/pingc0y/URLFinder/result"
"github.com/pingc0y/URLFinder/util"
"io"
"net/http"
"net/url"
"regexp"
"strings"

"github.com/pingc0y/URLFinder/cmd"
"github.com/pingc0y/URLFinder/config"
"github.com/pingc0y/URLFinder/result"
"github.com/pingc0y/URLFinder/util"
)

// Spider crawls a page and extracts its content.
Expand Down Expand Up @@ -53,6 +54,12 @@ func Spider(u string, num int) {
request.Header.Set("Accept-Encoding", "gzip") //使用gzip压缩传输数据让访问更快
request.Header.Set("User-Agent", util.GetUserAgent())
request.Header.Set("Accept", "*/*")
u_str, err := url.Parse(u)
if err != nil {
return
}
request.Header.Set("Referer", u_str.Scheme+"://"+u_str.Host) //####

//增加header选项
if cmd.C != "" {
request.Header.Set("Cookie", cmd.C)
Expand All @@ -62,27 +69,6 @@ func Spider(u string, num int) {
util.SetHeadersConfig(&request.Header)
}

//处理返回结果
//tr := &http.Transport{
// TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
//}
//client = &http.Client{Timeout: time.Duration(cmd.TI) * time.Second,
// Transport: tr,
// CheckRedirect: func(req *http.Request, via []*http.Request) error {
// if len(via) >= 10 {
// return fmt.Errorf("Too many redirects")
// }
// if len(via) > 0 {
// if via[0] != nil && via[0].URL != nil {
// result.Redirect[via[0].URL.String()] = true
// } else {
// result.Redirect[req.URL.String()] = true
// }
//
// }
// return nil
// },
//}
response, err := client.Do(request)
if err != nil {
return
Expand Down Expand Up @@ -115,6 +101,7 @@ func Spider(u string, num int) {
host := response.Request.URL.Host
scheme := response.Request.URL.Scheme
source := scheme + "://" + host + path
judge_base := false //####
//处理base标签
re := regexp.MustCompile("base.{1,5}href.{1,5}(http.+?//[^\\s]+?)[\"'‘“]")
base := re.FindAllStringSubmatch(result, -1)
Expand All @@ -127,13 +114,76 @@ func Spider(u string, num int) {
} else {
path = "/"
}
} else { // 处理 "base 标签"
re := regexp.MustCompile("(?i)base.{0,5}[:=]\\s*\"(.*?)\"")
base := re.FindAllStringSubmatch(result, -1)
if len(base) > 0 {
pattern := "[^.\\/\\w]"
re, _ := regexp.Compile(pattern)
// 检查字符串是否包含匹配的字符
result := re.MatchString(base[0][1])
if !result { // 字符串中没有其他特殊字符
if len(base[0][1]) > 1 && base[0][1][:2] == "./" { //base 路径从当前目录出发
judge_base = true
path = path[:strings.LastIndex(path, "/")] + base[0][1][1:]
} else if len(base[0][1]) > 2 && base[0][1][:3] == "../" { //base 路径从上一级目录出发
judge_base = true
pattern := "^[./]+$"
matched, _ := regexp.MatchString(pattern, base[0][1])
if matched { // 处理的 base 路径中只有 ./的
path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
} else {
find_str := ""
if strings.Contains(strings.TrimPrefix(base[0][1], "../"), "/") {
find_str = base[0][1][3 : strings.Index(strings.TrimPrefix(base[0][1], "../"), "/")+3]
} else {
find_str = base[0][1][3:]
}
if strings.Contains(path, find_str) {
path = path[:strings.Index(path, find_str)] + base[0][1][3:]
} else {
path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
}
}
} else if len(base[0][1]) > 4 && strings.HasPrefix(base[0][1], "http") { // base标签包含协议
judge_base = true
path = base[0][1]
} else if len(base[0][1]) > 0 {
judge_base = true
if base[0][1][0] == 47 { //base 路径从根目录出发
path = base[0][1]
} else { //base 路径未指明从哪出发
find_str := ""
if strings.Contains(base[0][1], "/") {
find_str = base[0][1][:strings.Index(base[0][1], "/")]
} else {
find_str = base[0][1]
}
if strings.Contains(path, find_str) {
path = path[:strings.Index(path, find_str)] + base[0][1]
} else {
path = path[:strings.LastIndex(path, "/")+1] + base[0][1]
}
}
}
if !strings.HasSuffix(path, "/") {
path += "/"
}
}
}
}

is = false
<-config.Ch
//提取js
jsFind(result, host, scheme, path, u, num)
jsFind(result, host, scheme, path, u, num, judge_base)
//提取url
urlFind(result, host, scheme, path, u, num)
urlFind(result, host, scheme, path, u, num, judge_base)
// 防止base判断错误
if judge_base {
jsFind(result, host, scheme, path, u, num, false)
urlFind(result, host, scheme, path, u, num, false)
}
//提取信息
infoFind(result, source)

Expand Down
17 changes: 15 additions & 2 deletions crawler/filter.go
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@
package crawler

import (
"github.com/pingc0y/URLFinder/config"
"net/url"
"regexp"
"strings"

"github.com/pingc0y/URLFinder/config"
)

// jsFilter filters the matched JS entries.
func jsFilter(str [][]string) [][]string {

//对不需要的数据过滤
// 对不需要的数据过滤
for i := range str {
// 针对QueryUnescape函数做出了简单的预先处理
if strings.Contains(str[i][1], "%s%s:%s") {
str[i][1] = strings.Replace(str[i][1], "%s%s:%s", "", -1)
str[i][0] = strings.Replace(str[i][0], "%s%s:%s", "", -1)
}

str[i][0], _ = url.QueryUnescape(str[i][1])
str[i][0] = strings.TrimSpace(str[i][0])
str[i][0] = strings.Replace(str[i][0], " ", "", -1)
Expand Down Expand Up @@ -44,12 +51,18 @@ func urlFilter(str [][]string) [][]string {

//对不需要的数据过滤
for i := range str {
// 针对QueryUnescape函数做出了简单的预先处理
if strings.Contains(str[i][1], "%s%s:%s") {
str[i][1] = strings.Replace(str[i][1], "%s%s:%s", "", -1)
str[i][0] = strings.Replace(str[i][0], "%s%s:%s", "", -1)
}
str[i][0], _ = url.QueryUnescape(str[i][1])
str[i][0] = strings.TrimSpace(str[i][0])
str[i][0] = strings.Replace(str[i][0], " ", "", -1)
str[i][0] = strings.Replace(str[i][0], "\\/", "/", -1)
str[i][0] = strings.Replace(str[i][0], "%3A", ":", -1)
str[i][0] = strings.Replace(str[i][0], "%2F", "/", -1)

//去除不存在字符串和数字的url,判断为错误数据
match, _ := regexp.MatchString("[a-zA-Z]+|[0-9]+", str[i][0])
if !match {
Expand Down
23 changes: 18 additions & 5 deletions crawler/find.go
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
package crawler

import (
"regexp"
"strings"

"github.com/pingc0y/URLFinder/cmd"
"github.com/pingc0y/URLFinder/config"
"github.com/pingc0y/URLFinder/mode"
"github.com/pingc0y/URLFinder/result"
"regexp"
"strings"
)

// jsFind analyzes the JS references found in the content.
func jsFind(cont, host, scheme, path, source string, num int) {
func jsFind(cont, host, scheme, path, source string, num int, judge_base bool) {
var cata string
care := regexp.MustCompile("/.*/{1}|/")
catae := care.FindAllString(path, -1)
Expand All @@ -31,6 +32,12 @@ func jsFind(cont, host, scheme, path, source string, num int) {
if js[0] == "" {
continue
}

// base标签的处理 ####
if judge_base {
js[0] = path + js[0]
}

if strings.HasPrefix(js[0], "https:") || strings.HasPrefix(js[0], "http:") {
switch AppendJs(js[0], source) {
case 0:
Expand Down Expand Up @@ -95,7 +102,7 @@ func jsFind(cont, host, scheme, path, source string, num int) {
}

// urlFind analyzes the URLs found in the content.
func urlFind(cont, host, scheme, path, source string, num int) {
func urlFind(cont, host, scheme, path, source string, num int, judge_base bool) {
var cata string
care := regexp.MustCompile("/.*/{1}|/")
catae := care.FindAllString(path, -1)
Expand All @@ -104,21 +111,27 @@ func urlFind(cont, host, scheme, path, source string, num int) {
} else {
cata = catae[0]
}

host = scheme + "://" + host

//url匹配正则

for _, re := range config.UrlFind {
reg := regexp.MustCompile(re)
urls := reg.FindAllStringSubmatch(cont, -1)
//fmt.Println(urls)
urls = urlFilter(urls)

//循环提取url放到结果中
for _, url := range urls {
if url[0] == "" {
continue
}

// base标签的处理 ####
if judge_base {
url[0] = path + url[0]
}

if strings.HasPrefix(url[0], "https:") || strings.HasPrefix(url[0], "http:") {
switch AppendUrl(url[0], source) {
case 0:
Expand Down
Loading