-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.rb
More file actions
79 lines (67 loc) · 2.05 KB
/
scrape.rb
File metadata and controls
79 lines (67 loc) · 2.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# frozen_string_literal: true
require 'csv'
require 'httparty'
require 'nokogiri'
load 'create_epub.rb'
load 'utils.rb'
# Downloads one chapter page and converts it into an EPUB-ready HTML fragment.
#
# The page is fetched with HTTParty and parsed with Nokogiri. Everything is
# read from the elements matching `.cont`: the title is the text of all `<b>`
# elements inside it, the heading is the parent element of the first `<b>`,
# and the chapter text is the markup of the `.contson` elements.
#
# @param url [String] absolute URL of the chapter page
# @return [Array(String, String), nil] `[title, content]`, or nil when the
#   response status is not 200
def scrape_chapter(url)
  response = HTTParty.get(url)
  return unless response.code == 200

  container = Nokogiri::HTML4(response.body).css('.cont')
  headings = container.css('b')
  # A page without any <b> heading used to crash on `nil.parent`; fall back to
  # an empty <h1> so the chapter text is still kept.
  heading_block = headings.first&.parent
  chapter_markup = container.css('.contson').to_s.strip
  content = "<div><h1>#{heading_block}</h1>#{chapter_markup}</div>"
  # Echo the assembled fragment to stdout, as the script has always done.
  puts content
  [headings.text, content]
end
# Resolves one table-of-contents anchor to a chapter and scrapes it.
#
# Anchors whose text is exactly '跋' are skipped. Anchors that carry no
# `href` attribute are skipped as well; previously they raised a TypeError
# when the missing value was appended to the base URL.
#
# @param chapter_a [#text, #attribute] anchor node taken from the book page
# @param base_url [String] site root that the anchor's href is appended to
# @return [Array(String, String), nil] whatever scrape_chapter returns, or
#   nil when the anchor is skipped
def parse_chapter_a(chapter_a, base_url)
  chapter_title = chapter_a.text
  return if chapter_title == '跋'

  href = chapter_a.attribute('href')
  return unless href

  url = "#{base_url}#{href}"
  puts "下载 #{chapter_title} 中 ( #{url} )"
  scrape_chapter(url)
end
# Scrapes every chapter linked from a book page and caches the result as CSV.
#
# Each `span a` anchor on the book page is handed to parse_chapter_a; every
# chapter it returns is appended to the CSV and to the returned list.
#
# Rows are first written to "<csv path>.part" and the file is renamed to its
# final name only after all chapters were scraped. The final path therefore
# never holds a truncated book: if scraping raises part-way through, the
# partial file is removed and the error propagates.
#
# @param book_url [String] URL of the book's table-of-contents page
# @param title [String] book title, passed to get_file_path to locate the CSV
# @param base_url [String] site root forwarded to parse_chapter_a
# @return [Array<Array(String, String)>, nil] scraped `[title, content]`
#   pairs, or nil when the book page does not answer with status 200
def create_csv(book_url, title, base_url)
  response = HTTParty.get(book_url)
  return unless response.code == 200

  csv_path = get_file_path(title, 'csv')
  partial_path = "#{csv_path}.part"
  chapters = []
  CSV.open(partial_path, 'w+', write_headers: false, headers: %w[Title Body]) do |csv|
    Nokogiri::HTML4(response.body).css('span a').each do |chapter_a|
      chapter = parse_chapter_a(chapter_a, base_url)
      next unless chapter

      chapters << chapter
      csv << chapter
    end
  end
  File.rename(partial_path, csv_path)
  chapters
ensure
  # Only present when scraping was interrupted before the rename.
  File.delete(partial_path) if partial_path && File.exist?(partial_path)
end
# Loads the cached chapters of a book from its CSV file.
#
# The path is exactly what get_file_path returns, i.e. the same path that
# create_csv writes to and that scrape tests with File.exist?. Resolving it
# against the script directory instead (as was done before) could point at a
# different file when get_file_path yields a relative path and the script is
# started from another working directory.
#
# @param title [String] book title, passed to get_file_path to locate the CSV
# @return [Array<Array<String>>] one array of fields per CSV row
def read_csv(title)
  CSV.read(get_file_path(title, 'csv'))
end
# Builds the EPUB for one book, reusing the CSV cache when it exists.
#
# When a CSV for the title is already on disk its rows are loaded with
# read_csv; otherwise the book is downloaded with create_csv. Either result is
# handed to generate_epub together with the book metadata. To force a fresh
# download, call create_csv unconditionally instead of consulting the cache.
#
# @param book_url [String] URL of the book's table-of-contents page
# @param base_url [String] site root used to resolve chapter links
# @param title [String] book title
# @param author [String] book author
# @param contributors [Array<String>] contributor names
# @return [Object] whatever generate_epub returns
def scrape(book_url, base_url, title, author, contributors)
  cached = File.exist?(get_file_path(title, 'csv'))
  chapters = cached ? read_csv(title) : create_csv(book_url, title, base_url)
  generate_epub(book_url, title, author, contributors, chapters)
end
# Driver: build an EPUB for every book listed in booklist.csv, which sits next
# to this script. Each row is read as: book URL, book title, author.
base_url = 'https://so.gushiwen.cn'
contributors = %w[古诗文网 陈刑]
booklist = File.expand_path('booklist.csv', __dir__)

# The whole list is parsed before the first book is scraped.
CSV.read(booklist).each do |book_url, book_title, author|
  scrape(book_url, base_url, book_title, author, contributors)
end