Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ gem 'watir', '~> 6.19', '>= 6.19.1'
gem 'webdrivers', '~> 4.6'
gem 'nokogiri', '~> 1.11', '>= 1.11.7'
gem 'geckodriver-helper', '~> 0.0.3'
gem 'sqlite3'
6 changes: 6 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ GEM
geckodriver-helper (0.0.5)
archive-zip (~> 0.7)
io-like (0.3.1)
mini_portile2 (2.6.1)
nokogiri (1.12.5)
mini_portile2 (~> 2.6.1)
racc (~> 1.4)
nokogiri (1.12.5-x86_64-linux)
racc (~> 1.4)
racc (1.6.0)
Expand All @@ -17,6 +21,7 @@ GEM
childprocess (>= 0.5, < 5.0)
rexml (~> 3.2, >= 3.2.5)
rubyzip (>= 1.2.2)
sqlite3 (1.4.2)
watir (6.19.1)
regexp_parser (>= 1.2, < 3)
selenium-webdriver (>= 3.142.7)
Expand All @@ -32,6 +37,7 @@ PLATFORMS
DEPENDENCIES
geckodriver-helper (~> 0.0.3)
nokogiri (~> 1.11, >= 1.11.7)
sqlite3
watir (~> 6.19, >= 6.19.1)
webdrivers (~> 4.6)

Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# rubysite scraping process

1. Run `bundle install`
2. Run `ruby migration.rb`
3. Run `ruby tour_site_scraper.rb`

Binary file added db_tour_scraper.db
Binary file not shown.
48 changes: 48 additions & 0 deletions migration.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
require 'sqlite3'

# Creates the SQLite tables that tour_site_scraper.rb writes to.
# Every statement uses IF NOT EXISTS, so the script can be re-run safely.
db = SQLite3::Database.open "db_tour_scraper.db"

puts "Migration started ...."

begin
  # Search-level totals; tour_site_scraper.rb inserts one row per saved search.
  db.execute <<~SQL
    CREATE TABLE IF NOT EXISTS tickets_summary (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      departure_date Date,
      return_date Date,
      time_from_out VARCHAR(20),
      time_to_out VARCHAR(20),
      search_time DateTime,
      total_tickets_out INTEGER,
      total_tickets_in INTEGER
    );
  SQL
  puts "Table tickets_summary created"

  # One row per airline found in a search; ticket_type is 'out' or 'in'.
  db.execute <<~SQL
    CREATE TABLE IF NOT EXISTS tickets_airline_companies (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      ticket_summary_id INTEGER,
      airline_company_name VARCHAR(100),
      ticket_lowest_price FLOAT,
      total_flights_available INTEGER,
      ticket_type VARCHAR(10),
      FOREIGN KEY(ticket_summary_id) REFERENCES tickets_summary(id)
    );
  SQL
  puts "Table tickets_airline_companies created"

  # One row per flight offered by an airline.
  # The foreign key must name the real parent table (tickets_airline_companies):
  # SQLite accepts a reference to a missing table at CREATE time, but inserts
  # fail as soon as a connection enables `PRAGMA foreign_keys = ON`.
  db.execute <<~SQL
    CREATE TABLE IF NOT EXISTS airline_flights (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      ticket_airline_id INTEGER,
      flight_code VARCHAR(50),
      flight_price INTEGER,
      flight_changeable_status VARCHAR(50),
      flight_ticket_type VARCHAR(100),
      FOREIGN KEY(ticket_airline_id) REFERENCES tickets_airline_companies(id)
    );
  SQL

  puts "Table airline_flights created"

  puts "Migration ended."
ensure
  # Release the database handle even when a statement fails.
  db.close
end
211 changes: 154 additions & 57 deletions tour_site_scraper.rb
Original file line number Diff line number Diff line change
@@ -1,92 +1,189 @@
require 'rubygems'
require 'selenium-webdriver'
require 'pry'
require 'sqlite3'

require 'date' # Date is needed for the search range below

# SQLite database file; its tables are created by migration.rb.
DB = SQLite3::Database.new("db_tour_scraper.db")

MAX_RETRY = 40 # Maximum retries (about one per second) while waiting for the search page to load
MAX_CALL = 3 # Maximum times a search is re-run if an ajax error or busy page is shown

# Put Ticket Search input dates here
TICKET_SEARCH_FROM_DATE = Date.new(2021, 12, 31)
TICKET_SEARCH_TO_DATE = Date.new(2022, 1, 31)
TIME_FROM_OUT = '0600'
TIME_TO_OUT = '0700'

WAIT = Selenium::WebDriver::Wait.new(timeout: 20) # Maximum wait (seconds) for the search results html
# Single shared browser session used by every scraping method.
WEB_DRIVER = Selenium::WebDriver.for :firefox

# options = Selenium::WebDriver::Firefox::Options.new(args: ['-headless'])
# driver = Selenium::WebDriver.for(:firefox, options: options)

puts 'Trying to fetch data from site.....'
puts '--------------------------------------------------------'

# Best-effort wait for the return ("in") ticket list to be rendered.
#
# Waits, using the shared WAIT object, until both the
# `#Act_response_in .toggle-btn-company` and `#Act_response_in .airline-name`
# elements report that they are displayed.
#
# @param driver [#find_element] browser session to query
# @return [Object, nil] the result of the last wait, or nil when a wait failed
def check_return_tickets_visibility(driver)
  # Wait for few seconds until able to find return tickets list
  WAIT.until { driver.find_element(css: "#Act_response_in .toggle-btn-company").displayed? }
  WAIT.until { driver.find_element(css: "#Act_response_in .airline-name").displayed? }
rescue StandardError
  # Deliberately ignored so the caller can carry on without return tickets.
  # StandardError (not Exception) keeps Ctrl-C and `exit` working.
  nil
end

ticket_search_date_from = Date.new(2021, 12, 31)
ticket_search_date_to = Date.new(2022, 01, 31)
ticket_search_date_from.upto(ticket_search_date_to) do |dt|
departure_date_in = dt.to_s.delete("-")
departure_date_out = dt.to_s.delete("-")

puts "\n\nTickets for this date " + dt.to_s

# Opens the tour.ne.jp search results page for the given dates and clicks the
# `#Act_Airline_Out` and `#Act_Airline_In` elements.
#
# The route is fixed in the URL (dpt_out=TYO, arr_out=CTS, dpt_in=CTS,
# arr_in=TYO); the time window comes from TIME_FROM_OUT / TIME_TO_OUT.
#
# @param departure_date_in [String] value for the `date_in` query parameter (digits only, e.g. "20211231")
# @param departure_date_out [String] value for the `date_out` query parameter
# @raise [RuntimeError] when the two elements still cannot be found and
#   clicked after MAX_RETRY retries
def start_scraping(departure_date_in, departure_date_out)
  # Generate the search url physically using any date, time and put here, we will make it dynamic later based on requirement
  search_url = 'https://www.tour.ne.jp/j_air/list/?adult=1&arr_in=TYO&arr_out=CTS&change_date_in=0&change_date_out=0&' \
               "date_in=#{departure_date_in}&date_out=#{departure_date_out}&dpt_in=" \
               "CTS&dpt_out=TYO&time_from_out=#{TIME_FROM_OUT}&time_to_out=#{TIME_TO_OUT}&time_type_out=0"
  WEB_DRIVER.navigate.to search_url
  sleep(1) # Wait 1s to load the page properly

  retries = 0
  begin
    # find_element raises while an element is missing, which lands in the
    # rescue below; both are located before either is clicked.
    ticket_summary_button_out = WEB_DRIVER.find_element(:css, '#Act_Airline_Out')
    ticket_summary_button_in = WEB_DRIVER.find_element(:css, '#Act_Airline_In')
    ticket_summary_button_out.click
    ticket_summary_button_in.click
  rescue StandardError
    puts "Trying to fetch data.. #{retries}"
    retries += 1
    sleep(1) # Wait 1s to load the page properly
    retry if retries <= MAX_RETRY
    raise "Could not get ticket website information: Please give necessary information to search"
  end
end

check_return_tickets_visibility(driver)

# Scrap Available Tickets Elements
ticket_summary = driver.find_elements(:css, '#Act_response_out .airline-name')
ticket_available_lists = driver.find_elements(:css, '#Act_response_out .toggle-btn-company')
# Scrapes the airline boxes of one direction from the currently loaded results page.
#
# @param ticket_details_type [String] 'in' reads `#Act_response_in`; any other
#   value reads `#Act_response_out`
# @return [Array] two elements:
#   1. an Array of Hashes, one per airline box, with the symbol keys
#      :ticket_company_name, :ticket_minimum_price, :number_of_ticket_found and
#      :ticket_flight_lists (an Array of Hashes with the string keys
#      'flight_code', 'flight_price', 'flight_seat',
#      'flight_changeable_status' and 'flight_type')
#   2. the sum of :number_of_ticket_found over all airlines
def searching_ticket_type(ticket_details_type)
  response_root = ticket_details_type == 'in' ? '#Act_response_in' : '#Act_response_out'
  ticket_airlines = WEB_DRIVER.find_elements(:css, "#{response_root} .company-list .company-box")

  detail_item = '.ticket-detail-item .ticket-detail-item-inner'
  detail_type = "#{detail_item} .ticket-detail-type"

  all_tickets_details_lists = ticket_airlines.map do |ticket_airline|
    ticket_company_name = ticket_airline.find_element(:css, '.airline-name').text
    # The button text mixes words and a count; keep only the digits.
    number_of_ticket_found = ticket_airline.find_element(:css, '.toggle-btn-company').text.delete('^0-9').to_i
    ticket_minimum_price = ticket_airline.find_element(:css, '.hdg-sup-price > b').text

    ticket_flight_lists = ticket_airline.find_elements(:css, '.Act_flight_list').map do |ticket_flight|
      # innerHTML is read instead of #text; presumably because collapsed
      # flight rows are not visible — confirm against the live page.
      inner_html = lambda do |selector, index|
        ticket_flight.find_elements(:css, selector)[index].attribute("innerHTML")
      end

      {
        'flight_code' => inner_html.call('.ticket-summary-row > span', 1),
        'flight_price' => inner_html.call("#{detail_item} .ticket-price > label > b", 0),
        'flight_seat' => inner_html.call("#{detail_type} .ticket-detail-icon .icon-seat", 0),
        'flight_changeable_status' => inner_html.call("#{detail_type} .ticket-detail-icon .icon-date", 0),
        'flight_type' => inner_html.call("#{detail_type} .ticket-detail-type-text .ticket-detail-type-text-ellipsis", 0)
      }
    end

    {
      ticket_company_name: ticket_company_name,
      ticket_minimum_price: ticket_minimum_price,
      number_of_ticket_found: number_of_ticket_found,
      ticket_flight_lists: ticket_flight_lists
    }
  end

  total_ticket_found = all_tickets_details_lists.sum { |airline| airline[:number_of_ticket_found] }
  [all_tickets_details_lists, total_ticket_found]
end

# Scrap Returning Tickets Elements
ticket_summary_in = driver.find_elements(:css, '#Act_response_in .airline-name')
ticket_available_lists_in = driver.find_elements(:css, '#Act_response_in .toggle-btn-company')
# Saves one search's scraped tickets into the SQLite tables tickets_summary,
# tickets_airline_companies and airline_flights.
#
# All rows of one search are written inside a single transaction, so a failure
# part-way through does not leave a summary row without its airlines/flights.
#
# @param tickets_out_lists [Array] `[airlines, total]` for outbound tickets, as
#   returned by searching_ticket_type('out')
# @param tickets_in_lists [Array] `[airlines, total]` for return tickets, as
#   returned by searching_ticket_type('in')
# @param departure_date [#to_s] stored in tickets_summary.departure_date
# @param return_date [#to_s] stored in tickets_summary.return_date
# @return [Integer] id of the tickets_summary row that was inserted
def save_scrap_data(tickets_out_lists, tickets_in_lists, departure_date, return_date)
  all_ticket_out_lists, total_ticket_out_found = tickets_out_lists
  all_ticket_in_lists, total_ticket_in_found = tickets_in_lists
  puts "Total tickets found for out is = " + total_ticket_out_found.to_s
  puts "Total tickets found for in is = " + total_ticket_in_found.to_s

  ticket_summary_id = nil
  DB.transaction do
    # Save ticket summary (leading nil lets SQLite assign the autoincrement id)
    ticket_summary_data = [
      nil,
      departure_date.to_s,
      return_date.to_s,
      TIME_FROM_OUT,
      TIME_TO_OUT,
      Time.now.strftime("%Y-%m-%d %H:%M:%S"),
      total_ticket_out_found,
      total_ticket_in_found
    ]
    DB.execute("INSERT INTO tickets_summary values(?, ?, ?, ?, ?, ?, ?, ? )", ticket_summary_data)
    ticket_summary_id = DB.last_insert_row_id

    # Save every airline and its flights: outbound ('out') first, then return ('in')
    { 'out' => all_ticket_out_lists, 'in' => all_ticket_in_lists }.each do |ticket_type, airlines|
      airlines.each do |airline|
        # Save company tickets information
        company_data = [
          nil,
          ticket_summary_id,
          airline[:ticket_company_name],
          airline[:ticket_minimum_price],
          airline[:number_of_ticket_found],
          ticket_type
        ]
        DB.execute("INSERT INTO tickets_airline_companies values(?, ?, ?, ?, ?, ?)", company_data)
        ticket_company_id = DB.last_insert_row_id

        # Save ticket flights information.
        # NOTE(review): 'flight_seat' is scraped but not stored; airline_flights has no column for it.
        airline[:ticket_flight_lists].each do |flight|
          flight_data = [
            nil,
            ticket_company_id,
            flight['flight_code'],
            flight['flight_price'],
            flight['flight_changeable_status'],
            flight['flight_type']
          ]
          DB.execute("INSERT INTO airline_flights values(?, ?, ?, ?, ?, ?)", flight_data)
        end
      end
    end
  end

  ticket_summary_id
end

# Write all tickets search results
puts 'Total available ticket OUT found is = ' + total_available_ticket.to_s
puts 'Total available ticket IN found is = ' + total_available_ticket_in.to_s
# Scrape and store tickets for every date in the configured range.
begin
  TICKET_SEARCH_FROM_DATE.upto(TICKET_SEARCH_TO_DATE) do |dt|
    # Date#to_s gives "YYYY-MM-DD"; the search URL takes the digits only.
    departure_date_in = dt.to_s.delete("-")
    departure_date_out = dt.to_s.delete("-")

    puts "\n\nTickets for this date " + dt.to_s

    retries = 0
    begin
      start_scraping(departure_date_in, departure_date_out)
      # Wait for few seconds until able to find both tickets lists
      WAIT.until { WEB_DRIVER.find_element(css: "#Act_response_out .company-list").displayed? }
      WAIT.until { WEB_DRIVER.find_element(css: "#Act_response_in .company-list").displayed? }
      tickets_out_lists = searching_ticket_type('out')
      tickets_in_lists = searching_ticket_type('in')
    rescue StandardError
      # StandardError (not Exception) so Ctrl-C stops the run instead of
      # triggering another attempt.
      retries += 1
      retry if retries <= MAX_CALL
      raise "Could not get ticket website information: Please give necessary information to search"
    end

    # Save scraped ticket details, initially departure date and return date is same
    save_scrap_data(tickets_out_lists, tickets_in_lists, dt, dt)
  end
ensure
  # Close the browser even when a date fails, so no Firefox process is left behind.
  WEB_DRIVER.quit
end