-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathoracle-base.py
More file actions
123 lines (108 loc) · 2.91 KB
/
oracle-base.py
File metadata and controls
123 lines (108 loc) · 2.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import requests
from lxml.html import fromstring
from lxml import cssselect
from lxml import html
from Queue import Queue
from threading import Thread
import os
# NOTE: the original used bare triple-quoted strings as comments; those are
# real statements (evaluated and discarded at import time), so they are
# replaced here with ordinary '#' comments. All constant values are unchanged.

# Number of fetcher threads in the worker pool.
num_fetch_threads = 30
# Shared work queue; each item is a (category, parsed_tree, index) tuple.
queue_ = Queue()
# Download URL prefix: files live at https://oracle-base.com/dba/{category}/{filename}.{extension}
download_url = 'https://oracle-base.com/dba/'
# Index page that lists every script, grouped by category.
page_url = 'https://oracle-base.com/dba/scripts'
# Script categories, in the order their sections appear on the index page.
catg_array = ['monitoring', '10g', '11g', '12c', 'constraints', 'miscellaneous', 'rac', 'resource_manager', 'script_creation', 'security', 'weblogic']
# Absolute directory this script lives in.
script_dir = os.path.dirname(os.path.abspath(__file__))
# Name of the folder the downloads are written into.
folder_name = "oracle-base"
# Destination directory: <script_dir>/oracle-base, one sub-folder per category.
dest_dir = os.path.join(script_dir, folder_name)
def fetchData(url):
    '''
    Fetch data from the given URL and return the raw response.
    A desktop-browser User-agent is sent so the site serves the normal page.
    @param url given input url
    '''
    browser_headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36',
    }
    response = requests.get(url, headers=browser_headers)
    return response
def fileWrite(filename, content, category):
    '''
    Write the text of a fetched response to <dest_dir>/<category>/<filename>.
    @param filename target file name
    @param content response object whose .text attribute holds the file body
    @param category category sub-folder under the destination directory
    '''
    path = os.path.join(dest_dir, category, filename)
    # 'with' guarantees the handle is closed even if the write raises --
    # the original explicit open()/close() pair leaked the handle on error.
    with open(path, 'w') as file_handler:
        file_handler.write(content.text)
'''
Main Thread function which handle most of the things
'''
def MainThread(i, q):
"""This is the worker thread function.
It processes items in the queue one after
another. These daemon threads go into an
infinite loop, and only exit when
the main thread ends.
"""
while True:
queue_ = q.get()
#loop through till 60
for i in range(1, 60):
xpath_ = ''
#get the index
index = queue_[2]
for j in range(1, 3):
xpath_ = '//*[@id="content"]/div['+str(index)+']/div['+str(j)+']/ul/li['+str(i)+']/p/a/text()'
data = queue_[1].xpath(xpath_)
if data:
print "Downloading "+data[0]+" file of category "+queue_[0]
url = download_url+queue_[0]+'/'+data[0]
con = fetchData(url)
fileWrite(data[0], con, queue_[0])
q.task_done()
# Spin up the daemon worker pool; each worker pulls category jobs from the
# shared queue and exits automatically when the main thread ends.
for worker_id in range(num_fetch_threads):
    fetcher = Thread(target=MainThread, args=(worker_id, queue_,))
    fetcher.daemon = True
    fetcher.start()
def main():
    '''
    Main Function: fetch the scripts index page once, create one folder
    per category, queue a (category, tree, index) job for each category,
    then block until the worker threads have drained the queue.
    '''
    # extract the main html content; every worker shares this parsed tree
    page_content = fetchData(page_url)
    tree = html.fromstring(page_content.content)
    # category divs start at div[2] on the page, hence counting from 2
    # (replaces the original manual index counter and its stray semicolon)
    for index, catg in enumerate(catg_array, 2):
        path_ = os.path.join(dest_dir, catg)
        # the original os.makedirs() crashed on re-runs when the folder
        # already existed; only create it when missing
        if not os.path.isdir(path_):
            os.makedirs(path_)
        queue_.put((catg, tree, index))
    queue_.join()
if __name__ == '__main__':
    main()