From 6b9aa913455433a78c1910e2800fe56d6ab97f2d Mon Sep 17 00:00:00 2001 From: Jinho Hyeon Date: Thu, 9 Feb 2017 21:32:20 +0900 Subject: [PATCH 01/23] =?UTF-8?q?2=20=EC=9B=B9=20=EB=8D=B0=EC=9D=B4?= =?UTF-8?q?=ED=84=B0=20=EC=88=98=EC=A7=91=20-=201=20=EC=9B=B9=20=ED=8E=98?= =?UTF-8?q?=EC=9D=B4=EC=A7=80=20=EB=8B=A4=EC=9A=B4=EB=A1=9C=EB=93=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../jpub-web-crawling-technique/02/.gitignore | 3 +++ .../02/01-download/01-download-node.js | 22 +++++++++++++++ .../02/01-download/02-download-node-func.js | 27 +++++++++++++++++++ .../02/package.json | 23 ++++++++++++++++ 4 files changed, 75 insertions(+) create mode 100644 node/jpub-web-crawling-technique/02/.gitignore create mode 100644 node/jpub-web-crawling-technique/02/01-download/01-download-node.js create mode 100644 node/jpub-web-crawling-technique/02/01-download/02-download-node-func.js create mode 100644 node/jpub-web-crawling-technique/02/package.json diff --git a/node/jpub-web-crawling-technique/02/.gitignore b/node/jpub-web-crawling-technique/02/.gitignore new file mode 100644 index 0000000000..7e7fb67ac7 --- /dev/null +++ b/node/jpub-web-crawling-technique/02/.gitignore @@ -0,0 +1,3 @@ +node_modules/ +*/*.html +*/img/ diff --git a/node/jpub-web-crawling-technique/02/01-download/01-download-node.js b/node/jpub-web-crawling-technique/02/01-download/01-download-node.js new file mode 100644 index 0000000000..39b6b4e216 --- /dev/null +++ b/node/jpub-web-crawling-technique/02/01-download/01-download-node.js @@ -0,0 +1,22 @@ +// url에 있는 파일을 savepath에 다운로드한다 + +// 다운로드할 url을 지정 +var url = 'http://jpub.tistory.com/'; +// 저장할 위치를 지정 +var savepath = 'test.html'; + +// 사용 모듈 정의 +var http = require('http'); // HTTP 모듈 +var fs = require('fs'); // 파일 처리 관련 모듈 + +// 출력 지정 +var outfile = fs.createWriteStream(savepath); + +// 비동기로 url의 파일 다운로드 +http.get(url, function(res) { + res.pipe(outfile); + res.on('end', function() { + 
outfile.close(); + console.log('ok'); + }) +}) diff --git a/node/jpub-web-crawling-technique/02/01-download/02-download-node-func.js b/node/jpub-web-crawling-technique/02/01-download/02-download-node-func.js new file mode 100644 index 0000000000..6685ada181 --- /dev/null +++ b/node/jpub-web-crawling-technique/02/01-download/02-download-node-func.js @@ -0,0 +1,27 @@ +// 다운로드 +download ( + 'http://jpub.tistory.com/539', + 'spring.html', + function() { console.log('ok, spring') } +) + +download ( + 'http://jpub.tistory.com/537', + 'angular.html', + function() { console.log('ok, angular') } +) + +// url의 파일을 savepath에 다운로드하는 함수 +function download(url, savepath, callback) { + var http = require('http'); + var fs = require('fs'); + var outfile = fs.createWriteStream(savepath); + + var req = http.get(url, function(res) { + res.pipe(outfile); + res.on('end', function() { + outfile.close(); + callback(); + }) + }) +} diff --git a/node/jpub-web-crawling-technique/02/package.json b/node/jpub-web-crawling-technique/02/package.json new file mode 100644 index 0000000000..4a6395cf00 --- /dev/null +++ b/node/jpub-web-crawling-technique/02/package.json @@ -0,0 +1,23 @@ +{ + "name": "jpub-web-crawling-technique-02", + "version": "1.0.0", + "description": "", + "main": "index.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "repository": { + "type": "git", + "url": "git+ssh://git@github.com/neverlish/neverlish.git" + }, + "author": "neverlish", + "license": "ISC", + "bugs": { + "url": "https://github.com/neverlish/neverlish/issues" + }, + "homepage": "https://github.com/neverlish/neverlish#readme", + "dependencies": { + "fs": "0.0.1-security", + "http": "0.0.0" + } +} From 806c553be804c90937cef4c7061d0e891399238a Mon Sep 17 00:00:00 2001 From: Jinho Hyeon Date: Thu, 9 Feb 2017 21:35:30 +0900 Subject: [PATCH 02/23] =?UTF-8?q?2=20-=202=20HTML=20=ED=95=B4=EC=84=9D(?= =?UTF-8?q?=EB=A7=81=ED=81=AC=EC=99=80=20=EC=9D=B4=EB=AF=B8=EC=A7=80=20?= 
=?UTF-8?q?=EC=B6=94=EC=B6=9C)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../02/02-analyze/01-getPage.js | 15 ++++++++ .../02/02-analyze/02-showlink.js | 16 +++++++++ .../02/02-analyze/03-url-test.js | 14 ++++++++ .../02/02-analyze/04-showlink-path.js | 27 ++++++++++++++ .../02/02-analyze/05-showimage.js | 18 ++++++++++ .../02/02-analyze/06-download-node-request.js | 10 ++++++ .../02/02-analyze/07-dl-image.js | 35 +++++++++++++++++++ .../02/package.json | 1 + 8 files changed, 136 insertions(+) create mode 100644 node/jpub-web-crawling-technique/02/02-analyze/01-getPage.js create mode 100644 node/jpub-web-crawling-technique/02/02-analyze/02-showlink.js create mode 100644 node/jpub-web-crawling-technique/02/02-analyze/03-url-test.js create mode 100644 node/jpub-web-crawling-technique/02/02-analyze/04-showlink-path.js create mode 100644 node/jpub-web-crawling-technique/02/02-analyze/05-showimage.js create mode 100644 node/jpub-web-crawling-technique/02/02-analyze/06-download-node-request.js create mode 100644 node/jpub-web-crawling-technique/02/02-analyze/07-dl-image.js diff --git a/node/jpub-web-crawling-technique/02/02-analyze/01-getPage.js b/node/jpub-web-crawling-technique/02/02-analyze/01-getPage.js new file mode 100644 index 0000000000..6e1ca40fa3 --- /dev/null +++ b/node/jpub-web-crawling-technique/02/02-analyze/01-getPage.js @@ -0,0 +1,15 @@ +// 모듈 로드 +var client = require('cheerio-httpcli'); + +// 다운로드 +var url = 'http://jpub.tistory.com'; +var param = {}; + +client.fetch(url, param, function(err, $, res) { + // 에러 체크 + if (err) { console.log('Error:', err); return; } + + // 다운로드한 결과를 화면에 출력 + var body = $.html(); + console.log(body); +}) diff --git a/node/jpub-web-crawling-technique/02/02-analyze/02-showlink.js b/node/jpub-web-crawling-technique/02/02-analyze/02-showlink.js new file mode 100644 index 0000000000..398c9a1342 --- /dev/null +++ b/node/jpub-web-crawling-technique/02/02-analyze/02-showlink.js @@ 
-0,0 +1,16 @@ +// 모듈 로드 +var client = require('cheerio-httpcli'); + +// 다운로드 +var url = 'http://jpub.tistory.com'; +var param = {}; +client.fetch(url, param, function(err, $, res) { + if (err) { console.log('error'); return; } + + // 링크를 추출하여 표시 + $('a').each(function(idx) { + var text = $(this).text(); + var href = $(this).attr('href'); + console.log(text + ' : ' + href); + }) +}) diff --git a/node/jpub-web-crawling-technique/02/02-analyze/03-url-test.js b/node/jpub-web-crawling-technique/02/02-analyze/03-url-test.js new file mode 100644 index 0000000000..7a50975a26 --- /dev/null +++ b/node/jpub-web-crawling-technique/02/02-analyze/03-url-test.js @@ -0,0 +1,14 @@ +// url 모듈 로드 +var urlType = require('url'); + +// 상대 경로를 절대 경로로 변환 +var base = 'http://kujirahand.com/url/test/index.html'; + +var u1 = urlType.resolve(base, 'a.html'); +console.log('u1 = ' + u1); + +var u2 = urlType.resolve(base, '../b.html'); +console.log('u2 = ' + u2); + +var u3 = urlType.resolve(base, '/c.html'); +console.log('u3 = ' + u3); diff --git a/node/jpub-web-crawling-technique/02/02-analyze/04-showlink-path.js b/node/jpub-web-crawling-technique/02/02-analyze/04-showlink-path.js new file mode 100644 index 0000000000..d017965a2c --- /dev/null +++ b/node/jpub-web-crawling-technique/02/02-analyze/04-showlink-path.js @@ -0,0 +1,27 @@ +// 모듈 로드 +var client = require('cheerio-httpcli'); +var urlType = require('url'); + +// URL과 파라미터 +var url = 'http://jpub.tistory.com'; +var param = {}; + +// 다운로드 +client.fetch(url, param, function(err, $, res) { + if (err) { console.log('error'); return; } + + // 링크 추출하여 출력 + $('a').each(function(idx) { + var text = $(this).text(); + var href = $(this).attr('href'); + + if (!href) return; + + // 상대 경로를 절대 경로로 변환 + var href2 = urlType.resolve(url, href); + + // 결과를 표시 + console.log(text + ' : ' + href); + console.log(' => ' + href2 + '\n'); + }) +}) diff --git a/node/jpub-web-crawling-technique/02/02-analyze/05-showimage.js 
b/node/jpub-web-crawling-technique/02/02-analyze/05-showimage.js new file mode 100644 index 0000000000..33f35ada74 --- /dev/null +++ b/node/jpub-web-crawling-technique/02/02-analyze/05-showimage.js @@ -0,0 +1,18 @@ +// Load modules +var client = require('cheerio-httpcli'); +var urlType = require('url'); + +// Page to download +var url = 'https://ko.wikipedia.org/wiki/' + encodeURIComponent('강아지'); +var param = {}; + +client.fetch(url, param, function(err, $, res) { + if (err) { console.log('error'); return; } + + // Extract the image links and print them + $('img').each(function(idx) { + var src = $(this).attr('src'); + src = urlType.resolve(url, src); + console.log(src); + }) +}) diff --git a/node/jpub-web-crawling-technique/02/02-analyze/06-download-node-request.js b/node/jpub-web-crawling-technique/02/02-analyze/06-download-node-request.js new file mode 100644 index 0000000000..c82c387eb7 --- /dev/null +++ b/node/jpub-web-crawling-technique/02/02-analyze/06-download-node-request.js @@ -0,0 +1,10 @@ +// Load modules +var request = require('request'); +var fs = require('fs'); + +// URL and save path +var url = 'http://jpub.tistory.com/'; +var savepath = 'test.html'; + +// Download +request(url).pipe(fs.createWriteStream(savepath)); diff --git a/node/jpub-web-crawling-technique/02/02-analyze/07-dl-image.js b/node/jpub-web-crawling-technique/02/02-analyze/07-dl-image.js new file mode 100644 index 0000000000..96e9896ac0 --- /dev/null +++ b/node/jpub-web-crawling-technique/02/02-analyze/07-dl-image.js @@ -0,0 +1,35 @@ +// Load modules +var client = require('cheerio-httpcli'); +var request = require('request'); +var fs = require('fs'); +var urlType = require('url'); + +// Create the save directory if it does not exist +var savedir = __dirname + '/img'; +if (!fs.existsSync(savedir)) { + fs.mkdirSync(savedir); +} + +// Target URL +var url = 'https://ko.wikipedia.org/wiki/' + encodeURIComponent('강아지'); +var param = {}; + +// Fetch the HTML +client.fetch(url, param, function(err, $, res) { + if (err) { console.log('error'); return; } + + // Extract the img links and run the function below on each one +
$('img').each(function(idx) { + var src = $(this).attr('src'); + + // 상대 경로를 절대 경로로 변환 + src = urlType.resolve(url, src); + + // 저장 파일 이름 결정 + var fname = urlType.parse(src).pathname; + fname = savedir + '/' + fname.replace(/[^a-zA-Z0-9\.]+/g, '_'); + + // 다운로드 + request(src).pipe(fs.createWriteStream(fname)); + }) +}) diff --git a/node/jpub-web-crawling-technique/02/package.json b/node/jpub-web-crawling-technique/02/package.json index 4a6395cf00..97c5e29027 100644 --- a/node/jpub-web-crawling-technique/02/package.json +++ b/node/jpub-web-crawling-technique/02/package.json @@ -17,6 +17,7 @@ }, "homepage": "https://github.com/neverlish/neverlish#readme", "dependencies": { + "cheerio-httpcli": "^0.6.11", "fs": "0.0.1-security", "http": "0.0.0" } From bfccd83943ab72110a07fa6353e70d40e3cc9e51 Mon Sep 17 00:00:00 2001 From: Jinho Hyeon Date: Thu, 9 Feb 2017 21:54:34 +0900 Subject: [PATCH 03/23] =?UTF-8?q?2=20-=203=20=EC=82=AC=EC=9D=B4=ED=8A=B8?= =?UTF-8?q?=EB=A5=BC=20=ED=86=B5=EC=A7=B8=EB=A1=9C=20=EB=8B=A4=EC=9A=B4?= =?UTF-8?q?=EB=A1=9C=EB=93=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../jpub-web-crawling-technique/02/.gitignore | 1 + .../02/03-getall/01-getall.js | 76 +++++++++++++++++++ .../02/03-getall/02-mkdir.js | 9 +++ .../02/03-getall/03-mkdirSync.js | 7 ++ .../02/03-getall/04-mkdir.js | 10 +++ 5 files changed, 103 insertions(+) create mode 100644 node/jpub-web-crawling-technique/02/03-getall/01-getall.js create mode 100644 node/jpub-web-crawling-technique/02/03-getall/02-mkdir.js create mode 100644 node/jpub-web-crawling-technique/02/03-getall/03-mkdirSync.js create mode 100644 node/jpub-web-crawling-technique/02/03-getall/04-mkdir.js diff --git a/node/jpub-web-crawling-technique/02/.gitignore b/node/jpub-web-crawling-technique/02/.gitignore index 7e7fb67ac7..4b61772ad2 100644 --- a/node/jpub-web-crawling-technique/02/.gitignore +++ b/node/jpub-web-crawling-technique/02/.gitignore @@ -1,3 +1,4 @@ 
node_modules/ */*.html */img/ +*/nodejs.org/ diff --git a/node/jpub-web-crawling-technique/02/03-getall/01-getall.js b/node/jpub-web-crawling-technique/02/03-getall/01-getall.js new file mode 100644 index 0000000000..d17ce44822 --- /dev/null +++ b/node/jpub-web-crawling-technique/02/03-getall/01-getall.js @@ -0,0 +1,77 @@ +// Analyze links and download them (Node.js) +// --- Load modules --- +var client = require('cheerio-httpcli'); +var request = require('request'); +var urlType = require('url'); +var fs = require('fs'); +var path = require('path'); + +// --- Common settings --- +// Maximum link depth to follow +var LINK_LEVEL = 3; +// Base URL to start from +var TARGET_URL = 'https://nodejs.org/dist/latest-v6.x/docs/api/'; +var list = {}; + +// Main entry point +downloadRec(TARGET_URL, 0); + +// Download the given URL, following links up to LINK_LEVEL levels deep +function downloadRec(url, level) { + // Stop at the maximum depth + if (level >= LINK_LEVEL) return; + + // Skip pages that were already downloaded + if (list[url]) return; + list[url] = true; + + // Skip pages outside the target site + var us = TARGET_URL.split('/'); + us.pop(); + var base = us.join('/'); + if (url.indexOf(base) < 0) return; + + // Fetch the HTML + client.fetch(url, {}, function(err, $, res) { + if (err) { console.log('Error:', err); return; } + // Follow the linked pages + $('a').each(function(idx) { + // Get the anchor's link target + var href = $(this).attr('href'); + if (!href) return; + + // Convert the relative path to an absolute URL + href = urlType.resolve(url, href); + + // Ignore everything from '#' on (a.html#aa and a.html#bb are the same page) + href = href.replace(/#.*$/, ''); // strip the fragment, including a bare trailing '#' + downloadRec(href, level + 1); + }); + + // Save the page + if (url.substr(url.length-1, 1) == '/') { + url += 'index.html'; // append the default index file name + } + + var savepath = url.split('/').slice(2).join('/'); + checkSaveDir(savepath); + console.log(savepath); + fs.writeFileSync(savepath, $.html()); + }); +} + +// Make sure the directory for the saved file exists +function checkSaveDir(fname) { + // Take only the directory part + var dir = path.dirname(fname); + + // Create the directories one level at a time + var dirlist = dir.split('/'); + var p = ''; + for (var i in dirlist) { + p += dirlist[i] + '/'; + if (!fs.existsSync(p)) { + fs.mkdirSync(p); + } + } +} diff --git
a/node/jpub-web-crawling-technique/02/03-getall/02-mkdir.js b/node/jpub-web-crawling-technique/02/03-getall/02-mkdir.js new file mode 100644 index 0000000000..a3fa89924a --- /dev/null +++ b/node/jpub-web-crawling-technique/02/03-getall/02-mkdir.js @@ -0,0 +1,9 @@ +// 모듈 로드 +var fs = require('fs'); + +// 폴더 생성 +console.log('mkdir 실행'); +fs.mkdir('test', function() { + console.log('폴더 생성 완료'); +}); +console.log('mkdir 실횅 완료. 결과 대기'); diff --git a/node/jpub-web-crawling-technique/02/03-getall/03-mkdirSync.js b/node/jpub-web-crawling-technique/02/03-getall/03-mkdirSync.js new file mode 100644 index 0000000000..72f6323804 --- /dev/null +++ b/node/jpub-web-crawling-technique/02/03-getall/03-mkdirSync.js @@ -0,0 +1,7 @@ +// 모듈 로드 +var fs = require('fs'); + +// 디렉터리를 동기적으로 생성 +console.log('mkdir 실행'); +fs.mkdirSync('test-sync'); +console.log('mkdir 완료'); diff --git a/node/jpub-web-crawling-technique/02/03-getall/04-mkdir.js b/node/jpub-web-crawling-technique/02/03-getall/04-mkdir.js new file mode 100644 index 0000000000..06bebfc900 --- /dev/null +++ b/node/jpub-web-crawling-technique/02/03-getall/04-mkdir.js @@ -0,0 +1,10 @@ +// 모듈 로드 +var fs = require('fs'); + +// 폴더를 동기적으로 생성 +if (!fs.existsSync('test3')) { + fs.mkdirSync('test3'); + console.log('test3 생성완료'); +} else { + console.log('test3이 이미 있으므로 생성 안함'); +} From 16b616eea94d7b77f7c7ee12e34657cc4228efad Mon Sep 17 00:00:00 2001 From: Jinho Hyeon Date: Fri, 10 Feb 2017 09:48:47 +0900 Subject: [PATCH 04/23] =?UTF-8?q?2=20-=204=20XML/RSS=20=ED=95=B4=EC=84=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../02/04-xmlrss/01-test-xml.js | 14 ++++++++ .../02/04-xmlrss/02-test-xml2.js | 25 ++++++++++++++ .../02/04-xmlrss/03-test-xml0.js | 10 ++++++ .../02/04-xmlrss/04-test-xml3.js | 20 +++++++++++ .../02/04-xmlrss/05-test-xml-builder.js | 12 +++++++ .../02/04-xmlrss/06-test-xml-builder2.js | 20 +++++++++++ .../02/04-xmlrss/07-weather.js | 33 +++++++++++++++++++ 
.../02/04-xmlrss/08-weather-cheerio.js | 23 +++++++++++++ .../02/package.json | 3 +- 9 files changed, 159 insertions(+), 1 deletion(-) create mode 100644 node/jpub-web-crawling-technique/02/04-xmlrss/01-test-xml.js create mode 100644 node/jpub-web-crawling-technique/02/04-xmlrss/02-test-xml2.js create mode 100644 node/jpub-web-crawling-technique/02/04-xmlrss/03-test-xml0.js create mode 100644 node/jpub-web-crawling-technique/02/04-xmlrss/04-test-xml3.js create mode 100644 node/jpub-web-crawling-technique/02/04-xmlrss/05-test-xml-builder.js create mode 100644 node/jpub-web-crawling-technique/02/04-xmlrss/06-test-xml-builder2.js create mode 100644 node/jpub-web-crawling-technique/02/04-xmlrss/07-weather.js create mode 100644 node/jpub-web-crawling-technique/02/04-xmlrss/08-weather-cheerio.js diff --git a/node/jpub-web-crawling-technique/02/04-xmlrss/01-test-xml.js b/node/jpub-web-crawling-technique/02/04-xmlrss/01-test-xml.js new file mode 100644 index 0000000000..84b92a43ee --- /dev/null +++ b/node/jpub-web-crawling-technique/02/04-xmlrss/01-test-xml.js @@ -0,0 +1,14 @@ +// 모듈 로드 +var parseString = require('xml2js').parseString; + +// 테스트용 XML 데이터 +var xml = '' + + 'Banana' + + 'Apple' + + ''; + +// XML 전달 +parseString(xml, function(err, result) { + // 파싱된 결과에 대한 처리를 여기에 작성 + console.log(JSON.stringify(result)); +}) diff --git a/node/jpub-web-crawling-technique/02/04-xmlrss/02-test-xml2.js b/node/jpub-web-crawling-technique/02/04-xmlrss/02-test-xml2.js new file mode 100644 index 0000000000..3f63527454 --- /dev/null +++ b/node/jpub-web-crawling-technique/02/04-xmlrss/02-test-xml2.js @@ -0,0 +1,25 @@ +// 모듈 로드 +var parseString = require('xml2js').parseString; + +// 테스트용 XML 데이터 +var xml = '' + + 'Banana' + + 'Apple' + + ''; + +// XML을 전달 +parseString(xml, function(err, result) { + // console.log(JSON.stringify(result)); + + // fruits을 제공하는 가게 이름 + var shop = result.fruits.$.shop; + console.log('shop = ' + shop); + + // fruits의 이름과 가격을 표시 + var items = 
result.fruits.item; + for (var i in items) { + var item = items[i]; + console.log('-- name = ' + item._); + console.log(' price = ' + item.$.price); + } +}); diff --git a/node/jpub-web-crawling-technique/02/04-xmlrss/03-test-xml0.js b/node/jpub-web-crawling-technique/02/04-xmlrss/03-test-xml0.js new file mode 100644 index 0000000000..68c24190da --- /dev/null +++ b/node/jpub-web-crawling-technique/02/04-xmlrss/03-test-xml0.js @@ -0,0 +1,10 @@ +// 모듈 로드 +var parseString = require('xml2js').parseString; + +// 테스트용 데이터 +var xml = 'Banana'; + +// XML을 전달 +parseString(xml, function(err, result) { + console.log(result.item); // 결과: Banana +}) diff --git a/node/jpub-web-crawling-technique/02/04-xmlrss/04-test-xml3.js b/node/jpub-web-crawling-technique/02/04-xmlrss/04-test-xml3.js new file mode 100644 index 0000000000..ef8be26048 --- /dev/null +++ b/node/jpub-web-crawling-technique/02/04-xmlrss/04-test-xml3.js @@ -0,0 +1,20 @@ +// 모듈 로드 +var parseString = require('xml2js').parseString; + +// 테스트용 XML 데이터 +var xml = + '' + + 'Banana130' + + 'Apple300' + + 'Pear250' + + ''; + +// XML 전달 +parseString(xml, function(err, result) { + console.log(JSON.stringify(result)); + + // 각 요소의 표시 + console.log('---'); + console.log(result.items.item[0].name[0]); + console.log(result.items.item[0].price[0]); +}) diff --git a/node/jpub-web-crawling-technique/02/04-xmlrss/05-test-xml-builder.js b/node/jpub-web-crawling-technique/02/04-xmlrss/05-test-xml-builder.js new file mode 100644 index 0000000000..1c2a68cfee --- /dev/null +++ b/node/jpub-web-crawling-technique/02/04-xmlrss/05-test-xml-builder.js @@ -0,0 +1,12 @@ +// 모듈 로드 +var xml2js = require('xml2js'); + +// 자바스크립트 객체 +var obj = { + item: {name: 'Banana', price: 150} +}; + +// XML로 변환 +var builder = new xml2js.Builder(); +var xml = builder.buildObject(obj); +console.log(xml); diff --git a/node/jpub-web-crawling-technique/02/04-xmlrss/06-test-xml-builder2.js b/node/jpub-web-crawling-technique/02/04-xmlrss/06-test-xml-builder2.js new file 
mode 100644 index 0000000000..a5c2bf3100 --- /dev/null +++ b/node/jpub-web-crawling-technique/02/04-xmlrss/06-test-xml-builder2.js @@ -0,0 +1,20 @@ +// Load modules +var xml2js = require('xml2js'); +var parseString = xml2js.parseString; +var Builder = xml2js.Builder; + +// XML test data +var xml = '' + + 'Banana' + + 'Apple' + + ''; + +// Convert the XML to a JavaScript object +parseString(xml, function(err, result) { + // Print the converted JavaScript object + console.log(JSON.stringify(result)); + + // Convert the JavaScript object back to XML + var xml = new Builder().buildObject(result); + console.log(xml); +}) diff --git a/node/jpub-web-crawling-technique/02/04-xmlrss/07-weather.js b/node/jpub-web-crawling-technique/02/04-xmlrss/07-weather.js new file mode 100644 index 0000000000..06ff8178d8 --- /dev/null +++ b/node/jpub-web-crawling-technique/02/04-xmlrss/07-weather.js @@ -0,0 +1,33 @@ +// Korea Meteorological Administration forecast RSS +var RSS = 'http://web.kma.go.kr/weather/forecast/mid-term-rss3.jsp?stnId=109'; + +// Load modules +var parseString = require('xml2js').parseString; +var request = require('request'); + +// Download the RSS +request(RSS, function(err, res, body) { + if (!err && res.statusCode == 200) { + analyzeRSS(body); + } +}); + +// Parse the RSS +function analyzeRSS(xml) { + // Convert the XML to a JS object + parseString(xml, function(err, obj) { + if (err) { console.log(err); return; } + + // Print the forecast information + console.log(JSON.stringify(obj)); + var datas = obj.rss.channel[0].item[0].description[0].body[0].location[0].data; + var city = obj.rss.channel[0].item[0].description[0].body[0].location[0].city; + + for (var i in datas) { + var data = datas[i]; + console.log(city + ' ' + data.tmEf + ' ' + data.wf + ' ' + data.tmn + '~' + data.tmx); + } + + }) + +} diff --git a/node/jpub-web-crawling-technique/02/04-xmlrss/08-weather-cheerio.js b/node/jpub-web-crawling-technique/02/04-xmlrss/08-weather-cheerio.js new file mode 100644 index 0000000000..7e806d7e39 --- /dev/null +++ b/node/jpub-web-crawling-technique/02/04-xmlrss/08-weather-cheerio.js @@ -0,0 +1,23 @@ +// 기상청 기상예보
RSS(cheerio 이용) for Node.js + +// 기상 RSS +var RSS = 'http://web.kma.go.kr/weather/forecast/mid-term-rss3.jsp?stnId=109'; + +// 모듈 로드 +var client = require('cheerio-httpcli'); + +// RSS 다운로드 +client.fetch(RSS, {}, function(err, $, res) { + if (err) { console.log(err); return; } + + // 필요한 항목을 추출해서 표시 + var city = $('location:nth-child(1) > city').text(); + $('location:nth-child(1) > data').each(function(idx) { + var tmEf = $(this).find('tmEf').text(); + var wf = $(this).find('wf').text(); + var tmn = $(this).find('tmn').text(); + var tmx = $(this).find('tmx').text(); + + console.log(city + ' ' + tmEf + ' ' + wf + ' ' + tmn + '~' + tmx); + }) +}) diff --git a/node/jpub-web-crawling-technique/02/package.json b/node/jpub-web-crawling-technique/02/package.json index 97c5e29027..f44bbf62e2 100644 --- a/node/jpub-web-crawling-technique/02/package.json +++ b/node/jpub-web-crawling-technique/02/package.json @@ -19,6 +19,7 @@ "dependencies": { "cheerio-httpcli": "^0.6.11", "fs": "0.0.1-security", - "http": "0.0.0" + "http": "0.0.0", + "xml2js": "^0.4.17" } } From 0208eed55c96f1f30812365aa3c55b7cc9a2c90c Mon Sep 17 00:00:00 2001 From: Jinho Hyeon Date: Fri, 10 Feb 2017 20:55:42 +0900 Subject: [PATCH 05/23] =?UTF-8?q?2=20-=205=20=EC=A0=95=EA=B8=B0=EC=A0=81?= =?UTF-8?q?=EC=9C=BC=EB=A1=9C=20=EB=8B=A4=EC=9A=B4=EB=A1=9C=EB=93=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../jpub-web-crawling-technique/02/.gitignore | 1 + .../02/05-cron/01-kawase-usd_krw.js | 29 +++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 node/jpub-web-crawling-technique/02/05-cron/01-kawase-usd_krw.js diff --git a/node/jpub-web-crawling-technique/02/.gitignore b/node/jpub-web-crawling-technique/02/.gitignore index 4b61772ad2..6bab19218d 100644 --- a/node/jpub-web-crawling-technique/02/.gitignore +++ b/node/jpub-web-crawling-technique/02/.gitignore @@ -2,3 +2,4 @@ node_modules/ */*.html */img/ */nodejs.org/ +*/*.txt diff --git 
a/node/jpub-web-crawling-technique/02/05-cron/01-kawase-usd_krw.js b/node/jpub-web-crawling-technique/02/05-cron/01-kawase-usd_krw.js new file mode 100644 index 0000000000..ac956aecc5 --- /dev/null +++ b/node/jpub-web-crawling-technique/02/05-cron/01-kawase-usd_krw.js @@ -0,0 +1,29 @@ +// Fetch exchange-rate information for Node.js + +// Exchange-rate API URL +var API = 'http://api.aoikujira.com/kawase/get.php?code=USD&format=json'; + +// Load modules +var request = require('request'); +var fs = require('fs'); + +// Call the web API +request(API, function(err, res, body) { + // Check for HTTP errors + if (err || res.statusCode != 200) { + console.log("ERROR", err); return; + } + + // Parse the JSON response body into a JS object + var result = JSON.parse(body); + var krw = result['krw']; + + // Save the rate to a file (the file name carries the date) + var t = new Date(); + var fname = "USD_KRW_" + + t.getFullYear() + '-' + (t.getMonth()+1) + + '-' + t.getDate() + '.txt'; + var text = '1usd = ' + krw + 'krw'; + console.log(text); + fs.writeFileSync(fname, text); +}) From 3e23c3ab11a20f58f30f61f3210464037429fced Mon Sep 17 00:00:00 2001 From: Jinho Hyeon Date: Fri, 10 Feb 2017 22:15:02 +0900 Subject: [PATCH 06/23] =?UTF-8?q?3=20=EB=A1=9C=EA=B7=B8=EC=9D=B8=EC=9D=B4?= =?UTF-8?q?=20=ED=95=84=EC=9A=94=ED=95=9C=20=EC=9B=B9=EC=82=AC=EC=9D=B4?= =?UTF-8?q?=ED=8A=B8=20=ED=81=AC=EB=A1=A4=EB=A7=81=20-=201=20PhantomJS?= =?UTF-8?q?=EC=99=80=20CasperJS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../jpub-web-crawling-technique/03/.gitignore | 2 ++ .../03/01-phantomjs/01-getTitle.js | 13 ++++++++ .../03/01-phantomjs/02-screenshot.js | 18 +++++++++++ .../03/01-phantomjs/03-flickrShot.js | 26 ++++++++++++++++ .../03/01-phantomjs/04-iphoneShot.js | 23 ++++++++++++++ .../03/01-phantomjs/05-shot-tool.js | 30 +++++++++++++++++++ .../03/package.json | 19 ++++++++++++ 7 files changed, 131 insertions(+) create mode 100644 node/jpub-web-crawling-technique/03/.gitignore create mode 100644 node/jpub-web-crawling-technique/03/01-phantomjs/01-getTitle.js create
mode 100644 node/jpub-web-crawling-technique/03/01-phantomjs/02-screenshot.js create mode 100644 node/jpub-web-crawling-technique/03/01-phantomjs/03-flickrShot.js create mode 100644 node/jpub-web-crawling-technique/03/01-phantomjs/04-iphoneShot.js create mode 100644 node/jpub-web-crawling-technique/03/01-phantomjs/05-shot-tool.js create mode 100644 node/jpub-web-crawling-technique/03/package.json diff --git a/node/jpub-web-crawling-technique/03/.gitignore b/node/jpub-web-crawling-technique/03/.gitignore new file mode 100644 index 0000000000..71711a5185 --- /dev/null +++ b/node/jpub-web-crawling-technique/03/.gitignore @@ -0,0 +1,2 @@ +/node_modules/ +*/*.jpg diff --git a/node/jpub-web-crawling-technique/03/01-phantomjs/01-getTitle.js b/node/jpub-web-crawling-technique/03/01-phantomjs/01-getTitle.js new file mode 100644 index 0000000000..815014d58f --- /dev/null +++ b/node/jpub-web-crawling-technique/03/01-phantomjs/01-getTitle.js @@ -0,0 +1,13 @@ +// 웹사이트의 타이틀을 표시하는 프로그램 +var TARGET_URL = 'http://jpub.tistory.com'; + +// CasperJS 객체 생성 +var casper = require('casper').create(); + +// 웹사이트 열기 +casper.start(TARGET_URL, function() { + // 타이틀 출력 + this.echo(casper.getTitle()); +}); + +casper.run(); diff --git a/node/jpub-web-crawling-technique/03/01-phantomjs/02-screenshot.js b/node/jpub-web-crawling-technique/03/01-phantomjs/02-screenshot.js new file mode 100644 index 0000000000..1e07657124 --- /dev/null +++ b/node/jpub-web-crawling-technique/03/01-phantomjs/02-screenshot.js @@ -0,0 +1,18 @@ +// CasperJS 화면 캡처 프로그램 + +// Casper 객체 생성 +var casper = require('casper').create(); + +// 개시 +casper.start(); + +// 페이지 열기 +casper.open('http://jpub.tistory.com/'); + +// 스크린샷 수행 +casper.then(function() { + casper.capture('screenshot.jpg'); +}); + +// 실행 +casper.run(); diff --git a/node/jpub-web-crawling-technique/03/01-phantomjs/03-flickrShot.js b/node/jpub-web-crawling-technique/03/01-phantomjs/03-flickrShot.js new file mode 100644 index 0000000000..4a8bedcf7a --- /dev/null +++ 
b/node/jpub-web-crawling-technique/03/01-phantomjs/03-flickrShot.js @@ -0,0 +1,26 @@ +// Capture Flickr search results for CasperJS +// Create the CasperJS object +var casper = require('casper').create(); + +// Start CasperJS processing +casper.start(); + +// Set the viewport size +casper.viewport(1400, 800); + +// Set the UserAgent (the value only; the header name is not part of it) +casper.userAgent('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36'); + +// Search Flickr for cats +var text = encodeURIComponent('고양이'); +casper.open('https://www.flickr.com/search/?text=' + text); + +// Capture the screen +casper.then(function() { + this.capture('flickr-cat.jpg', { + top: 0, left: 0, width: 1400, height: 800 + }); +}); + +// Start running +casper.run(); diff --git a/node/jpub-web-crawling-technique/03/01-phantomjs/04-iphoneShot.js b/node/jpub-web-crawling-technique/03/01-phantomjs/04-iphoneShot.js new file mode 100644 index 0000000000..468541a0b4 --- /dev/null +++ b/node/jpub-web-crawling-technique/03/01-phantomjs/04-iphoneShot.js @@ -0,0 +1,23 @@ +// Capture a website while pretending to be an iPhone, for CasperJS + +var TARGET_URL = 'http://jpub.tistory.com'; + +// Create Casper +var casper = require('casper').create(); +casper.start(); + +// Pretend to be an iPhone +casper.userAgent('Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53'); + +// Set the screen size +casper.viewport(750, 1334); + +casper.open(TARGET_URL); + +// Capture the screen +casper.then(function() { + this.capture('iphoneshot.jpg'); +}); + +// Run +casper.run(); diff --git a/node/jpub-web-crawling-technique/03/01-phantomjs/05-shot-tool.js b/node/jpub-web-crawling-technique/03/01-phantomjs/05-shot-tool.js new file mode 100644 index 0000000000..d29a1db5cb --- /dev/null +++ b/node/jpub-web-crawling-technique/03/01-phantomjs/05-shot-tool.js @@ -0,0 +1,30 @@ +// 커맨드 라인 인자로 지정한 웹 페이지를 캡처 for CasperJS + +var casper = require('casper').create(); +var utils = require('utils'); + +// 인자 얻기 +var args = casper.cli.args; +if (args.length < 1) { + // 사용법 표시 +
casper.echo('USES:'); + casper.echo('shot-tool URL [savepath]'); + casper.exit(); +} + +var savepath = 'casper-shot.jpg'; +var url = args[0]; +if (args.length >= 2) { + savepath = args[1]; +} + +// CasperJS 처리 개시 +casper.start(); +casper.viewport(1024, 768); +casper.open(url); +casper.then(function() { + this.capture(savepath, { + top: 0, left: 0, width: 1024, height: 768 + }); +}); +casper.run(); diff --git a/node/jpub-web-crawling-technique/03/package.json b/node/jpub-web-crawling-technique/03/package.json new file mode 100644 index 0000000000..6aa15f1256 --- /dev/null +++ b/node/jpub-web-crawling-technique/03/package.json @@ -0,0 +1,19 @@ +{ + "name": "jpub-web-crawling-technique", + "version": "1.0.0", + "description": "", + "main": "index.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "repository": { + "type": "git", + "url": "git+ssh://git@github.com/neverlish/neverlish.git" + }, + "author": "neverlish", + "license": "ISC", + "bugs": { + "url": "https://github.com/neverlish/neverlish/issues" + }, + "homepage": "https://github.com/neverlish/neverlish#readme" +} From 227fef1f1ada93d1ced31e6da5827847275519d0 Mon Sep 17 00:00:00 2001 From: Jinho Hyeon Date: Sat, 11 Feb 2017 01:38:13 +0900 Subject: [PATCH 07/23] =?UTF-8?q?3=20-=202=20=EB=A1=9C=EA=B7=B8=EC=9D=B8?= =?UTF-8?q?=20=ED=9B=84=EC=9D=98=20=EB=8D=B0=EC=9D=B4=ED=84=B0=EB=A5=BC=20?= =?UTF-8?q?=EB=8B=A4=EC=9A=B4=EB=A1=9C=EB=93=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../03/02-login/01-login.js | 37 +++++++++++++++++++ .../03/02-login/02-login-click.js | 37 +++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 node/jpub-web-crawling-technique/03/02-login/01-login.js create mode 100644 node/jpub-web-crawling-technique/03/02-login/02-login-click.js diff --git a/node/jpub-web-crawling-technique/03/02-login/01-login.js b/node/jpub-web-crawling-technique/03/02-login/01-login.js new file mode 100644 index 
0000000000..ee608d3d71 --- /dev/null +++ b/node/jpub-web-crawling-technique/03/02-login/01-login.js @@ -0,0 +1,37 @@ +var casper = require('casper').create({verbose: true}); // add logLevel: 'debug' to the options to also print debug logs + +// URL and login credentials (set TISTORY_ID / TISTORY_PASSWORD in the environment; never commit real credentials) +var url = 'http://neverlish.tistory.com/admin/center/'; +var id = require('system').env.TISTORY_ID; +var password = require('system').env.TISTORY_PASSWORD; + +casper.start(); + +casper.open(url); + +// Form Submit +casper.then(function() { + casper.fill('#authForm', + { + loginId: id, + password: password + }, true); +}); + +// Runs after login +casper.then(function() { + var getComment = function() { + // Uses the document object inside the page + return document.querySelector('#blogInfo > ul > li:nth-child(3) > span.day').innerText; + }; + console.log('새 댓글 수: ' + this.evaluate(getComment)); // evaluate() method +}); + +casper.then(function() { + var getGuestBook = function() { + return document.querySelector('#blogInfo > ul > li:nth-child(4) > span.day').innerText; + }; + console.log('새 방명록 수: ' + this.evaluate(getGuestBook)); +}); + +casper.run(); diff --git a/node/jpub-web-crawling-technique/03/02-login/02-login-click.js b/node/jpub-web-crawling-technique/03/02-login/02-login-click.js new file mode 100644 index 0000000000..323fcd071a --- /dev/null +++ b/node/jpub-web-crawling-technique/03/02-login/02-login-click.js @@ -0,0 +1,37 @@ +var casper = require('casper').create({verbose: true}); + +// URL and login credentials (set TISTORY_ID / TISTORY_PASSWORD in the environment; never commit real credentials) +var url = 'http://neverlish.tistory.com/admin/center/'; +var id = require('system').env.TISTORY_ID; +var password = require('system').env.TISTORY_PASSWORD; + +casper.start(); + +casper.open(url); + +// Log in +casper.then(function() { + casper.fill('#authForm', + { + loginId: id, + password: password + }, true); +}); + +// Mouse click +casper.then(function() { + // Click the element only if the selector matches + var path = '#blogInfo > ul > li:nth-child(2) > span.txt > a'; + if (casper.exists(path)) { + casper.mouseEvent('click', path); + } + casper.wait(3000); +}); + +casper.then(function() { + casper.capture('capture.jpg', { + top: 0, left: 0,
width: 1024, height: 768 + }); +}); + +casper.run(); From c9961697a4ad72c62623ea005f9223528911ac37 Mon Sep 17 00:00:00 2001 From: Jinho Hyeon Date: Sat, 11 Feb 2017 01:48:56 +0900 Subject: [PATCH 08/23] =?UTF-8?q?3=20-=203=20DOM=20=ED=8C=8C=EC=8B=B1=20?= =?UTF-8?q?=EB=B0=A9=EB=B2=95=EA=B3=BC=20CSS=20=EC=84=A0=ED=83=9D=EC=9E=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../03/03-dom/01-test-id.html | 18 +++++++++++++ .../03/03-dom/02-test-attr.html | 26 +++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 node/jpub-web-crawling-technique/03/03-dom/01-test-id.html create mode 100644 node/jpub-web-crawling-technique/03/03-dom/02-test-attr.html diff --git a/node/jpub-web-crawling-technique/03/03-dom/01-test-id.html b/node/jpub-web-crawling-technique/03/03-dom/01-test-id.html new file mode 100644 index 0000000000..7e58132463 --- /dev/null +++ b/node/jpub-web-crawling-technique/03/03-dom/01-test-id.html @@ -0,0 +1,18 @@ +
    +
  • Genesis
  • +
  • Exodus
  • +
  • Leviticus
  • +
  • Numbers
  • +
  • Deuteronomy
  • +
+ + + + diff --git a/node/jpub-web-crawling-technique/03/03-dom/02-test-attr.html b/node/jpub-web-crawling-technique/03/03-dom/02-test-attr.html new file mode 100644 index 0000000000..ea55580e8c --- /dev/null +++ b/node/jpub-web-crawling-technique/03/03-dom/02-test-attr.html @@ -0,0 +1,26 @@ +
+

과일이나 야채

+
    +
  • 사과
  • +
  • 포도
  • +
  • 레몬
  • +
  • 오렌지
  • +
+
    +
  • +
  • 아보카도
  • +
  • 당근
  • +
  • 연근
  • +
+
+ + +