From e58775e71ccfe962692550287211cc443f26be43 Mon Sep 17 00:00:00 2001 From: Ryohei Ueda Date: Sun, 2 Nov 2014 22:30:51 +0900 Subject: [PATCH] =?UTF-8?q?collect=5Fsamples.py=E3=81=AE=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ダウンロードの進捗状況を表示 * imgをcloseしないように --- collect_samples.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/collect_samples.py b/collect_samples.py index 778ec98..b695042 100644 --- a/collect_samples.py +++ b/collect_samples.py @@ -21,9 +21,8 @@ def getUrls( word, key, skip=0, urls=[] ): if skip: params.update( { '$skip': str( skip ) } ) - results = requests.get( prefix, auth=( key, key ), params=params ) - results = results.json - + results = requests.get( prefix, auth=( key, key ), params=params) + results = results.json() for result in results['d']['results']: typ = result[ 'ContentType' ] if typ== 'image/jpg' or typ == 'image/jpeg': @@ -35,15 +34,24 @@ def getUrls( word, key, skip=0, urls=[] ): return urls def saveImages( urls, dir ): + counter = 0 for url in urls: try: - img = requests.get( url ).content - f = open( os.path.join( dir, os.path.basename( url ) ), 'wb' ) - f.write( img ) - img.close() - f.close() - except: + counter = counter + 1 + print "writing [%d/%d]: %s" % (counter, len(urls), url) + fname = os.path.join( dir, os.path.basename( url ) ) + if not os.path.exists(fname): + img = requests.get(url, timeout=5).content + f = open(fname , 'wb' ) + f.write( img ) + f.close() + except Exception, e: + print "failed to get " + print url + print e.message pass + except requests.exceptions.ReadTimeout: + print "timeout" if __name__ == '__main__': word = settings.word @@ -52,4 +60,3 @@ def saveImages( urls, dir ): urls = getUrls( word, key ) saveImages( urls, dir ) -