From 57e85dbc05d9d60f0969ed5f952d80be056f855b Mon Sep 17 00:00:00 2001 From: fireattack Date: Sun, 12 Feb 2023 10:36:42 +0800 Subject: [PATCH 01/17] Update .gitignore and cmd.py --- .gitignore | 3 +++ gfile/cmd.py | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 68bc17f..a0e97ea 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,9 @@ __pycache__/ *.py[cod] *$py.class +.vscode/ + +test* # C extensions *.so diff --git a/gfile/cmd.py b/gfile/cmd.py index 3dd0ae2..d41e8a4 100644 --- a/gfile/cmd.py +++ b/gfile/cmd.py @@ -12,7 +12,7 @@ class Action(Enum): upload = 'upload' def __str__(self): return self.value - + def main(): @@ -20,7 +20,7 @@ def main(): parser.add_argument('action', type=Action, choices=list(Action), help='upload or download') parser.add_argument('uri', help='filename to upload or url to download') parser.add_argument('-p', '--hide-progress', dest='progress', action='store_false', default=True, help='hide progress bar') - parser.add_argument('-o', '--output', dest='output file', type=str, default=None, help='hide progress bar') + # parser.add_argument('-o', '--output', dest='output file', type=str, default=None, help='hide progress bar') #not implemented parser.add_argument('-n', '--thread-num', dest='thread_num', default=int(4), type=int, help='number of threads used for upload (can incease speed)') parser.add_argument('-s', '--chunk-size', dest='chunk_size', type=int, help='gigafile allowed chunk size per upload', default=1024*1024*100) parser.add_argument('-m', '--copy-size', dest='chunk_copy_size', type=int, help='specifies size to copy the main file into pieces (the size loaded in RAM)', default=1024*1024) @@ -30,9 +30,9 @@ def main(): gf = GFile(**args.__dict__) if args.action == Action.download: gf.download(args.chunk_copy_size, args.progress) - + else: print(gf.upload().get_download_page()) - + if __name__ == "__main__": main() From e3ae2695d8d056ca348ea0ac614af66993059574 Mon Sep 17 00:00:00 2001 From: fireattack Date: Sun, 12 Feb 2023 10:47:21 +0800 Subject: [PATCH 02/17] Update --- gfile/cmd.py | 4 +- gfile/gfile.py | 226 +++++++++++++++++++++++++++---------------------- 2 files changed, 127 insertions(+), 103 deletions(-) diff --git a/gfile/cmd.py b/gfile/cmd.py index d41e8a4..2eb5fe6 100644 --- a/gfile/cmd.py +++ b/gfile/cmd.py @@ -13,15 +13,13 @@ class Action(Enum): def __str__(self): return self.value - - def main(): parser = argparse.ArgumentParser(prog='Gfile') parser.add_argument('action', type=Action, choices=list(Action), help='upload or download') parser.add_argument('uri', help='filename to upload or url to download') parser.add_argument('-p', '--hide-progress', dest='progress', action='store_false', default=True, help='hide progress bar') # parser.add_argument('-o', '--output', dest='output file', type=str, default=None, help='hide progress bar') #not implemented - parser.add_argument('-n', '--thread-num', dest='thread_num', default=int(4), type=int, help='number of threads used for upload (can incease speed)') + parser.add_argument('-n', '--thread-num', dest='thread_num', default=8, type=int, help='number of threads used for upload (can incease speed)') parser.add_argument('-s', '--chunk-size', dest='chunk_size', type=int, help='gigafile allowed chunk size per upload', default=1024*1024*100) parser.add_argument('-m', '--copy-size', dest='chunk_copy_size', type=int, help='specifies size to copy the main file into pieces (the size loaded in RAM)', default=1024*1024) diff --git a/gfile/gfile.py b/gfile/gfile.py index 8babd59..3e28bb2 100644 --- a/gfile/gfile.py +++ b/gfile/gfile.py @@ -1,21 +1,66 @@ -from math import ceil -import os +import concurrent.futures import re -import sys import tempfile -from threading import Lock, Thread import uuid -import requests -from requests_toolbelt import MultipartEncoderMonitor -import requests as r +from math import ceil +from pathlib import Path from requests_toolbelt.multipart import encoder from tqdm import tqdm - +import time + + +def requests_retry_session( + retries=5, + backoff_factor=0.2, + status_forcelist=None, # (500, 502, 504) + session=None, +): + import requests + from requests.adapters import HTTPAdapter + from urllib3.util.retry import Retry + + session = session or requests.Session() + retry = Retry( + total=retries, + backoff_factor=backoff_factor, + status_forcelist=status_forcelist, + ) + adapter = HTTPAdapter(max_retries=retry) + session.mount('http://', adapter) + session.mount('https://', adapter) + return session + + +def split_file(input_file, out, target_size=None, start=0, chunk_copy_size=1024*1024): + input_file = Path(input_file) + size = 0 + + input_size = input_file.stat().st_size + if target_size is None: + output_size = input_size - start + else: + output_size = min( target_size, input_size - start ) + + # print('input_size:', input_size) + # print('output_size:', output_size) + + with open(input_file, 'rb') as f: + f.seek(start) + while True: + # print(f'{size / output_size * 100:.2f}%', end='\r') + if size == output_size: break + if size > output_size: + raise Exception(f'Size ({size}) is larger than {target_size} bytes!') + current_chunk_size = min(chunk_copy_size, output_size - size) + chunk = f.read(current_chunk_size) + if not chunk: break + size += len(chunk) + out.write(chunk) class GFile: - def __init__(self, uri, progress=False, thread_num=4, chunk_size=1024*1024*100, chunk_copy_size=1024*1024, **kwargs) -> None: + def __init__(self, uri, progress=False, thread_num=4, chunk_size=1024*1024*10, chunk_copy_size=1024*1024, **kwargs) -> None: self.uri = uri self.chunk_size = chunk_size self.chunk_copy_size = chunk_copy_size @@ -23,104 +68,85 @@ def __init__(self, uri, progress=False, thread_num=4, chunk_size=1024*1024*100, self.progress = progress self.data = None self.pbar: tqdm = None - self.session = requests.Session() + self.session = requests_retry_session() self.index = 0 + self.cookies = None + + + def upload_chunk(self, chunk_no, chunks): + + chunk_id = f'chunk {chunk_no}' + if self.pbar: + self.pbar.desc = chunk_id + with tempfile.NamedTemporaryFile() as f: + split_file(self.uri, f, self.chunk_size, start=chunk_no * self.chunk_size, chunk_copy_size=self.chunk_copy_size) + chunk_size = f.tell() + # print('chunk size:', chunk_size) + f.seek(0) + fields = { + "id": self.token, + "name": Path(self.uri).name, + "chunk": str(chunk_no), + "chunks": str(chunks), + "lifetime": "100", + "file": ("blob", f, "application/octet-stream"), + } + form = encoder.MultipartEncoder(fields) + + headers = { + "content-type": form.content_type, + } + # print("Session gfsid:", self.session.cookies['gfsid']) + # print(f'Updating chunk {chunk_no + 1} out of {chunks} chunks') + resp = self.session.post(f"https://{self.server}/upload_chunk.php", headers=headers, data=form) + if self.pbar: + self.pbar.update(chunk_size) + # print("Session gfsid after uploading:", self.session.cookies['gfsid']) + # print('resp', resp.cookies.__dict__) + resp_data = resp.json() + # print(resp_data) + if 'url' in resp_data: + self.data = resp_data + if 'status' not in resp_data or resp_data['status']: + print(resp_data) + self.failed = True - def upload_chunk(self, chunks): - self.lock.acquire() - with open(self.uri, 'rb') as ff: - while not self.failed and self.index < chunks: - chunk_id = f'chunk {self.index}' - if self.pbar: - self.pbar.desc = chunk_id - with tempfile.NamedTemporaryFile() as f: - i = 0 - chunk = ff.read(self.chunk_copy_size) - while i < self.chunk_size and chunk: - f.write(chunk) - i += self.chunk_copy_size - chunk = ff.read(self.chunk_copy_size) - - f.seek(0) - - fields = { - "id": self.token, - "name": os.path.basename(self.uri), - "chunk": str(self.index), - "chunks": str(chunks), - "lifetime": "7", - "file": (self.uri, f, "application/octet-stream"), - } - # print(fields) - - released = False - - self.index += 1 - - - def progress(monitor: MultipartEncoderMonitor): - nonlocal released - self.pbar.update(monitor.bytes_read - monitor.prog) - monitor.prog = monitor.bytes_read - if self.failed: self.session.close() - if not released and monitor.bytes_read > i/10: - self.lock.release() - released = True - - form = encoder.MultipartEncoder(fields) - if self.pbar: - - form = encoder.MultipartEncoderMonitor(form, progress) - setattr(form, 'prog', 0) - server = re.search( - r'var server = "(.+?)"', self.session.get('https://gigafile.nu/').text)[1] - headers = { - "content-type": form.content_type, - } - resp = self.session.post( - f"https://{server}/upload_chunk.php", headers=headers, data=form).json() - - if 'url' in resp: - self.data = resp - - if 'status' not in resp or resp['status']: - print(resp) - self.failed = True - if self.failed: break - self.lock.acquire() - if self.lock.locked(): self.lock.release() - - def upload(self): self.token = uuid.uuid1().hex self.pbar = None self.failed = False self.index = 0 - self.lock = Lock() - size = os.path.getsize(self.uri) + assert Path(self.uri).exists() + size = Path(self.uri).stat().st_size chunks = ceil(size / self.chunk_size) + # print(f'Total chunks: {chunks}') if self.progress: self.pbar = tqdm(total=size, unit="B", unit_scale=True, leave=False, unit_divisor=1024) - self.session = requests.Session() - # self.session.get('https://gigafile.nu/') - threads = [] - for _ in range(self.thread_num): - t = Thread(target=self.upload_chunk, args=(chunks,)) - threads.append(t) - t.start() - - try: - for t in threads: - t.join() - + self.session = requests_retry_session() + self.server = re.search(r'var server = "(.+?)"', self.session.get('https://gigafile.nu/').text)[1] + + # upload the first chunk + self.upload_chunk(0, chunks) + time.sleep(1) + with concurrent.futures.ThreadPoolExecutor(max_workers=self.thread_num) as ex: + for i in range(1, chunks - 1): + ex.submit(self.upload_chunk, i, chunks) + if self.failed: + print('Failed!') + return + time.sleep(1) + print('upload the last chunk') + self.upload_chunk(chunks - 1, chunks) + + if self.pbar: self.pbar.close() - if 'url' not in self.data: - print('something went wrong', self.data) - except KeyboardInterrupt: - self.pbar.close() - self.failed = True - print('Aborted! cleaning...') + if 'url' not in self.data: + print('something went wrong', self.data) + # except KeyboardInterrupt: + # self.pbar.close() + # self.failed = True + # print('Aborted! cleaning...') return self def get_download_page(self): return self.data and self.data['url'] @@ -128,7 +154,7 @@ def get_file_id(self): return self.data and self.data['filename'] def get_download(self): _data: dict[str, str] = self.data - if not os.path.exists(self.uri): + if not Path(self.uri).exists(): data = re.search(r'^https?:\/\/\d+?\.gigafile\.nu\/([a-z0-9-]+)$', self.uri) if data: _data = {'url': self.uri, 'filename': data[1]} @@ -138,22 +164,22 @@ def get_download(self): if not _data: return ValueError('You specified no file to upload nor to download') - sess = requests.Session() + sess = requests_retry_session() sess.get(_data['url']) return (_data['url'].replace(_data['filename'], 'download.php?file='+_data['filename']), sess.cookies) def download(self, copy_size=1024*1024, progress=True, filename=None): url, cookies = self.get_download() if not filename: - headers = r.head(url, cookies=cookies).headers + headers = requests_retry_session().head(url, cookies=cookies).headers filesize = int(headers['Content-Length']) filename = re.search(r'filename="(.+?)";', headers['Content-Disposition'])[1] filename = re.sub(r'\\|\/|:|\*|\?|"|<|>|\|', '_', filename) if progress: pbar = tqdm(total=filesize, unit='B', unit_scale=True, desc=filename) - + with open(filename, 'wb') as f: - with r.get(url, cookies=cookies, stream=True) as req: + with requests_retry_session().get(url, cookies=cookies, stream=True) as req: req.raise_for_status() for chunk in req.iter_content(chunk_size=copy_size): f.write(chunk) From bfed7ef32558df01cd5784db7c46c2696b06ffa3 Mon Sep 17 00:00:00 2001 From: fireattack Date: Mon, 13 Feb 2023 10:28:40 +0800 Subject: [PATCH 03/17] Multi-thread: make sure each thread is finished in order --- gfile/gfile.py | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/gfile/gfile.py b/gfile/gfile.py index 3e28bb2..dedaa84 100644 --- a/gfile/gfile.py +++ b/gfile/gfile.py @@ -6,6 +6,7 @@ from pathlib import Path from requests_toolbelt.multipart import encoder +from requests_toolbelt.streaming_iterator import StreamingIterator from tqdm import tqdm import time @@ -71,6 +72,7 @@ def __init__(self, uri, progress=False, thread_num=4, chunk_size=1024*1024*10, c self.session = requests_retry_session() self.index = 0 self.cookies = None + self.current_chunk = 0 def upload_chunk(self, chunk_no, chunks): @@ -91,14 +93,30 @@ def upload_chunk(self, chunk_no, chunks): "lifetime": "100", "file": ("blob", f, "application/octet-stream"), } - form = encoder.MultipartEncoder(fields) + form = encoder.MultipartEncoder(fields) + form_str = form.to_string() + size = len(form_str) + def gen(): + offset = 0 + while True: + if offset < size: + yield form_str[offset:offset+1024] + offset += 1024 + else: + if chunk_no != self.current_chunk: + time.sleep(0.1) + else: + break + + streamer = StreamingIterator(size, gen()) headers = { "content-type": form.content_type, } # print("Session gfsid:", self.session.cookies['gfsid']) # print(f'Updating chunk {chunk_no + 1} out of {chunks} chunks') - resp = self.session.post(f"https://{self.server}/upload_chunk.php", headers=headers, data=form) + resp = self.session.post(f"https://{self.server}/upload_chunk.php", data=streamer, headers=headers) + self.current_chunk += 1 if self.pbar: self.pbar.update(chunk_size) # print("Session gfsid after uploading:", self.session.cookies['gfsid']) @@ -120,24 +138,31 @@ def upload(self): assert Path(self.uri).exists() size = Path(self.uri).stat().st_size chunks = ceil(size / self.chunk_size) - # print(f'Total chunks: {chunks}') + print(f'Total chunks: {chunks}') + if self.progress: - self.pbar = tqdm(total=size, unit="B", unit_scale=True, leave=False, unit_divisor=1024) + self.pbar = tqdm(total=size, unit="B", unit_scale=True, leave=False, unit_divisor=1024, ncols=100) + self.session = requests_retry_session() self.server = re.search(r'var server = "(.+?)"', self.session.get('https://gigafile.nu/').text)[1] + # upload the first chunk self.upload_chunk(0, chunks) - time.sleep(1) + + # for i in range(1, chunks-1): + # self.upload_chunk(i, chunks) + with concurrent.futures.ThreadPoolExecutor(max_workers=self.thread_num) as ex: for i in range(1, chunks - 1): ex.submit(self.upload_chunk, i, chunks) if self.failed: print('Failed!') return - time.sleep(1) - print('upload the last chunk') - self.upload_chunk(chunks - 1, chunks) + + if chunks > 1: + print('\nupload the last chunk in single thread') + self.upload_chunk(chunks - 1, chunks) if self.pbar: self.pbar.close() From cb834fe4b47922d5ff164fb0a41c509d30e20d39 Mon Sep 17 00:00:00 2001 From: fireattack Date: Mon, 13 Feb 2023 11:50:36 +0800 Subject: [PATCH 04/17] Delete temp file as soon as possible and other fix --- gfile/gfile.py | 89 ++++++++++++++++++++++++++------------------------ 1 file changed, 47 insertions(+), 42 deletions(-) diff --git a/gfile/gfile.py b/gfile/gfile.py index dedaa84..8d9cd6f 100644 --- a/gfile/gfile.py +++ b/gfile/gfile.py @@ -74,13 +74,9 @@ def __init__(self, uri, progress=False, thread_num=4, chunk_size=1024*1024*10, c self.cookies = None self.current_chunk = 0 - def upload_chunk(self, chunk_no, chunks): - chunk_id = f'chunk {chunk_no}' - if self.pbar: - self.pbar.desc = chunk_id - with tempfile.NamedTemporaryFile() as f: + with tempfile.NamedTemporaryFile(dir='.') as f: split_file(self.uri, f, self.chunk_size, start=chunk_no * self.chunk_size, chunk_copy_size=self.chunk_copy_size) chunk_size = f.tell() # print('chunk size:', chunk_size) @@ -93,42 +89,44 @@ def upload_chunk(self, chunk_no, chunks): "lifetime": "100", "file": ("blob", f, "application/octet-stream"), } - form = encoder.MultipartEncoder(fields) - form_str = form.to_string() - size = len(form_str) - def gen(): - offset = 0 - while True: - if offset < size: - yield form_str[offset:offset+1024] - offset += 1024 - else: - if chunk_no != self.current_chunk: - time.sleep(0.1) - else: - break - - streamer = StreamingIterator(size, gen()) headers = { "content-type": form.content_type, } - # print("Session gfsid:", self.session.cookies['gfsid']) - # print(f'Updating chunk {chunk_no + 1} out of {chunks} chunks') - resp = self.session.post(f"https://{self.server}/upload_chunk.php", data=streamer, headers=headers) - self.current_chunk += 1 - if self.pbar: - self.pbar.update(chunk_size) - # print("Session gfsid after uploading:", self.session.cookies['gfsid']) - # print('resp', resp.cookies.__dict__) - resp_data = resp.json() - # print(resp_data) - if 'url' in resp_data: - self.data = resp_data - if 'status' not in resp_data or resp_data['status']: - print(resp_data) - self.failed = True + form_str = form.to_string() + + size = len(form_str) + def gen(): + offset = 0 + while True: + if offset < size: + yield form_str[offset:offset+1024] + offset += 1024 + else: + if chunk_no != self.current_chunk: + time.sleep(0.01) + else: + time.sleep(0.1) + break + streamer = StreamingIterator(size, gen()) + + # print("Session gfsid:", self.session.cookies['gfsid']) + # print(f'Updating chunk {chunk_no + 1} out of {chunks} chunks') + resp = self.session.post(f"https://{self.server}/upload_chunk.php", data=streamer, headers=headers) + self.current_chunk += 1 + if self.pbar: + self.pbar.desc = f'Finished {chunk_no + 1}/{chunks} chunks' + self.pbar.update(chunk_size) + # print("Session gfsid after uploading:", self.session.cookies['gfsid']) + # print('resp', resp.cookies.__dict__) + resp_data = resp.json() + # print(resp_data) + if 'url' in resp_data: + self.data = resp_data + if 'status' not in resp_data or resp_data['status']: + print(resp_data) + self.failed = True def upload(self): self.token = uuid.uuid1().hex @@ -152,13 +150,20 @@ def upload(self): # for i in range(1, chunks-1): # self.upload_chunk(i, chunks) - with concurrent.futures.ThreadPoolExecutor(max_workers=self.thread_num) as ex: - for i in range(1, chunks - 1): - ex.submit(self.upload_chunk, i, chunks) - if self.failed: - print('Failed!') - return + futures = {ex.submit(self.upload_chunk, i, chunks): i for i in range(1, chunks - 1)} + try: + for future in concurrent.futures.as_completed(futures): + if self.failed: + print('Failed!') + for future in futures: + future.cancel() + return + except KeyboardInterrupt: + print('\nUser cancelled the operation.') + for future in futures: + future.cancel() + return if chunks > 1: print('\nupload the last chunk in single thread') From f68ebd19c2303d42a2b4ca8c7d6cf4519e3ea9c7 Mon Sep 17 00:00:00 2001 From: fireattack Date: Mon, 13 Feb 2023 16:11:09 +0800 Subject: [PATCH 05/17] Fix double memory usage; fix download filename parsing --- gfile/gfile.py | 50 +++++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/gfile/gfile.py b/gfile/gfile.py index 8d9cd6f..632dcfa 100644 --- a/gfile/gfile.py +++ b/gfile/gfile.py @@ -1,15 +1,13 @@ import concurrent.futures import re -import tempfile import uuid from math import ceil from pathlib import Path - -from requests_toolbelt.multipart import encoder -from requests_toolbelt.streaming_iterator import StreamingIterator +import io +from requests_toolbelt import MultipartEncoder, StreamingIterator from tqdm import tqdm import time - +from urllib.parse import unquote def requests_retry_session( retries=5, @@ -41,10 +39,7 @@ def split_file(input_file, out, target_size=None, start=0, chunk_copy_size=1024* if target_size is None: output_size = input_size - start else: - output_size = min( target_size, input_size - start ) - - # print('input_size:', input_size) - # print('output_size:', output_size) + output_size = min( target_size, input_size - start) with open(input_file, 'rb') as f: f.seek(start) @@ -74,12 +69,24 @@ def __init__(self, uri, progress=False, thread_num=4, chunk_size=1024*1024*10, c self.cookies = None self.current_chunk = 0 + def upload_chunk(self, chunk_no, chunks): + # import tracemalloc + + # tracemalloc.start() + # prev = tracemalloc.get_traced_memory()[0] - with tempfile.NamedTemporaryFile(dir='.') as f: + # def memo(text=''): + # nonlocal prev + + # current = tracemalloc.get_traced_memory()[0] + # print(f'Memory change at {text}', current - prev) + # prev = current + + # memo('Before load') + with io.BytesIO() as f: split_file(self.uri, f, self.chunk_size, start=chunk_no * self.chunk_size, chunk_copy_size=self.chunk_copy_size) chunk_size = f.tell() - # print('chunk size:', chunk_size) f.seek(0) fields = { "id": self.token, @@ -89,18 +96,21 @@ def upload_chunk(self, chunk_no, chunks): "lifetime": "100", "file": ("blob", f, "application/octet-stream"), } - form = encoder.MultipartEncoder(fields) + form_data = MultipartEncoder(fields) headers = { - "content-type": form.content_type, + "content-type": form_data.content_type, } - form_str = form.to_string() + # convert the form-data into a binary string, this way we can control throttle its read() behavior + form_data_binary = form_data.to_string() + del form_data + + size = len(form_data_binary) - size = len(form_str) def gen(): offset = 0 while True: if offset < size: - yield form_str[offset:offset+1024] + yield form_data_binary[offset:offset+1024] offset += 1024 else: if chunk_no != self.current_chunk: @@ -144,7 +154,6 @@ def upload(self): self.session = requests_retry_session() self.server = re.search(r'var server = "(.+?)"', self.session.get('https://gigafile.nu/').text)[1] - # upload the first chunk self.upload_chunk(0, chunks) @@ -203,10 +212,13 @@ def download(self, copy_size=1024*1024, progress=True, filename=None): if not filename: headers = requests_retry_session().head(url, cookies=cookies).headers filesize = int(headers['Content-Length']) - filename = re.search(r'filename="(.+?)";', headers['Content-Disposition'])[1] + if "UTF-8''" in headers['Content-Disposition']: + filename = unquote(headers['Content-Disposition'].split("UTF-8''")[-1]) + else: + filename = re.search(r'filename="(.+?)";', headers['Content-Disposition'])[1].encode('iso8859-1','ignore').decode('utf-8', 'ignore') filename = re.sub(r'\\|\/|:|\*|\?|"|<|>|\|', '_', filename) if progress: - pbar = tqdm(total=filesize, unit='B', unit_scale=True, desc=filename) + pbar = tqdm(total=filesize, unit='B', unit_scale=True, unit_divisor=1024, desc=filename, ncols=100) with open(filename, 'wb') as f: with requests_retry_session().get(url, cookies=cookies, stream=True) as req: From 9582570f51bd99287836b9edc3679ae6c545bbb5 Mon Sep 17 00:00:00 2001 From: fireattack Date: Mon, 13 Feb 2023 17:06:04 +0800 Subject: [PATCH 06/17] Better output --- gfile/cmd.py | 18 +++++++++++++++--- gfile/gfile.py | 2 +- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/gfile/cmd.py b/gfile/cmd.py index 2eb5fe6..d5b9917 100644 --- a/gfile/cmd.py +++ b/gfile/cmd.py @@ -1,9 +1,10 @@ import argparse from enum import Enum -import re +from datetime import datetime +from pathlib import Path +import math -from tqdm import tqdm if __name__ == "__main__": from gfile import GFile else: from .gfile import GFile @@ -13,6 +14,15 @@ class Action(Enum): def __str__(self): return self.value + +def convert_size(size_bytes): + if size_bytes == 0: + return "0B" + units = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") + i = int(math.floor(math.log(size_bytes, 1024))) + p = math.pow(1024, i) + return f"{size_bytes/p:.02f} {units[i]}" + def main(): parser = argparse.ArgumentParser(prog='Gfile') parser.add_argument('action', type=Action, choices=list(Action), help='upload or download') @@ -30,7 +40,9 @@ def main(): gf.download(args.chunk_copy_size, args.progress) else: - print(gf.upload().get_download_page()) + url = gf.upload().get_download_page() + print(f"Finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}, filename: {gf.uri}, size: {convert_size(Path(gf.uri).stat().st_size)}") + print(url) if __name__ == "__main__": main() diff --git a/gfile/gfile.py b/gfile/gfile.py index 632dcfa..e2148be 100644 --- a/gfile/gfile.py +++ b/gfile/gfile.py @@ -175,7 +175,7 @@ def upload(self): return if chunks > 1: - print('\nupload the last chunk in single thread') + # print('\nupload the last chunk in single thread') self.upload_chunk(chunks - 1, chunks) if self.pbar: From 944f03bf9a6347f6757da4cef9dfb1375c7a9612 Mon Sep 17 00:00:00 2001 From: fireattack Date: Tue, 14 Feb 2023 14:05:27 +0800 Subject: [PATCH 07/17] Refactor --- gfile/cmd.py | 27 +++-------- gfile/gfile.py | 120 ++++++++++++++++++++++++++++--------------------- 2 files changed, 74 insertions(+), 73 deletions(-) diff --git a/gfile/cmd.py b/gfile/cmd.py index d5b9917..a845825 100644 --- a/gfile/cmd.py +++ b/gfile/cmd.py @@ -1,9 +1,6 @@ import argparse from enum import Enum -from datetime import datetime -from pathlib import Path -import math if __name__ == "__main__": from gfile import GFile else: from .gfile import GFile @@ -14,35 +11,23 @@ class Action(Enum): def __str__(self): return self.value - -def convert_size(size_bytes): - if size_bytes == 0: - return "0B" - units = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") - i = int(math.floor(math.log(size_bytes, 1024))) - p = math.pow(1024, i) - return f"{size_bytes/p:.02f} {units[i]}" - def main(): parser = argparse.ArgumentParser(prog='Gfile') parser.add_argument('action', type=Action, choices=list(Action), help='upload or download') parser.add_argument('uri', help='filename to upload or url to download') parser.add_argument('-p', '--hide-progress', dest='progress', action='store_false', default=True, help='hide progress bar') - # parser.add_argument('-o', '--output', dest='output file', type=str, default=None, help='hide progress bar') #not implemented - parser.add_argument('-n', '--thread-num', dest='thread_num', default=8, type=int, help='number of threads used for upload (can incease speed)') - parser.add_argument('-s', '--chunk-size', dest='chunk_size', type=int, help='gigafile allowed chunk size per upload', default=1024*1024*100) - parser.add_argument('-m', '--copy-size', dest='chunk_copy_size', type=int, help='specifies size to copy the main file into pieces (the size loaded in RAM)', default=1024*1024) + parser.add_argument('-o', '--output', type=str, default=None, help='output filename for download') + parser.add_argument('-n', '--thread-num', dest='thread_num', default=8, type=int, help='number of threads used for upload [default: 8]') + parser.add_argument('-s', '--chunk-size', dest='chunk_size', default="100MB", help='chunk size per upload in bytes; note: chunk_size*thread will be loaded into memory [default: 100MB]') + parser.add_argument('-m', '--copy-size', dest='chunk_copy_size', default="1MB", help='specifies size to copy the main file into pieces [default: 1MB]') args = parser.parse_args() gf = GFile(**args.__dict__) if args.action == Action.download: - gf.download(args.chunk_copy_size, args.progress) - + gf.download(args.output) else: - url = gf.upload().get_download_page() - print(f"Finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}, filename: {gf.uri}, size: {convert_size(Path(gf.uri).stat().st_size)}") - print(url) + gf.upload() if __name__ == "__main__": main() diff --git a/gfile/gfile.py b/gfile/gfile.py index e2148be..b9b7b53 100644 --- a/gfile/gfile.py +++ b/gfile/gfile.py @@ -8,6 +8,28 @@ from tqdm import tqdm import time from urllib.parse import unquote +import math + + +def bytes_to_size_str(bytes): + if bytes == 0: + return "0B" + units = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") + i = int(math.floor(math.log(bytes, 1024))) + p = math.pow(1024, i) + return f"{bytes/p:.02f} {units[i]}" + + +def size_str_to_bytes(size_str): + if isinstance(size_str, int): + return size_str + m = re.search(r'^(?P\d+) ?(?P[KMGTPEZY]i?B)?$', size_str, re.IGNORECASE) + assert m + units = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") + unit = m['unit'] or 'B' + unit = unit.upper().replace('I', '') + return int(math.pow(1024, units.index(unit)) * int(m['num'])) + def requests_retry_session( retries=5, @@ -55,11 +77,10 @@ def split_file(input_file, out, target_size=None, start=0, chunk_copy_size=1024* out.write(chunk) class GFile: - def __init__(self, uri, progress=False, thread_num=4, chunk_size=1024*1024*10, chunk_copy_size=1024*1024, **kwargs) -> None: self.uri = uri - self.chunk_size = chunk_size - self.chunk_copy_size = chunk_copy_size + self.chunk_size = size_str_to_bytes(chunk_size) + self.chunk_copy_size = size_str_to_bytes(chunk_copy_size) self.thread_num=thread_num self.progress = progress self.data = None @@ -100,7 +121,7 @@ def upload_chunk(self, chunk_no, chunks): headers = { "content-type": form_data.content_type, } - # convert the form-data into a binary string, this way we can control throttle its read() behavior + # convert the form-data into a binary string, this way we can control/throttle its read() behavior form_data_binary = form_data.to_string() del form_data @@ -138,6 +159,7 @@ def gen(): print(resp_data) self.failed = True + def upload(self): self.token = uuid.uuid1().hex self.pbar = None @@ -157,8 +179,7 @@ def upload(self): # upload the first chunk self.upload_chunk(0, chunks) - # for i in range(1, chunks-1): - # self.upload_chunk(i, chunks) + # upload second to second last chunk(s) with concurrent.futures.ThreadPoolExecutor(max_workers=self.thread_num) as ex: futures = {ex.submit(self.upload_chunk, i, chunks): i for i in range(1, chunks - 1)} try: @@ -174,56 +195,51 @@ def upload(self): future.cancel() return + # upload last chunk if not already if chunks > 1: # print('\nupload the last chunk in single thread') self.upload_chunk(chunks - 1, chunks) - if self.pbar: - self.pbar.close() + if self.pbar: self.pbar.close() if 'url' not in self.data: - print('something went wrong', self.data) - # except KeyboardInterrupt: - # self.pbar.close() - # self.failed = True - # print('Aborted! cleaning...') - return self - - def get_download_page(self): return self.data and self.data['url'] - def get_file_id(self): return self.data and self.data['filename'] - - def get_download(self): - _data: dict[str, str] = self.data - if not Path(self.uri).exists(): - data = re.search(r'^https?:\/\/\d+?\.gigafile\.nu\/([a-z0-9-]+)$', self.uri) - if data: - _data = {'url': self.uri, 'filename': data[1]} - else: - raise ValueError('URL invalid') - - if not _data: - return ValueError('You specified no file to upload nor to download') - - sess = requests_retry_session() - sess.get(_data['url']) - return (_data['url'].replace(_data['filename'], 'download.php?file='+_data['filename']), sess.cookies) - - def download(self, copy_size=1024*1024, progress=True, filename=None): - url, cookies = self.get_download() - if not filename: - headers = requests_retry_session().head(url, cookies=cookies).headers - filesize = int(headers['Content-Length']) - if "UTF-8''" in headers['Content-Disposition']: - filename = unquote(headers['Content-Disposition'].split("UTF-8''")[-1]) - else: - filename = re.search(r'filename="(.+?)";', headers['Content-Disposition'])[1].encode('iso8859-1','ignore').decode('utf-8', 'ignore') - filename = re.sub(r'\\|\/|:|\*|\?|"|<|>|\|', '_', filename) - if progress: - pbar = tqdm(total=filesize, unit='B', unit_scale=True, unit_divisor=1024, desc=filename, ncols=100) - - with open(filename, 'wb') as f: - with requests_retry_session().get(url, cookies=cookies, stream=True) as req: - req.raise_for_status() - for chunk in req.iter_content(chunk_size=copy_size): + print('Something went wrong', self.data) + else: + self.get_download_page() + + + def get_download_page(self): + from datetime import datetime + f = Path(self.uri) + print(f"Finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}, filename: {f.name}, size: {bytes_to_size_str(f.stat().st_size)}") + print(self.data['url']) + + + def download(self, filename=None): + m = re.search(r'^https?:\/\/\d+?\.gigafile\.nu\/([a-z0-9-]+)$', self.uri) + if not m: + print('Invalid URL.') + return + self.session.get(self.uri) # setup cookie + file_id = m[1] + download_url = self.uri.replace(file_id, 'download.php?file=' + file_id) + with self.session.get(download_url, stream=True) as r: + r.raise_for_status() + filesize = int(r.headers['Content-Length']) + if not filename: + content_disp = r.headers['Content-Disposition'] + if "UTF-8''" in content_disp: + filename = unquote(content_disp.split("UTF-8''")[-1]) + else: + filename = re.search(r'filename="(.+?)";', content_disp)[1].encode('iso8859-1','ignore').decode('utf-8', 'ignore') + filename = re.sub(r'[\\/:*?"<>|]', '_', filename) # only sanitize remote filename. User provided are on users' own. + if self.progress: + self.pbar = tqdm(total=filesize, unit='B', unit_scale=True, unit_divisor=1024, desc=filename, ncols=100) + with open(filename, 'wb') as f: + for chunk in r.iter_content(chunk_size=self.chunk_copy_size): f.write(chunk) - if pbar: pbar.update(len(chunk)) + if self.pbar: self.pbar.update(len(chunk)) + if self.pbar: self.pbar.close() + + filesize_downloaded = Path(filename).stat().st_size + print(f'Filesize check: expected: {filesize}; actual: {filesize_downloaded}. {"Succeeded." if filesize==filesize_downloaded else "Failed!"}') return filename From dbecaa41207fc8409cf2738da88e1fa9a84bd964 Mon Sep 17 00:00:00 2001 From: fireattack Date: Tue, 14 Feb 2023 14:33:37 +0800 Subject: [PATCH 08/17] Fix build --- gfile/__main__.py | 4 ++++ pyproject.toml | 3 +++ setup.py | 8 ++++---- 3 files changed, 11 insertions(+), 4 deletions(-) create mode 100644 gfile/__main__.py create mode 100644 pyproject.toml diff --git a/gfile/__main__.py b/gfile/__main__.py new file mode 100644 index 0000000..bda8a5f --- /dev/null +++ b/gfile/__main__.py @@ -0,0 +1,4 @@ +from . import cmd + +if __name__ == '__main__': + cmd.main() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..49986e2 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +build-backend = 'setuptools.build_meta' +requires = ['setuptools'] \ No newline at end of file diff --git a/setup.py b/setup.py index 719a2af..1eb813a 100644 --- a/setup.py +++ b/setup.py @@ -2,11 +2,11 @@ setup( name='gfile', - version='2.1', + version='3.0', description='A python module to download and upload from gigafile.nu', - author='Sraqzit', - author_email='kingofmestry@gmail.com', - install_requires=['requests==2.25.1', 'requests_toolbelt==0.9.1', 'tqdm==4.61.2'], + author='Sraqzit, firattack', + author_email='kingofmestry@gmail.com, (just use GitHub)', + install_requires=['requests>=2.25.1', 'requests_toolbelt>=0.9.1', 'tqdm>=4.61.2'], requires=[], packages=['gfile'], platforms=["Linux", "Mac OS-X", "Windows", "Unix"], From c4f63d5dd38115c7994612789a5a06096dd6e352 Mon Sep 17 00:00:00 2001 From: fireattack Date: Tue, 14 Feb 2023 14:55:27 +0800 Subject: [PATCH 09/17] Support more natural file size string --- gfile/cmd.py | 2 +- gfile/gfile.py | 28 +++++++++++++++------------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/gfile/cmd.py b/gfile/cmd.py index a845825..d882465 100644 --- a/gfile/cmd.py +++ b/gfile/cmd.py @@ -27,7 +27,7 @@ def main(): if args.action == Action.download: gf.download(args.output) else: - gf.upload() + gf.upload().get_download_page() if __name__ == "__main__": main() diff --git a/gfile/gfile.py b/gfile/gfile.py index b9b7b53..0c6bc17 100644 --- a/gfile/gfile.py +++ b/gfile/gfile.py @@ -1,14 +1,16 @@ import concurrent.futures +import io +import math import re +import time import uuid +from datetime import datetime from math import ceil from pathlib import Path -import io +from urllib.parse import unquote + from requests_toolbelt import MultipartEncoder, StreamingIterator from tqdm import tqdm -import time -from urllib.parse import unquote -import math def bytes_to_size_str(bytes): @@ -23,11 +25,10 @@ def bytes_to_size_str(bytes): def size_str_to_bytes(size_str): if isinstance(size_str, int): return size_str - m = re.search(r'^(?P\d+) ?(?P[KMGTPEZY]i?B)?$', size_str, re.IGNORECASE) + m = re.search(r'^(?P\d+) ?((?P[KMGTPEZY]?)(iB|B)?)$', size_str, re.IGNORECASE) assert m - units = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") - unit = m['unit'] or 'B' - unit = unit.upper().replace('I', '') + units = ("B", "K", "M", "G", "T", "P", "E", "Z", "Y") + unit = (m['unit'] or 'B').upper() return int(math.pow(1024, units.index(unit)) * int(m['num'])) @@ -168,7 +169,7 @@ def upload(self): assert Path(self.uri).exists() size = Path(self.uri).stat().st_size chunks = ceil(size / self.chunk_size) - print(f'Total chunks: {chunks}') + print(f'Filesize {bytes_to_size_str(size)}, chunk size: {bytes_to_size_str(self.chunk_size)}, total chunks: {chunks}') if self.progress: self.pbar = tqdm(total=size, unit="B", unit_scale=True, leave=False, unit_divisor=1024, ncols=100) @@ -202,16 +203,17 @@ def upload(self): if self.pbar: self.pbar.close() if 'url' not in self.data: - print('Something went wrong', self.data) - else: - self.get_download_page() + print('Something went wrong. Upload failed.', self.data) + return self # for chain def get_download_page(self): - from datetime import datetime + if not self.data or not 'url' in self.data: + return f = Path(self.uri) print(f"Finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}, filename: {f.name}, size: {bytes_to_size_str(f.stat().st_size)}") print(self.data['url']) + return self.data['url'] def download(self, filename=None): From 6fa66cb6550c33885595e9df1cbab26fd0a18a30 Mon Sep 17 00:00:00 2001 From: fireattack Date: Tue, 14 Feb 2023 14:55:32 +0800 Subject: [PATCH 10/17] Update readme --- README.md | 75 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index b12186e..2e19b89 100644 --- a/README.md +++ b/README.md @@ -1,44 +1,55 @@ A python module to download and upload from [gigafile](https://gigafile.nu/). -This is a personal tool as you may come across errors and bugs so feel free to add an issue. -# Install +A major update from [the original](https://github.com/Sraq-Zit/gfile). Highlights: + +* Fixed multi-thread uploading (and made sure each threads finish in order so the final file is not broken) +* Fixed download filename issue +* Some refactoring and QoL changes. +# Install $ python setup.py install --user - or - $ pip install git+https://github.com/Sraq-Zit/gfile.git +or -# Usage + $ pip install git+https://github.com/fireattack/gfile.git +# Usage ## Module ### Import - from gfile import GFile +```py +from gfile import GFile +``` ### Download - url, cookies = GFile('https://XX.gigafile.nu/YYY').get_download() - # or - filename = GFile('https://XX.gigafile.nu/YYY').download() +```py +filename = GFile('https://XX.gigafile.nu/YYY').download() +``` + ### Upload - url = GFile('path/to/file', progress=True).upload().get_download_page() +```py +url = GFile('path/to/file', progress=True).upload().get_download_page() +``` ## CLI - - $ gfile upload path/to/file - - $ gfile download https://66.gigafile.nu/0320-b36ec21d4a56b143537e12df7388a5367 - - $ gfile -h - usage: Gfile [-h] [-p] [-n THREAD_NUM] [-s CHUNK_SIZE] [-m CHUNK_COPY_SIZE] {download,upload} uri - - positional arguments: - {download,upload} Upload or download - uri Filename to upload or url to download - - optional arguments: - -h, --help show this help message and exit - -p, --hide-progress Hide progress bar - -n THREAD_NUM, --thread-num THREAD_NUM - Number of threads used for upload (can incease speed) - -s CHUNK_SIZE, --chunk-size CHUNK_SIZE - allowed chunk size per upload - -m CHUNK_COPY_SIZE, --copy-size CHUNK_COPY_SIZE - Specifies size to copy the main file into pieces (the size loaded in RAM) - +```bash +$ gfile upload path/to/file + +$ gfile download https://66.gigafile.nu/0320-b36ec21d4a56b143537e12df7388a5367 + +$ gfile -h +usage: Gfile [-h] [-p] [-o OUTPUT] [-n THREAD_NUM] [-s CHUNK_SIZE] [-m CHUNK_COPY_SIZE] {download,upload} uri + +positional arguments: + {download,upload} upload or download + uri filename to upload or url to download + +options: + -h, --help show this help message and exit + -p, --hide-progress hide progress bar + -o OUTPUT, --output OUTPUT + output filename for download + -n THREAD_NUM, --thread-num THREAD_NUM + number of threads used for upload [default: 8] + -s CHUNK_SIZE, --chunk-size CHUNK_SIZE + chunk size per upload in bytes; note: chunk_size*thread will be loaded into memory [default: 100MB] + -m CHUNK_COPY_SIZE, --copy-size CHUNK_COPY_SIZE + specifies size to copy the main file into pieces [default: 1MB] +``` \ No newline at end of file From 1e3a211ccc6277a7a546a2e2f62e43bd5713506b Mon Sep 17 00:00:00 2001 From: fireattack Date: Tue, 14 Feb 2023 16:12:17 +0800 Subject: [PATCH 11/17] misspelling --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1eb813a..34a3aa1 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ name='gfile', version='3.0', description='A python module to download and upload from gigafile.nu', - author='Sraqzit, firattack', + author='Sraqzit, fireattack', author_email='kingofmestry@gmail.com, (just use GitHub)', install_requires=['requests>=2.25.1', 'requests_toolbelt>=0.9.1', 'tqdm>=4.61.2'], requires=[], From d45fb674bc48102c3ad8bbfab2e4b3c7f326d93c Mon Sep 17 00:00:00 2001 From: fireattack Date: Fri, 24 Feb 2023 16:17:33 +0800 Subject: [PATCH 12/17] Better progress bar --- gfile/gfile.py | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/gfile/gfile.py b/gfile/gfile.py index 0c6bc17..4d9bb70 100644 --- a/gfile/gfile.py +++ b/gfile/gfile.py @@ -85,9 +85,8 @@ def __init__(self, uri, progress=False, thread_num=4, chunk_size=1024*1024*10, c self.thread_num=thread_num self.progress = progress self.data = None - self.pbar: tqdm = None + self.pbar = None self.session = requests_retry_session() - self.index = 0 self.cookies = None self.current_chunk = 0 @@ -106,6 +105,8 @@ def upload_chunk(self, chunk_no, chunks): # prev = current # memo('Before load') + + bar = self.pbar[chunk_no % self.thread_num] if self.pbar else None with io.BytesIO() as f: split_file(self.uri, f, self.chunk_size, start=chunk_no * self.chunk_size, chunk_copy_size=self.chunk_copy_size) chunk_size = f.tell() @@ -127,13 +128,21 @@ def upload_chunk(self, chunk_no, chunks): del form_data size = len(form_data_binary) + if bar: + bar.desc = f'chunk {chunk_no + 1}/{chunks}' + bar.reset(total=size) + # bar.refresh() def gen(): offset = 0 while True: if offset < size: - yield form_data_binary[offset:offset+1024] - offset += 1024 + update_tick = 1024 * 128 + yield form_data_binary[offset:offset+update_tick] + if bar: + bar.update(min(update_tick, size - offset)) + bar.refresh() + offset += update_tick else: if chunk_no != self.current_chunk: time.sleep(0.01) @@ -147,9 +156,6 @@ def gen(): # print(f'Updating chunk {chunk_no + 1} out of {chunks} chunks') resp = self.session.post(f"https://{self.server}/upload_chunk.php", data=streamer, headers=headers) self.current_chunk += 1 - if self.pbar: - self.pbar.desc = f'Finished {chunk_no + 1}/{chunks} chunks' - self.pbar.update(chunk_size) # print("Session gfsid after uploading:", self.session.cookies['gfsid']) # print('resp', resp.cookies.__dict__) resp_data = resp.json() @@ -165,24 +171,24 @@ def upload(self): self.token = uuid.uuid1().hex self.pbar = None self.failed = False - self.index = 0 assert Path(self.uri).exists() size = Path(self.uri).stat().st_size chunks = ceil(size / self.chunk_size) print(f'Filesize {bytes_to_size_str(size)}, chunk size: {bytes_to_size_str(self.chunk_size)}, total chunks: {chunks}') if self.progress: - self.pbar = tqdm(total=size, unit="B", unit_scale=True, leave=False, unit_divisor=1024, ncols=100) - + self.pbar = [] + for i in range(self.thread_num): + self.pbar.append(tqdm(total=size, unit="B", unit_scale=True, leave=False, unit_divisor=1024, ncols=100, position=i)) self.session = requests_retry_session() self.server = re.search(r'var server = "(.+?)"', self.session.get('https://gigafile.nu/').text)[1] - # upload the first chunk + # upload the first chunk to set cookies properly. self.upload_chunk(0, chunks) # upload second to second last chunk(s) with concurrent.futures.ThreadPoolExecutor(max_workers=self.thread_num) as ex: - futures = {ex.submit(self.upload_chunk, i, chunks): i for i in range(1, chunks - 1)} + futures = {ex.submit(self.upload_chunk, i, chunks): i for i in range(1, chunks)} try: for future in concurrent.futures.as_completed(futures): if self.failed: @@ -197,11 +203,14 @@ def upload(self): return # upload last chunk if not already - if chunks > 1: - # print('\nupload the last chunk in single thread') - self.upload_chunk(chunks - 1, chunks) + # if chunks > 1: + # # print('\nupload the last chunk in single thread') + # self.upload_chunk(chunks - 1, chunks) - if self.pbar: self.pbar.close() + if self.pbar: + for bar in self.pbar: + bar.close() + print('') if 'url' not in self.data: print('Something went wrong. Upload failed.', self.data) return self # for chain @@ -228,14 +237,15 @@ def download(self, filename=None): r.raise_for_status() filesize = int(r.headers['Content-Length']) if not filename: + filename = 'gigafile_noname.bin' # temp name content_disp = r.headers['Content-Disposition'] if "UTF-8''" in content_disp: filename = unquote(content_disp.split("UTF-8''")[-1]) else: filename = re.search(r'filename="(.+?)";', content_disp)[1].encode('iso8859-1','ignore').decode('utf-8', 'ignore') - filename = re.sub(r'[\\/:*?"<>|]', '_', filename) # only sanitize remote filename. User provided are on users' own. + filename = re.sub(r'[\\/:*?"<>|]', '_', filename) # only sanitize remote filename. User provided ones are on users' own. if self.progress: - self.pbar = tqdm(total=filesize, unit='B', unit_scale=True, unit_divisor=1024, desc=filename, ncols=100) + self.pbar = tqdm(total=filesize, unit='B', unit_scale=True, unit_divisor=1024, desc=filename) with open(filename, 'wb') as f: for chunk in r.iter_content(chunk_size=self.chunk_copy_size): f.write(chunk) From 5e9232ce2df54a0a729117ec9efb340d7226589d Mon Sep 17 00:00:00 2001 From: fireattack Date: Fri, 24 Feb 2023 17:21:43 +0800 Subject: [PATCH 13/17] Add timeout --- .gitignore | 1 + gfile/cmd.py | 1 + gfile/gfile.py | 49 ++++++++++++++++++++----------------------------- 3 files changed, 22 insertions(+), 29 deletions(-) diff --git a/.gitignore b/.gitignore index a0e97ea..ac4fb5a 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ __pycache__/ .vscode/ test* +note.txt # C extensions *.so diff --git a/gfile/cmd.py b/gfile/cmd.py index d882465..9471087 100644 --- a/gfile/cmd.py +++ b/gfile/cmd.py @@ -20,6 +20,7 @@ def main(): parser.add_argument('-n', '--thread-num', dest='thread_num', default=8, type=int, help='number of threads used for upload [default: 8]') parser.add_argument('-s', '--chunk-size', dest='chunk_size', default="100MB", help='chunk size per upload in bytes; note: chunk_size*thread will be loaded into memory [default: 100MB]') parser.add_argument('-m', '--copy-size', dest='chunk_copy_size', default="1MB", help='specifies size to copy the main file into pieces [default: 1MB]') + parser.add_argument('-t', '--timeout', type=int, default=10, help='specifies timeout time (in seconds) [default: 10]') args = parser.parse_args() diff --git a/gfile/gfile.py b/gfile/gfile.py index 4d9bb70..b2d9bf8 100644 --- a/gfile/gfile.py +++ b/gfile/gfile.py @@ -1,4 +1,5 @@ import concurrent.futures +import functools import io import math import re @@ -9,8 +10,11 @@ from pathlib import Path from urllib.parse import unquote +import requests +from requests.adapters import HTTPAdapter from requests_toolbelt import MultipartEncoder, StreamingIterator from tqdm import tqdm +from urllib3.util.retry import Retry def bytes_to_size_str(bytes): @@ -38,10 +42,6 @@ def requests_retry_session( status_forcelist=None, # (500, 502, 504) session=None, ): - import requests - from requests.adapters import HTTPAdapter - from urllib3.util.retry import Retry - session = session or requests.Session() retry = Retry( total=retries, @@ -77,8 +77,9 @@ def split_file(input_file, out, target_size=None, start=0, chunk_copy_size=1024* size += len(chunk) out.write(chunk) + class GFile: - def __init__(self, uri, progress=False, thread_num=4, chunk_size=1024*1024*10, chunk_copy_size=1024*1024, **kwargs) -> None: + def __init__(self, uri, progress=False, thread_num=4, chunk_size=1024*1024*10, chunk_copy_size=1024*1024, timeout=10, **kwargs) -> None: self.uri = uri self.chunk_size = size_str_to_bytes(chunk_size) self.chunk_copy_size = size_str_to_bytes(chunk_copy_size) @@ -86,26 +87,14 @@ def __init__(self, uri, progress=False, thread_num=4, chunk_size=1024*1024*10, c self.progress = progress self.data = None self.pbar = None + self.timeout = timeout self.session = requests_retry_session() + self.session.request = functools.partial(self.session.request, timeout=self.timeout) self.cookies = None self.current_chunk = 0 def upload_chunk(self, chunk_no, chunks): - # import tracemalloc - - # tracemalloc.start() - # prev = tracemalloc.get_traced_memory()[0] - - # def memo(text=''): - # nonlocal prev - - # current = tracemalloc.get_traced_memory()[0] - # print(f'Memory change at {text}', current - prev) - # prev = current - - # memo('Before load') - bar = self.pbar[chunk_no % self.thread_num] if self.pbar else None with io.BytesIO() as f: split_file(self.uri, f, self.chunk_size, start=chunk_no * self.chunk_size, chunk_copy_size=self.chunk_copy_size) @@ -149,17 +138,19 @@ def gen(): else: time.sleep(0.1) break + while True: + try: + streamer = StreamingIterator(size, gen()) + resp = self.session.post(f"https://{self.server}/upload_chunk.php", data=streamer, headers=headers) + except Exception as ex: + print(ex) + print('Retrying...') + else: + break - streamer = StreamingIterator(size, gen()) - - # print("Session gfsid:", self.session.cookies['gfsid']) - # print(f'Updating chunk {chunk_no + 1} out of {chunks} chunks') - resp = self.session.post(f"https://{self.server}/upload_chunk.php", data=streamer, headers=headers) - self.current_chunk += 1 - # print("Session gfsid after uploading:", self.session.cookies['gfsid']) - # print('resp', resp.cookies.__dict__) resp_data = resp.json() - # print(resp_data) + self.current_chunk += 1 + if 'url' in resp_data: self.data = resp_data if 'status' not in resp_data or resp_data['status']: @@ -180,7 +171,7 @@ def upload(self): self.pbar = [] for i in range(self.thread_num): self.pbar.append(tqdm(total=size, unit="B", unit_scale=True, leave=False, unit_divisor=1024, ncols=100, position=i)) - self.session = requests_retry_session() + self.server = re.search(r'var server = "(.+?)"', self.session.get('https://gigafile.nu/').text)[1] # upload the first chunk to set cookies properly. From 0493295b73d5689c9e403e847136dd8303efd766 Mon Sep 17 00:00:00 2001 From: fireattack Date: Sun, 26 Feb 2023 22:07:50 +0800 Subject: [PATCH 14/17] make desc shorter for download --- gfile/gfile.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gfile/gfile.py b/gfile/gfile.py index b2d9bf8..96095eb 100644 --- a/gfile/gfile.py +++ b/gfile/gfile.py @@ -236,7 +236,8 @@ def download(self, filename=None): filename = re.search(r'filename="(.+?)";', content_disp)[1].encode('iso8859-1','ignore').decode('utf-8', 'ignore') filename = re.sub(r'[\\/:*?"<>|]', '_', filename) # only sanitize remote filename. User provided ones are on users' own. if self.progress: - self.pbar = tqdm(total=filesize, unit='B', unit_scale=True, unit_divisor=1024, desc=filename) + desc = filename if len(filename) <= 20 else filename[0:11] + '..' + filename[-7:] + self.pbar = tqdm(total=filesize, unit='B', unit_scale=True, unit_divisor=1024, desc=desc) with open(filename, 'wb') as f: for chunk in r.iter_content(chunk_size=self.chunk_copy_size): f.write(chunk) From 94bbd6cb94f00ba36c40e57b70196bfa28105142 Mon Sep 17 00:00:00 2001 From: fireattack Date: Sun, 26 Feb 2023 22:09:08 +0800 Subject: [PATCH 15/17] Update version --- gfile/__init__.py | 5 ++--- setup.py | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/gfile/__init__.py b/gfile/__init__.py index ad8b44e..c17fd3f 100644 --- a/gfile/__init__.py +++ b/gfile/__init__.py @@ -1,5 +1,4 @@ from .gfile import GFile -__author__ = """SraqZit""" -__email__ = 'sraqzit@gmail.com' -__version__ = '2.1' \ No newline at end of file +__author__ = """Sraqzit, fireattack""" +__version__ = '3.1' \ No newline at end of file diff --git a/setup.py b/setup.py index 34a3aa1..b5984c7 100644 --- a/setup.py +++ b/setup.py @@ -2,10 +2,9 @@ setup( name='gfile', - version='3.0', + version='3.1', description='A python module to download and upload from gigafile.nu', author='Sraqzit, fireattack', - author_email='kingofmestry@gmail.com, (just use GitHub)', install_requires=['requests>=2.25.1', 'requests_toolbelt>=0.9.1', 'tqdm>=4.61.2'], requires=[], packages=['gfile'], From 68f7b6286745520409c5e472578ded0b6c9ab916 Mon Sep 17 00:00:00 2001 From: fireattack Date: Sat, 11 Mar 2023 10:16:40 +0800 Subject: [PATCH 16/17] Update README --- README.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/README.md b/README.md index 2e19b89..01adcb2 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,5 @@ A python module to download and upload from [gigafile](https://gigafile.nu/). -A major update from [the original](https://github.com/Sraq-Zit/gfile). Highlights: - -* Fixed multi-thread uploading (and made sure each threads finish in order so the final file is not broken) -* Fixed download filename issue -* Some refactoring and QoL changes. - # Install $ python setup.py install --user or From 6d6784198aa1806cebf7931e3e6b093276f75978 Mon Sep 17 00:00:00 2001 From: fireattack Date: Sat, 11 Mar 2023 10:32:36 +0800 Subject: [PATCH 17/17] Revert git URL to upstream --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 01adcb2..1c5af94 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ A python module to download and upload from [gigafile](https://gigafile.nu/). $ python setup.py install --user or - $ pip install git+https://github.com/fireattack/gfile.git + $ pip install git+https://github.com/Sraq-Zit/gfile.git -U # Usage ## Module