From 85d92dcaa944664e16087aa496598bcbf95f8017 Mon Sep 17 00:00:00 2001 From: Aleksandr Zhdankin Date: Tue, 21 Nov 2023 07:23:04 -0800 Subject: [PATCH 1/7] Initial commit - initial code for file uploading. --- azhdankin/README.md | 29 ++++++++++++++++++++++++ azhdankin/cloud_storage.py | 34 ++++++++++++++++++++++++++++ azhdankin/upload_files.py | 46 ++++++++++++++++++++++++++++++++++++++ azhdankin/uploader.py | 16 +++++++++++++ 4 files changed, 125 insertions(+) create mode 100644 azhdankin/README.md create mode 100644 azhdankin/cloud_storage.py create mode 100644 azhdankin/upload_files.py create mode 100644 azhdankin/uploader.py diff --git a/azhdankin/README.md b/azhdankin/README.md new file mode 100644 index 00000000..53dae762 --- /dev/null +++ b/azhdankin/README.md @@ -0,0 +1,29 @@ +# Multi-threaded file uploader (Backend) + +> Ideal candidate: skilled python developer with solid knowledge of cloud and distributed systems. + +# Overview + +Create a python application that uploads a set of given files to a cloud object storage in parallel through the cloud provider's or third party API. + +# Requirements + +1. Support up to 100,000nds of files, all inside one directory with arbitrary sizes. The root directory may contain subdirectories. +1. The object storage container which holds the objects is private and only credential-based access is allowed. +1. Each object inside object storage should have an associated metadata which contains file size, last modification time and file permissions. + +# Expectations + +- Fast (utilize full network bandwidth), low CPU (do not block all other processes) and low Memory (<25% tentatively) file uploader +- Support for AWS S3 +- Modular and Object oriented implementation (to add other cloud providers) +- Clean and documented code +- Tests + +# Timeline + +We leave exact timing to the candidate. Must fit Within 5 days total. + +# Notes + +- we can provide temporary credentials to access AWS/Azure. 
diff --git a/azhdankin/cloud_storage.py b/azhdankin/cloud_storage.py new file mode 100644 index 00000000..d4a8ee28 --- /dev/null +++ b/azhdankin/cloud_storage.py @@ -0,0 +1,34 @@ +from google.cloud import storage + +class CloudStorage: + def __init__(self, name): + self.name = name + + def upload_object(self, object_name, source_file_name): + pass + + def __str__(self): + return self.name + +class CloudStorageGCP(CloudStorage): + def __init__(self, bucket_name, project=None): + super().__init__("CloudStorageGCP") + self.project = project + self.client = storage.Client(project=self.project) + buckets = list(self.client.list_buckets()) + bucket_is_found = False + for bucket in buckets: + if bucket.name == bucket_name: + self.bucket = self.client.bucket(bucket_name) + bucket_is_found = True + break + if not bucket_is_found: + self.bucket = self.client.create_bucket(bucket_name) + + def upload_object(self, object_name, source_file_name): + # Create a new blob object + blob = self.bucket.blob(object_name) + # Upload the file to the bucket + blob.upload_from_filename(source_file_name) + + diff --git a/azhdankin/upload_files.py b/azhdankin/upload_files.py new file mode 100644 index 00000000..e570ce22 --- /dev/null +++ b/azhdankin/upload_files.py @@ -0,0 +1,46 @@ +import os +import glob +from concurrent.futures import ThreadPoolExecutor + +from cloud_storage import CloudStorageGCP +from uploader import FileUploader + + +path = "./files/*" + +file_list = [] + +for entry in glob.iglob(path, recursive=True): + if os.path.isfile(entry): + file_list.append(entry) + else: + entry = entry + "/*" + for element in glob.iglob(entry, recursive=True): + if os.path.isfile(element): + file_list.append(element) + +MAX_UPLOAD_WORKERS = 2 + +file_list_len = len(file_list) + +step = int(file_list_len/MAX_UPLOAD_WORKERS) +remainder = file_list_len%MAX_UPLOAD_WORKERS + +storage = CloudStorageGCP("azhdanki-test-bucket1", project='rewotes') + +pool = ThreadPoolExecutor 
(max_workers=MAX_UPLOAD_WORKERS) + +i=0 + +while i < (file_list_len - remainder): + uploader = FileUploader (storage, file_list, i, step) + pool.submit (uploader.run()) + i += step + +if remainder > 0: + uploader = FileUploader (storage, file_list, i, remainder) + pool.submit (uploader.run()) + +pool.shutdown (wait=True) + + diff --git a/azhdankin/uploader.py b/azhdankin/uploader.py new file mode 100644 index 00000000..17771a85 --- /dev/null +++ b/azhdankin/uploader.py @@ -0,0 +1,16 @@ +import os +from cloud_storage import CloudStorage + +class FileUploader: + def __init__(self, storage, files, start_idx, count): + self.storage = storage + self.files = files + self.start = start_idx + self.count = count + + def run (self): + for i in range(self.start, self.start + self.count): + object_name = os.path.split(self.files[i])[1] + self.storage.upload_object(object_name, self.files[i]) + + From fd1c644bf10a77fe3fd520163072a233e7da32d1 Mon Sep 17 00:00:00 2001 From: Aleksandr Zhdankin Date: Tue, 21 Nov 2023 10:35:37 -0800 Subject: [PATCH 2/7] Added file gen utility plus some optimization to the code creating the bucket. 
--- azhdankin/cloud_storage.py | 13 ++++--------- azhdankin/create_files.py | 27 +++++++++++++++++++++++++++ azhdankin/seed-file.txt | 3 +++ azhdankin/upload_files.py | 13 ++++++++++++- azhdankin/uploader.py | 2 +- 5 files changed, 47 insertions(+), 11 deletions(-) create mode 100644 azhdankin/create_files.py create mode 100644 azhdankin/seed-file.txt diff --git a/azhdankin/cloud_storage.py b/azhdankin/cloud_storage.py index d4a8ee28..c71142b8 100644 --- a/azhdankin/cloud_storage.py +++ b/azhdankin/cloud_storage.py @@ -14,16 +14,11 @@ class CloudStorageGCP(CloudStorage): def __init__(self, bucket_name, project=None): super().__init__("CloudStorageGCP") self.project = project + self.bucket_name = bucket_name self.client = storage.Client(project=self.project) - buckets = list(self.client.list_buckets()) - bucket_is_found = False - for bucket in buckets: - if bucket.name == bucket_name: - self.bucket = self.client.bucket(bucket_name) - bucket_is_found = True - break - if not bucket_is_found: - self.bucket = self.client.create_bucket(bucket_name) + self.bucket = self.client.bucket(self.bucket_name) + if not self.bucket.exists(): + self.bucket = self.client.create_bucket(self.bucket_name) def upload_object(self, object_name, source_file_name): # Create a new blob object diff --git a/azhdankin/create_files.py b/azhdankin/create_files.py new file mode 100644 index 00000000..dafed028 --- /dev/null +++ b/azhdankin/create_files.py @@ -0,0 +1,27 @@ +import sys +import os +import random + +path = "./files/" +name_prefix = "file-2-upload" + +num_files = 10 + +if len(sys.argv) > 1: + num_files = int(sys.argv[1]) + +seed_file = "./seed-file.txt" + +file=open(seed_file,"r") +seed_content = file.read() +target_file_content = "" + +for target_file_idx in range (0, num_files): + repeat = random.randint(1,100) + for chunk_num in range (0, repeat): + target_file_content = target_file_content + seed_content + target_file = open (path + name_prefix + str(target_file_idx) + ".txt", 'w') + 
target_file.write (target_file_content) + target_file_content = "" + + diff --git a/azhdankin/seed-file.txt b/azhdankin/seed-file.txt new file mode 100644 index 00000000..6c1513ee --- /dev/null +++ b/azhdankin/seed-file.txt @@ -0,0 +1,3 @@ +dsgsddhjdnkdhfkdfjhekfndsmcndkfhed +dhfdhfgefgejfhefhekfekfhekfekf +jhdfhejfgejfgejgfehgfehgfehgfhegf diff --git a/azhdankin/upload_files.py b/azhdankin/upload_files.py index e570ce22..f5626e58 100644 --- a/azhdankin/upload_files.py +++ b/azhdankin/upload_files.py @@ -1,5 +1,7 @@ import os import glob +import time + from concurrent.futures import ThreadPoolExecutor from cloud_storage import CloudStorageGCP @@ -19,7 +21,7 @@ if os.path.isfile(element): file_list.append(element) -MAX_UPLOAD_WORKERS = 2 +MAX_UPLOAD_WORKERS = 10 file_list_len = len(file_list) @@ -32,15 +34,24 @@ i=0 +time_start = time.time() + while i < (file_list_len - remainder): + #storage = CloudStorageGCP("azhdanki-test-bucket1", project='rewotes') uploader = FileUploader (storage, file_list, i, step) pool.submit (uploader.run()) i += step if remainder > 0: + #storage = CloudStorageGCP("azhdanki-test-bucket1", project='rewotes') uploader = FileUploader (storage, file_list, i, remainder) pool.submit (uploader.run()) pool.shutdown (wait=True) +time_end = time.time() +time_delta = time_end - time_start +print (time_delta) + + diff --git a/azhdankin/uploader.py b/azhdankin/uploader.py index 17771a85..d536782a 100644 --- a/azhdankin/uploader.py +++ b/azhdankin/uploader.py @@ -1,5 +1,5 @@ import os -from cloud_storage import CloudStorage +from cloud_storage import CloudStorageGCP class FileUploader: def __init__(self, storage, files, start_idx, count): From 5b8fc3a00ca0d7d6c762788427e19a66ad5e4c36 Mon Sep 17 00:00:00 2001 From: Aleksandr Zhdankin Date: Tue, 21 Nov 2023 10:59:28 -0800 Subject: [PATCH 3/7] Duh... 
--- azhdankin/upload_files.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/azhdankin/upload_files.py b/azhdankin/upload_files.py index f5626e58..43219c09 100644 --- a/azhdankin/upload_files.py +++ b/azhdankin/upload_files.py @@ -21,7 +21,7 @@ if os.path.isfile(element): file_list.append(element) -MAX_UPLOAD_WORKERS = 10 +MAX_UPLOAD_WORKERS = 50 file_list_len = len(file_list) @@ -37,15 +37,16 @@ time_start = time.time() while i < (file_list_len - remainder): + print(i) #storage = CloudStorageGCP("azhdanki-test-bucket1", project='rewotes') uploader = FileUploader (storage, file_list, i, step) - pool.submit (uploader.run()) + pool.submit (uploader.run) i += step if remainder > 0: #storage = CloudStorageGCP("azhdanki-test-bucket1", project='rewotes') uploader = FileUploader (storage, file_list, i, remainder) - pool.submit (uploader.run()) + pool.submit (uploader.run) pool.shutdown (wait=True) From 85e6a7bf7111e013c37dbe1b4f2292e6f82bf89d Mon Sep 17 00:00:00 2001 From: Aleksandr Zhdankin Date: Tue, 21 Nov 2023 18:02:14 -0800 Subject: [PATCH 4/7] Added documentation. --- azhdankin/cloud_storage.py | 12 ++++++++++++ azhdankin/create_files.py | 28 ++++++++++++++++++++++++++++ azhdankin/upload_files.py | 28 +++++++++++++++++++++------- azhdankin/uploader.py | 20 +++++++++++++++++++- 4 files changed, 80 insertions(+), 8 deletions(-) mode change 100644 => 100755 azhdankin/create_files.py mode change 100644 => 100755 azhdankin/upload_files.py diff --git a/azhdankin/cloud_storage.py b/azhdankin/cloud_storage.py index c71142b8..807a301b 100644 --- a/azhdankin/cloud_storage.py +++ b/azhdankin/cloud_storage.py @@ -1,5 +1,12 @@ +""" +This module contains the definitions for the CloudStorage base class which acts like an "abstract class" or +the equivalent of the "interface" for the Cloud storage. +It also has an implementation of the Cloud storage for GCP, which allows uploading the files. 
+""" + from google.cloud import storage +#Base CloudStorage class class CloudStorage: def __init__(self, name): self.name = name @@ -10,13 +17,18 @@ def upload_object(self, object_name, source_file_name): def __str__(self): return self.name +#Cloud storage implementation for GCP class CloudStorageGCP(CloudStorage): def __init__(self, bucket_name, project=None): super().__init__("CloudStorageGCP") self.project = project self.bucket_name = bucket_name self.client = storage.Client(project=self.project) + + #Resolve the reference to the destination bucket self.bucket = self.client.bucket(self.bucket_name) + + #If target bucket does not exist it will be created if not self.bucket.exists(): self.bucket = self.client.create_bucket(self.bucket_name) diff --git a/azhdankin/create_files.py b/azhdankin/create_files.py old mode 100644 new mode 100755 index dafed028..674b6582 --- a/azhdankin/create_files.py +++ b/azhdankin/create_files.py @@ -1,22 +1,50 @@ +#!python +""" File creation utility. + +This is a utility to create the directory and sample files which +will be used for transfer to a cloud storage. + +The utility takes one command line parameter: number of files to create. +If the parameter is not given by default it will create 10 files. + +The file creation is performed by copying the content of the ./seed-file.txt +content n-times (where n is a randomly generated number in a range from 1 to 100) +into a destination file and naming the destination file by appending sequentially +incremented number to the base file name. + +""" + + import sys import os import random +#Set the root of the files location and the name prefix for the files to be generated path = "./files/" name_prefix = "file-2-upload" +#Set the default number of files to be generated num_files = 10 +#Read the number of files to be generated from the cmd line if provided if len(sys.argv) > 1: num_files = int(sys.argv[1]) +#Specify the "seed" for the generated files' content. 
seed_file = "./seed-file.txt" +#Create the destination directory if it does not exist +if not os.path.exists(path): + os.makedirs(path) + +#Populate the seed string for the files to be created and initialize the content file=open(seed_file,"r") seed_content = file.read() target_file_content = "" +#Create the files for upload for target_file_idx in range (0, num_files): + #Replicate the seed content a random number of times repeat = random.randint(1,100) for chunk_num in range (0, repeat): target_file_content = target_file_content + seed_content diff --git a/azhdankin/upload_files.py b/azhdankin/upload_files.py old mode 100644 new mode 100755 index 43219c09..74492586 --- a/azhdankin/upload_files.py +++ b/azhdankin/upload_files.py @@ -1,3 +1,11 @@ +#!python +""" Main module to run to upload the files to the Cloud storage. + +This program establishes the connection to the Cloud Storage (GCP in this case), +reads the names of the files available for upload +and performs parallel upload of the files to the specified Cloud Storage bucket. + +""" import os import glob import time @@ -7,11 +15,14 @@ from cloud_storage import CloudStorageGCP from uploader import FileUploader - +#Path to the root directory where the files to be uploaded are located path = "./files/*" +#Initialize the file names list file_list = [] +#Populate the list of the file names set for upload. 
Currently we support two levels of the directories +#where files are located for entry in glob.iglob(path, recursive=True): if os.path.isfile(entry): file_list.append(entry) @@ -21,30 +32,33 @@ if os.path.isfile(element): file_list.append(element) -MAX_UPLOAD_WORKERS = 50 +#Specify the maximum number of the workers that perform files upload simultaneously +MAX_UPLOAD_WORKERS = 100 -file_list_len = len(file_list) +#Calculate the partitioning of the file names list - each partition or chunk will be assigned +#to a single upload worker +file_list_len = len(file_list) step = int(file_list_len/MAX_UPLOAD_WORKERS) remainder = file_list_len%MAX_UPLOAD_WORKERS +#Initialize a Cloud Storage Provider storage = CloudStorageGCP("azhdanki-test-bucket1", project='rewotes') +#Create the Thread Pool which will be used to run the uploader tasks pool = ThreadPoolExecutor (max_workers=MAX_UPLOAD_WORKERS) +#Schedule the upload tasks i=0 - time_start = time.time() while i < (file_list_len - remainder): print(i) - #storage = CloudStorageGCP("azhdanki-test-bucket1", project='rewotes') uploader = FileUploader (storage, file_list, i, step) pool.submit (uploader.run) i += step if remainder > 0: - #storage = CloudStorageGCP("azhdanki-test-bucket1", project='rewotes') uploader = FileUploader (storage, file_list, i, remainder) pool.submit (uploader.run) @@ -52,7 +66,7 @@ time_end = time.time() time_delta = time_end - time_start -print (time_delta) +print ("It took " + str(time_delta) + " seconds to upload " + str(file_list_len) + " files.") diff --git a/azhdankin/uploader.py b/azhdankin/uploader.py index d536782a..7347fbb0 100644 --- a/azhdankin/uploader.py +++ b/azhdankin/uploader.py @@ -1,13 +1,31 @@ +""" +Class performing upload of the files to the cloud storage. +""" + import os -from cloud_storage import CloudStorageGCP class FileUploader: + """ + Constructor is taking the following parameters: + storage - a reference to the instance of CloudStorage object. 
+ a CloudStorage class is a parent class for the cloud storage + provider specific implementations, i.e. CloudStorageGCP, CloudStorageAWS + + files - a reference to the entire list of the file names that need to be uploaded + + start_idx - an index, a "pointer" to the files list specifying where this file uploader + will start + + count - a number which specifies how many files this instance of uploader has to process (upload) + + """ def __init__(self, storage, files, start_idx, count): self.storage = storage self.files = files self.start = start_idx self.count = count + #The method performing the upload of the group of the files to the Cloud storage def run (self): for i in range(self.start, self.start + self.count): object_name = os.path.split(self.files[i])[1] From eefd6025de02d474cbec1f863ae4c71a1d55fbb1 Mon Sep 17 00:00:00 2001 From: Aleksandr Zhdankin Date: Tue, 21 Nov 2023 18:18:54 -0800 Subject: [PATCH 5/7] Updated the README file. --- azhdankin/README.md | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/azhdankin/README.md b/azhdankin/README.md index 53dae762..07d5c320 100644 --- a/azhdankin/README.md +++ b/azhdankin/README.md @@ -1,29 +1,25 @@ -# Multi-threaded file uploader (Backend) - -> Ideal candidate: skilled python developer with solid knowledge of cloud and distributed systems. +# Multi-threaded file uploader # Overview -Create a python application that uploads a set of given files to a cloud object storage in parallel through the cloud provider's or third party API. +This is a python application that uploads a set of given files to a cloud object storage in parallel through the cloud provider's or third party API. -# Requirements +# Features 1. Support up to 100,000nds of files, all inside one directory with arbitrary sizes. The root directory may contain subdirectories. -1. The object storage container which holds the objects is private and only credential-based access is allowed. -1. 
Each object inside object storage should have an associated metadata which contains file size, last modification time and file permissions. - -# Expectations +2. The object storage container which holds the objects is private and only credential-based access is allowed. +3. Each object inside object storage has an associated metadata which contains file size, last modification time and file permissions. -- Fast (utilize full network bandwidth), low CPU (do not block all other processes) and low Memory (<25% tentatively) file uploader -- Support for AWS S3 -- Modular and Object oriented implementation (to add other cloud providers) -- Clean and documented code -- Tests + The utility is fast (utilizes full network bandwidth), consumes low CPU (low enough not to block all other processes) and low Memory (<25%) + It supports GCP Cloud Storage, however it has a modular and Object oriented implementation so the other cloud providers can be added. -# Timeline +# Prerequisites + You must have Python 3.8 and the Google Cloud Storage Python client installed. -We leave exact timing to the candidate. Must fit Within 5 days total. +# To run + 1. Clone the git repository. Make sure the create_files.py and upload_files.py file permissions are set to "executable". + 2. Run ./create_files.py utility. This utility will create the files that need to be uploaded to the cloud storage. You can set the + number of file sto be created as a cmd line parameter (i.e. ./create_files.py 10000 to create 10000 files). + 3. Run ./upload_files.py to upload the files to the Cloud Storage. -# Notes -- we can provide temporary credentials to access AWS/Azure. From 45569b37f21a189c526838bea5e0b5efd457e37c Mon Sep 17 00:00:00 2001 From: Aleksandr Zhdankin Date: Tue, 21 Nov 2023 18:22:02 -0800 Subject: [PATCH 6/7] Updated the README file. 
--- azhdankin/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azhdankin/README.md b/azhdankin/README.md index 07d5c320..834c3368 100644 --- a/azhdankin/README.md +++ b/azhdankin/README.md @@ -22,4 +22,4 @@ This is a python application that uploads a set of given files to a cloud object number of file sto be created as a cmd line parameter (i.e. ./create_files.py 10000 to create 10000 files). 3. Run ./upload_files.py to upload the files to the Cloud Storage. - +# Notes From 7b1e52b6c2bf193478bfbf3291d1ea1aaa4b1d4d Mon Sep 17 00:00:00 2001 From: Aleksandr Zhdankin Date: Tue, 21 Nov 2023 20:16:34 -0800 Subject: [PATCH 7/7] Added samples of the test code. --- azhdankin/create_files.py | 52 ++++++++++++++++++---------------- azhdankin/test_create_files.py | 40 ++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 25 deletions(-) create mode 100755 azhdankin/test_create_files.py diff --git a/azhdankin/create_files.py b/azhdankin/create_files.py index 674b6582..64d23e9a 100755 --- a/azhdankin/create_files.py +++ b/azhdankin/create_files.py @@ -13,15 +13,38 @@ incremented number to the base file name. """ - - import sys import os import random +#Performs the file creation +def create_files (path, num_files): + #Specify the "seed" for the generated files' content. 
+ seed_file = "./seed-file.txt" + name_prefix = "file-2-upload" + + #Create the destination directory if it does not exist + if not os.path.exists(path): + os.makedirs(path) + + #Populate the seed string for the files to be created and initialize the content + file=open(seed_file,"r") + seed_content = file.read() + target_file_content = "" + + #Create the files for upload + for target_file_idx in range (0, num_files): + #Replicate the seed content a random number of times + repeat = random.randint(1,100) + for chunk_num in range (0, repeat): + target_file_content = target_file_content + seed_content + target_file = open (path + name_prefix + str(target_file_idx) + ".txt", 'w') + target_file.write (target_file_content) + target_file_content = "" + + #Set the root of the files location and the name prefix for the files to be generated path = "./files/" -name_prefix = "file-2-upload" #Set the default number of files to be generated num_files = 10 @@ -30,26 +53,5 @@ if len(sys.argv) > 1: num_files = int(sys.argv[1]) -#Specify the "seed" for the generated files' content. 
-seed_file = "./seed-file.txt" - -#Create the destination directory if it does not exist -if not os.path.exists(path): - os.makedirs(path) - -#Populate the seed string for the files to be created and initialize the content -file=open(seed_file,"r") -seed_content = file.read() -target_file_content = "" - -#Create the files for upload -for target_file_idx in range (0, num_files): - #Replicate the seed content a random number of times - repeat = random.randint(1,100) - for chunk_num in range (0, repeat): - target_file_content = target_file_content + seed_content - target_file = open (path + name_prefix + str(target_file_idx) + ".txt", 'w') - target_file.write (target_file_content) - target_file_content = "" - +create_files(path, num_files) diff --git a/azhdankin/test_create_files.py b/azhdankin/test_create_files.py new file mode 100755 index 00000000..3b8da6a5 --- /dev/null +++ b/azhdankin/test_create_files.py @@ -0,0 +1,40 @@ +#Test for file creation utility. +import glob +import os +from create_files import create_files + +#Perform test of file creation +def test_create_files(): + + #Set the root of the files location and the name prefix for the files to be generated + path = "./test_files/" + + #Set the default number of files to be generated + num_files = 10 + + create_files(path, num_files) + + #Path to the root directory where the created files are located + path = "./test_files/*" + + #Initialize the file names list + file_list = [] + + #Populate the list of the file names that were generated. Currently we support two levels of the directories + #where files are located + for entry in glob.iglob(path, recursive=True): + if os.path.isfile(entry): + file_list.append(entry) + else: + entry = entry + "/*" + for element in glob.iglob(entry, recursive=True): + if os.path.isfile(element): + file_list.append(element) + + file_list_len = len(file_list) + assert file_list_len == 10 + + + + +