diff --git a/meta_extractor/classifier/classifiers.py b/meta_extractor/classifier/classifiers.py index ef8242b8..8fe5dbc0 100644 --- a/meta_extractor/classifier/classifiers.py +++ b/meta_extractor/classifier/classifiers.py @@ -1,32 +1,34 @@ -from .file_utils import (mime_type, has_csharp_description, pe_mime_types, flash_mime_types, apk_mime_types, - java_mime_types, zip_and_jar_shared_mime_types, has_java_structure, maybe_zip_mime_types, - has_zip_structure) +from .file_utils import (mime_type, has_csharp_description, pe_mime_types, flash_mime_types, java_mime_types, + has_java_structure, zip_mime_types, has_zip_structure) -def pe_classifier(data) -> bool: - return mime_type(data) in pe_mime_types and has_csharp_description(data) +class CSharpClassifier: + file_type = 'pe' + def match(self, data: bytes) -> bool: + return mime_type(data) in pe_mime_types and has_csharp_description(data) -def flash_classifier(data) -> bool: - return mime_type(data) in flash_mime_types +class FlashClassifier: + file_type = 'flash' -def apk_classifier(data) -> bool: - return mime_type(data) in apk_mime_types + def match(self, data: bytes) -> bool: + return mime_type(data) in flash_mime_types -def java_classifier(data) -> bool: - return mime_type(data) in java_mime_types or (mime_type(data) in zip_and_jar_shared_mime_types - and has_java_structure(data)) +class JavaClassifier: + file_type = 'java' + def match(self, data: bytes) -> bool: + return mime_type(data) in java_mime_types or (mime_type(data) in zip_mime_types and has_java_structure(data)) -def zip_classifier(data) -> bool: - return ((mime_type(data) in maybe_zip_mime_types and has_zip_structure(data)) - or (mime_type(data) in zip_and_jar_shared_mime_types and not java_classifier(data))) +class ZipClassifier: + file_type = 'zip' -CLASSIFIERS = {"pe": pe_classifier, - "flash": flash_classifier, - "apk": apk_classifier, - "java": java_classifier, - "zip": zip_classifier} + def match(self, data: bytes) -> bool: + return mime_type(data) in zip_mime_types and has_zip_structure(data) and not has_java_structure(data) + + +# Order is important! For instance, if you put zip classifier first, java files will be detected as zips. +CLASSIFIERS = [CSharpClassifier(), FlashClassifier(), JavaClassifier(), ZipClassifier()] diff --git a/meta_extractor/classifier/classify.py b/meta_extractor/classifier/classify.py index 0e422be0..d36c6c04 100644 --- a/meta_extractor/classifier/classify.py +++ b/meta_extractor/classifier/classify.py @@ -3,11 +3,11 @@ from .classifiers import CLASSIFIERS -def get_identifier_of_file(sample: bytes) -> str: +def get_identifier_of_file(sample_content: bytes) -> str: sample_identifier = None - for identifier, classifier_function in CLASSIFIERS.items(): - if classifier_function(sample): - sample_identifier = identifier + for classifier in CLASSIFIERS: + if classifier.match(sample_content): + sample_identifier = classifier.file_type break logging.info(f'File type detected: {sample_identifier}') return sample_identifier diff --git a/meta_extractor/classifier/file_utils.py b/meta_extractor/classifier/file_utils.py index 486c02e5..c24d1e7f 100644 --- a/meta_extractor/classifier/file_utils.py +++ b/meta_extractor/classifier/file_utils.py @@ -14,14 +14,9 @@ 'application/vnd.adobe.flash.movie'] -apk_mime_types = ['application/vnd.android.package-archive'] - - -zip_and_jar_shared_mime_types = ['application/zip', - 'application/x-zip-compressed', - 'multipart/x-zip'] - -maybe_zip_mime_types = ['application/octet-stream'] +zip_mime_types = ['application/zip', + 'application/x-zip-compressed', + 'multipart/x-zip'] java_mime_types = ['application/x-java-archive', diff --git a/meta_extractor/redis/requeue.py b/meta_extractor/redis/queue.py similarity index 52% rename from meta_extractor/redis/requeue.py rename to meta_extractor/redis/queue.py index 8a798ebb..183ffe09 100644 --- a/meta_extractor/redis/requeue.py +++ b/meta_extractor/redis/queue.py @@ -1,11 +1,12 @@ import pickle pickle.HIGHEST_PROTOCOL = 4 from rq import Queue +from rq.queue import Job from redis import Redis import logging -class TaskRequeuer: +class TaskQueue: def __init__(self): self.connection = Redis(host='daas_redis_task_queue_1') # Where to look for decompilers' code @@ -15,6 +16,16 @@ def __init__(self): connection=self.connection, default_timeout=1200) + def _enqueue(self, task_arguments: dict) -> Job: + return self.queue.enqueue(self.worker_path, args=(task_arguments,)) + def requeue(self, task_arguments: dict) -> None: - task = self.queue.enqueue(self.worker_path, args=(task_arguments,)) + task = self._enqueue(task_arguments) logging.info(f'Service is unavailable for the given url. Task requeued. {task_arguments["external_url"]=}, {task.id=}') + + def add_subfile_to_queue(self, old_task_arguments: dict, seaweedfs_file_id: str) -> None: + new_task_parameters = old_task_arguments.copy() + new_task_parameters['external_url'] = None + new_task_parameters['seaweedfs_file_id'] = seaweedfs_file_id + task = self._enqueue(new_task_parameters) + logging.info(f'Subfile of a zip file added to queue as new file with {task.id=}') diff --git a/meta_extractor/redis/task.py b/meta_extractor/redis/task.py index 8f6cd799..7bf6779d 100644 --- a/meta_extractor/redis/task.py +++ b/meta_extractor/redis/task.py @@ -4,7 +4,7 @@ import logging from ..seaweed import seaweedfs -from .requeue import TaskRequeuer +from .queue import TaskQueue from ..sample import Sample @@ -34,7 +34,7 @@ def _get_sample_content(self, file_name: str) -> Tuple[Optional[bytes], str]: if content := self._get_sample_content_from_external_url(self.external_url): # Sample downloaded self.seaweedfs_file_id = seaweedfs.upload_file(stream=content, name=file_name) else: # Sample not accessible at the moment through the given URL - TaskRequeuer().requeue(self.settings) + TaskQueue().requeue(self.settings) raise Exception('Sample not downloadable. Task requeued with low priority.') else: logging.error('Tasks should have one the following parameters: seaweedfs_file_id or external_url.\n' + @@ -54,3 +54,8 @@ def _get_sample_content_from_external_url(self, download_url: str) -> Optional[b @property def sample_found(self) -> bool: return self.sample is not None + + def split_into_subtasks_per_subfile(self) -> None: + for subfile in self.sample.subfiles: + subfile_seaweedfs_file_id = seaweedfs.upload_file(stream=subfile.content, name=subfile.file_name) + TaskQueue().add_subfile_to_queue(self.settings, subfile_seaweedfs_file_id) diff --git a/meta_extractor/sample.py b/meta_extractor/sample.py index 12d33d64..7556cb5e 100644 --- a/meta_extractor/sample.py +++ b/meta_extractor/sample.py @@ -4,6 +4,7 @@ from .seaweed import seaweedfs from .classifier.classify import get_identifier_of_file from .classifier.file_utils import get_in_memory_zip_of +from .redis.queue import TaskQueue class Sample: @@ -16,7 +17,7 @@ def __init__(self, file_name: str, content: bytes, password: bytes, self.uploaded_on = uploaded_on self.file_type = get_identifier_of_file(self.content) self.subfiles = [] - if self.file_type == 'zip': + if self.is_zip: self._load_subfiles() def _load_subfiles(self) -> None: @@ -62,6 +63,10 @@ def metadata(self) -> dict: 'file_name': self.file_name, 'subfiles': [subfile.metadata for subfile in self.subfiles]} + @property + def is_zip(self): + return self.file_type == 'zip' + def delete_from_seaweedfs(self) -> None: if self.seaweedfs_file_id: seaweedfs.delete_file(self.seaweedfs_file_id) diff --git a/meta_extractor/worker.py b/meta_extractor/worker.py index 6daf06e3..2e2500f3 100644 --- a/meta_extractor/worker.py +++ b/meta_extractor/worker.py @@ -14,7 +14,10 @@ def worker(task_settings: Dict[str, Any]) -> None: response = {'force_reprocess': task.force_reprocess, 'callback': task.callback} if task.sample_found: - response['sample'] = task.sample.metadata + if not task.sample.is_zip: # Real samples + response['sample'] = task.sample.metadata + else: # Zip files with samples or even more zip files inside + task.split_into_subtasks_per_subfile() # Send the response api_connector.send_result(task.api_url, response)