Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 22 additions & 20 deletions meta_extractor/classifier/classifiers.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,34 @@
from .file_utils import (mime_type, has_csharp_description, pe_mime_types, flash_mime_types, apk_mime_types,
java_mime_types, zip_and_jar_shared_mime_types, has_java_structure, maybe_zip_mime_types,
has_zip_structure)
from .file_utils import (mime_type, has_csharp_description, pe_mime_types, flash_mime_types, java_mime_types,
has_java_structure, zip_mime_types, has_zip_structure)


def pe_classifier(data) -> bool:
return mime_type(data) in pe_mime_types and has_csharp_description(data)
class CSharpClassifier:
file_type = 'pe'

def match(self, data: bytes) -> bool:
return mime_type(data) in pe_mime_types and has_csharp_description(data)

def flash_classifier(data) -> bool:
return mime_type(data) in flash_mime_types

class FlashClassifier:
file_type = 'flash'

def apk_classifier(data) -> bool:
return mime_type(data) in apk_mime_types
def match(self, data: bytes) -> bool:
return mime_type(data) in flash_mime_types


def java_classifier(data) -> bool:
return mime_type(data) in java_mime_types or (mime_type(data) in zip_and_jar_shared_mime_types
and has_java_structure(data))
class JavaClassifier:
file_type = 'java'

def match(self, data: bytes) -> bool:
return mime_type(data) in java_mime_types or (mime_type(data) in zip_mime_types and has_java_structure(data))

def zip_classifier(data) -> bool:
return ((mime_type(data) in maybe_zip_mime_types and has_zip_structure(data))
or (mime_type(data) in zip_and_jar_shared_mime_types and not java_classifier(data)))

class ZipClassifier:
file_type = 'zip'

CLASSIFIERS = {"pe": pe_classifier,
"flash": flash_classifier,
"apk": apk_classifier,
"java": java_classifier,
"zip": zip_classifier}
def match(self, data: bytes) -> bool:
return mime_type(data) in zip_mime_types and has_zip_structure(data) and not has_java_structure(data)


# Order is important! For instance, if you put zip classifier first, java files will be detected as zips.
CLASSIFIERS = [CSharpClassifier(), FlashClassifier(), JavaClassifier(), ZipClassifier()]
8 changes: 4 additions & 4 deletions meta_extractor/classifier/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
from .classifiers import CLASSIFIERS


def get_identifier_of_file(sample: bytes) -> str:
def get_identifier_of_file(sample_content: bytes) -> str:
sample_identifier = None
for identifier, classifier_function in CLASSIFIERS.items():
if classifier_function(sample):
sample_identifier = identifier
for classifier in CLASSIFIERS:
if classifier.match(sample_content):
sample_identifier = classifier.file_type
break
logging.info(f'File type detected: {sample_identifier}')
return sample_identifier
11 changes: 3 additions & 8 deletions meta_extractor/classifier/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,9 @@
'application/vnd.adobe.flash.movie']


apk_mime_types = ['application/vnd.android.package-archive']


zip_and_jar_shared_mime_types = ['application/zip',
'application/x-zip-compressed',
'multipart/x-zip']

maybe_zip_mime_types = ['application/octet-stream']
zip_mime_types = ['application/zip',
'application/x-zip-compressed',
'multipart/x-zip']


java_mime_types = ['application/x-java-archive',
Expand Down
15 changes: 13 additions & 2 deletions meta_extractor/redis/requeue.py → meta_extractor/redis/queue.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import pickle
pickle.HIGHEST_PROTOCOL = 4
from rq import Queue
from rq.queue import Job
from redis import Redis
import logging


class TaskRequeuer:
class TaskQueue:
def __init__(self):
self.connection = Redis(host='daas_redis_task_queue_1')
# Where to look for decompilers' code
Expand All @@ -15,6 +16,16 @@ def __init__(self):
connection=self.connection,
default_timeout=1200)

def _enqueue(self, task_arguments: dict) -> Job:
return self.queue.enqueue(self.worker_path, args=(task_arguments,))

def requeue(self, task_arguments: dict) -> None:
task = self.queue.enqueue(self.worker_path, args=(task_arguments,))
task = self._enqueue(task_arguments)
logging.info(f'Service is unavailable for the given url. Task requeued. {task_arguments["external_url"]=}, {task.id=}')

def add_subfile_to_queue(self, old_task_arguments: dict, seaweedfs_file_id: str) -> None:
new_task_parameters = old_task_arguments.copy()
new_task_parameters['external_url'] = None
new_task_parameters['seaweedfs_file_id'] = seaweedfs_file_id
task = self._enqueue(new_task_parameters)
logging.info(f'Subfile of a zip file added to queue as new file with {task.id=}')
9 changes: 7 additions & 2 deletions meta_extractor/redis/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import logging

from ..seaweed import seaweedfs
from .requeue import TaskRequeuer
from .queue import TaskQueue
from ..sample import Sample


Expand Down Expand Up @@ -34,7 +34,7 @@ def _get_sample_content(self, file_name: str) -> Tuple[Optional[bytes], str]:
if content := self._get_sample_content_from_external_url(self.external_url): # Sample downloaded
self.seaweedfs_file_id = seaweedfs.upload_file(stream=content, name=file_name)
else: # Sample not accessible at the moment through the given URL
TaskRequeuer().requeue(self.settings)
TaskQueue().requeue(self.settings)
raise Exception('Sample not downloadable. Task requeued with low priority.')
else:
logging.error('Tasks should have one the following parameters: seaweedfs_file_id or external_url.\n' +
Expand All @@ -54,3 +54,8 @@ def _get_sample_content_from_external_url(self, download_url: str) -> Optional[b
@property
def sample_found(self) -> bool:
return self.sample is not None

def split_into_subtasks_per_subfile(self) -> None:
for subfile in self.sample.subfiles:
subfile_seaweedfs_file_id = seaweedfs.upload_file(stream=subfile.content, name=subfile.file_name)
TaskQueue().add_subfile_to_queue(self.settings, subfile_seaweedfs_file_id)
7 changes: 6 additions & 1 deletion meta_extractor/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .seaweed import seaweedfs
from .classifier.classify import get_identifier_of_file
from .classifier.file_utils import get_in_memory_zip_of
from .redis.queue import TaskQueue


class Sample:
Expand All @@ -16,7 +17,7 @@ def __init__(self, file_name: str, content: bytes, password: bytes,
self.uploaded_on = uploaded_on
self.file_type = get_identifier_of_file(self.content)
self.subfiles = []
if self.file_type == 'zip':
if self.is_zip:
self._load_subfiles()

def _load_subfiles(self) -> None:
Expand Down Expand Up @@ -62,6 +63,10 @@ def metadata(self) -> dict:
'file_name': self.file_name,
'subfiles': [subfile.metadata for subfile in self.subfiles]}

@property
def is_zip(self):
return self.file_type == 'zip'

def delete_from_seaweedfs(self) -> None:
if self.seaweedfs_file_id:
seaweedfs.delete_file(self.seaweedfs_file_id)
5 changes: 4 additions & 1 deletion meta_extractor/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@ def worker(task_settings: Dict[str, Any]) -> None:
response = {'force_reprocess': task.force_reprocess,
'callback': task.callback}
if task.sample_found:
response['sample'] = task.sample.metadata
if not task.sample.is_zip: # Real samples
response['sample'] = task.sample.metadata
else: # Zip files with samples or even more zip files inside
task.split_into_subtasks_per_subfile()

# Send the response
api_connector.send_result(task.api_url, response)
Expand Down