Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion skyflow/utils/_skyflow_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ class Error(Enum):
INVALID_PLAIN_TEXT_ENTITIES_IN_REIDENTIFY= f"{error_prefix} Validation error. The plainTextEntities field must be an array of DetectEntities enums. Specify a valid plainTextEntities."

INVALID_DEIDENTIFY_FILE_REQUEST= f"{error_prefix} Validation error. Invalid deidentify file request. Specify a valid deidentify file request."
INVALID_DEIDENTIFY_FILE_INPUT= f"{error_prefix} Validation error. Invalid deidentify file input. Please provide either a file or a file path."
EMPTY_FILE_OBJECT= f"{error_prefix} Validation error. File object cannot be empty. Specify a valid file object."
INVALID_FILE_FORMAT= f"{error_prefix} Validation error. Invalid file format. Specify a valid file format."
MISSING_FILE_SOURCE= f"{error_prefix} Validation error. Provide exactly one of filePath, base64, or fileObject."
Expand Down Expand Up @@ -197,7 +198,7 @@ class Error(Enum):
INVALID_FILE_OR_ENCODED_FILE= f"{error_prefix} . Error while decoding base64 and saving file"
INVALID_FILE_TYPE = f"{error_prefix} Validation error. Invalid file type. Specify a valid file type."
INVALID_FILE_NAME= f"{error_prefix} Validation error. Invalid file name. Specify a valid file name."
FILE_READ_ERROR= f"{error_prefix} Validation error. Unable to read file. Verify the file path."
INVALID_DEIDENTIFY_FILE_PATH= f"{error_prefix} Validation error. Invalid file path. Specify a valid file path."
INVALID_BASE64_HEADER= f"{error_prefix} Validation error. Invalid base64 header. Specify a valid base64 header."
INVALID_WAIT_TIME= f"{error_prefix} Validation error. Invalid wait time. Specify a valid wait time as number and should not be greater than 64 secs."
INVALID_OUTPUT_DIRECTORY= f"{error_prefix} Validation error. Invalid output directory. Specify a valid output directory as string."
Expand Down
34 changes: 34 additions & 0 deletions skyflow/utils/validations/_validations.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from skyflow.utils.logger import log_info, log_error_log
from skyflow.vault.detect import DeidentifyTextRequest, ReidentifyTextRequest, TokenFormat, Transformations, \
GetDetectRunRequest, Bleep, DeidentifyFileRequest
from skyflow.vault.detect._file_input import FileInput

valid_vault_config_keys = ["vault_id", "cluster_id", "credentials", "env"]
valid_connection_config_keys = ["connection_id", "connection_url", "credentials"]
Expand Down Expand Up @@ -257,9 +258,42 @@ def validate_update_connection_config(logger, config):

return True

def validate_file_from_request(file_input: FileInput):
if file_input is None:
raise SkyflowError(SkyflowMessages.Error.INVALID_FILE_INPUT.value, invalid_input_error_code)

has_file = hasattr(file_input, 'file') and file_input.file is not None
has_file_path = hasattr(file_input, 'file_path') and file_input.file_path is not None

# Must provide exactly one of file or file_path
if (has_file and has_file_path) or (not has_file and not has_file_path):
raise SkyflowError(SkyflowMessages.Error.INVALID_DEIDENTIFY_FILE_INPUT.value, invalid_input_error_code)

if has_file:
file = file_input.file
# Validate file object has required attributes
if not hasattr(file, 'name') or not isinstance(file.name, str) or not file.name.strip():
raise SkyflowError(SkyflowMessages.Error.INVALID_FILE_TYPE.value, invalid_input_error_code)

# Validate file name
file_name = os.path.splitext(file.name)[0]
if not file_name or not file_name.strip():
raise SkyflowError(SkyflowMessages.Error.INVALID_FILE_NAME.value, invalid_input_error_code)

elif has_file_path:
file_path = file_input.file_path
if not isinstance(file_path, str) or not file_path.strip():
raise SkyflowError(SkyflowMessages.Error.INVALID_DEIDENTIFY_FILE_PATH.value, invalid_input_error_code)

if not os.path.exists(file_path) or not os.path.isfile(file_path):
raise SkyflowError(SkyflowMessages.Error.INVALID_DEIDENTIFY_FILE_PATH.value, invalid_input_error_code)

def validate_deidentify_file_request(logger, request: DeidentifyFileRequest):
if not hasattr(request, 'file') or request.file is None:
raise SkyflowError(SkyflowMessages.Error.INVALID_FILE_INPUT.value, invalid_input_error_code)

# Validate file input first
validate_file_from_request(request.file)

# Optional: entities
if hasattr(request, 'entities') and request.entities is not None:
Expand Down
34 changes: 29 additions & 5 deletions skyflow/vault/controller/_detect.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import io
import json
import os
from skyflow.error import SkyflowError
Expand All @@ -20,6 +21,7 @@
from skyflow.vault.detect import DeidentifyTextRequest, DeidentifyTextResponse, ReidentifyTextRequest, \
ReidentifyTextResponse, DeidentifyFileRequest, DeidentifyFileResponse, GetDetectRunRequest


class Detect:
def __init__(self, vault_client):
self.__vault_client = vault_client
Expand Down Expand Up @@ -124,10 +126,22 @@ def output_to_dict_list(output):
word_count = getattr(word_character_count, "word_count", None)
char_count = getattr(word_character_count, "character_count", None)

base64_string = first_output.get("file", None)
extension = first_output.get("extension", None)

file_obj = None
if base64_string is not None:
file_bytes = base64.b64decode(base64_string)
file_obj = io.BytesIO(file_bytes)
file_obj.name = f"deidentified.{extension}" if extension else "processed_file"
else:
file_obj = None

return DeidentifyFileResponse(
file=first_output.get("file", None),
file_base64=base64_string,
file=file_obj, # File class will be instantiated in DeidentifyFileResponse
type=first_output.get("type", None),
extension=first_output.get("extension", None),
extension=extension,
word_count=word_count,
char_count=char_count,
size_in_kb=size,
Expand Down Expand Up @@ -216,16 +230,26 @@ def reidentify_text(self, request: ReidentifyTextRequest) -> ReidentifyTextRespo
log_error_log(SkyflowMessages.ErrorLogs.REIDENTIFY_TEXT_REQUEST_REJECTED.value, self.__vault_client.get_logger())
handle_exception(e, self.__vault_client.get_logger())

def __get_file_from_request(self, request: DeidentifyFileRequest):
file_input = request.file

# Check for file
if hasattr(file_input, 'file') and file_input.file is not None:
return file_input.file

# Check for file_path if file is not provided
if hasattr(file_input, 'file_path') and file_input.file_path is not None:
return open(file_input.file_path, 'rb')

def deidentify_file(self, request: DeidentifyFileRequest):
log_info(SkyflowMessages.Info.DETECT_FILE_TRIGGERED.value, self.__vault_client.get_logger())
validate_deidentify_file_request(self.__vault_client.get_logger(), request)
self.__initialize()
files_api = self.__vault_client.get_detect_file_api().with_raw_response
file_obj = request.file
file_obj = self.__get_file_from_request(request)
file_name = getattr(file_obj, 'name', None)
file_extension = self._get_file_extension(file_name) if file_name else None
file_content = file_obj.read()

base64_string = base64.b64encode(file_content).decode('utf-8')

try:
Expand Down Expand Up @@ -375,7 +399,7 @@ def deidentify_file(self, request: DeidentifyFileRequest):
file_name_only = 'processed-'+os.path.basename(file_name)
output_file_path = f"{request.output_directory}/{file_name_only}"
with open(output_file_path, 'wb') as output_file:
output_file.write(base64.b64decode(parsed_response.file))
output_file.write(base64.b64decode(parsed_response.file_base64))
log_info(SkyflowMessages.Info.DETECT_FILE_SUCCESS.value, self.__vault_client.get_logger())
return parsed_response

Expand Down
3 changes: 2 additions & 1 deletion skyflow/vault/detect/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@
from ._deidentify_file_request import DeidentifyFileRequest
from ._audio_bleep import Bleep
from ._deidentify_file_response import DeidentifyFileResponse
from ._get_detect_run_request import GetDetectRunRequest
from ._get_detect_run_request import GetDetectRunRequest
from ._file_input import FileInput
3 changes: 2 additions & 1 deletion skyflow/vault/detect/_deidentify_file_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from skyflow.vault.detect import TokenFormat, Transformations
from skyflow.vault.detect._audio_bleep import Bleep
from skyflow.utils.enums import MaskingMethod, DetectOutputTranscriptions
from skyflow.vault.detect._file_input import FileInput

class DeidentifyFileRequest:
def __init__(
Expand All @@ -24,7 +25,7 @@ def __init__(
output_directory: Optional[str] = None,
wait_time: Optional[Union[int, float]] = None
):
self.file: object = file
self.file: FileInput = file
self.entities: Optional[List[DetectEntities]] = entities
self.allow_regex_list: Optional[List[str]] = allow_regex_list
self.restrict_regex_list: Optional[List[str]] = restrict_regex_list
Expand Down
21 changes: 13 additions & 8 deletions skyflow/vault/detect/_deidentify_file_response.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import io
from skyflow.vault.detect._file import File

class DeidentifyFileResponse:
def __init__(
self,
file: str = None,
file_base64: str = None,
file: io.BytesIO = None,
type: str = None,
extension: str = None,
word_count: int = None,
Expand All @@ -15,7 +19,8 @@ def __init__(
status: str = None,
errors: list = [],
):
self.file = file
self.file_base64 = file_base64
self.file = File(file) if file else None
self.type = type
self.extension = extension
self.word_count = word_count
Expand All @@ -32,12 +37,12 @@ def __init__(
def __repr__(self):
return (
f"DeidentifyFileResponse("
f"file={self.file!r}, type={self.type!r}, extension={self.extension!r}, "
f"word_count={self.word_count!r}, char_count={self.char_count!r}, "
f"size_in_kb={self.size_in_kb!r}, duration_in_seconds={self.duration_in_seconds!r}, "
f"page_count={self.page_count!r}, slide_count={self.slide_count!r}, "
f"entities={self.entities!r}, run_id={self.run_id!r}, status={self.status!r}),"
f"errors={self.errors!r})"
f"file_base64={self.file_base64!r}, file={self.file!r}, type={self.type!r}, "
f"extension={self.extension!r}, word_count={self.word_count!r}, "
f"char_count={self.char_count!r}, size_in_kb={self.size_in_kb!r}, "
f"duration_in_seconds={self.duration_in_seconds!r}, page_count={self.page_count!r}, "
f"slide_count={self.slide_count!r}, entities={self.entities!r}, "
f"run_id={self.run_id!r}, status={self.status!r}, errors={self.errors!r})"
)

def __str__(self):
Expand Down
53 changes: 53 additions & 0 deletions skyflow/vault/detect/_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import io
import mimetypes
import time

class File:
def __init__(self, file: io.BytesIO = None):
self.file = file

@property
def name(self) -> str:
"""Get file name"""
if self.file:
return getattr(self.file, 'name', 'unknown')
return None

@property
def size(self) -> int:
"""Get file size in bytes"""
if self.file:
pos = self.file.tell()
self.file.seek(0, io.SEEK_END)
size = self.file.tell()
self.file.seek(pos)
return size
return None

@property
def type(self) -> str:
"""Get file mime type"""
if self.file:
return mimetypes.guess_type(self.name)[0] or ''
return None

@property
def last_modified(self) -> int:
"""Get file last modified timestamp in milliseconds"""
if self.file:
return int(time.time() * 1000)
return None

def seek(self, offset, whence=0):
if self.file:
return self.file.seek(offset, whence)

def read(self, size=-1):
if self.file:
return self.file.read(size)

def __repr__(self):
return (
f"File(name={self.name!r}, size={self.size!r}, type={self.type!r}, "
f"last_modified={self.last_modified!r})"
)
19 changes: 19 additions & 0 deletions skyflow/vault/detect/_file_input.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
class FileInput:
"""
Represents a file input for the vault detection process.

Attributes:
file (str): The file object to be processed. This can be a file-like object or a binary string.
file_path (str): The path to the file to be processed.
"""

def __init__(self, file: str= None, file_path: str = None):
self.file = file
self.file_path = file_path

def __repr__(self) -> str:
return f"FileInput(file={self.file!r}, file_path={self.file_path!r})"

def __str__(self) -> str:
return self.__repr__()

Loading
Loading