Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 34 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ table_name = '<SENSITIVE_DATA_TABLE>' # Replace with your actual table name

# Create Insert Request
insert_request = InsertRequest(
table_name=table_name,
table=table_name,
values=insert_data,
return_tokens=True, # Optional: Get tokens for inserted data
continue_on_error=True # Optional: Continue on partial errors
Expand Down Expand Up @@ -273,7 +273,7 @@ options = InsertOptions(

```python
insert_request = InsertRequest(
table_name=table_name, # Replace with the table name
table=table_name, # Replace with the table name
values=insert_data,
return_tokens=False, # Do not return tokens
continue_on_error=False, # Stop inserting if any record fails
Expand Down Expand Up @@ -474,7 +474,7 @@ try:

# Step 2: Create Insert Request
insert_request = InsertRequest(
table_name='table1', # Specify the table in the vault where the data will be inserted
table='table1', # Specify the table in the vault where the data will be inserted
values=insert_data, # Attach the data (records) to be inserted
return_tokens=True, # Specify if tokens should be returned upon successful insertion
continue_on_error=True # Optional: Continue on partial errors
Expand Down Expand Up @@ -551,7 +551,7 @@ try:

# Step 2: Build an InsertRequest object with the table name and the data to insert
insert_request = InsertRequest(
table_name='<TABLE_NAME>', # Replace with the actual table name in your Skyflow vault
table='<TABLE_NAME>', # Replace with the actual table name in your Skyflow vault
values=insert_data, # Attach the data to be inserted
)

Expand Down Expand Up @@ -608,7 +608,7 @@ try:

# Step 4: Build the InsertRequest object with the data records to insert
insert_request = InsertRequest(
table_name='table1', # Specify the table in the vault where the data will be inserted
table='table1', # Specify the table in the vault where the data will be inserted
values=insert_data, # Attach the data (records) to be inserted
return_tokens=True, # Specify if tokens should be returned upon successful insertion
continue_on_error=True # Specify to continue inserting records even if an error occurs for some records
Expand Down Expand Up @@ -686,7 +686,7 @@ try:

# Step 3: Build the InsertRequest object with the upsertData
insert_request = InsertRequest(
table_name='table1', # Specify the table in the vault where the data will be inserted
table='table1', # Specify the table in the vault where the data will be inserted
values=insert_data, # Attach the data (records) to be inserted
return_tokens=True, # Specify if tokens should be returned upon successful insertion
upsert='cardholder_name' # Specify the field to be used for upsert operations (e.g., cardholder_name)
Expand Down Expand Up @@ -1897,23 +1897,24 @@ ReidentifyTextResponse(
```

### Deidentify File
To deidentify files, use the `deidentify_file` method. The `DeidentifyFileRequest` class creates a deidentify file request, which includes the file to be deidentified and various configuration options.
To deidentify files, use the `deidentify_file` method. The `DeidentifyFileRequest` class creates a deidentify file request. It takes the file to de-identify, provided as either a file object or a file path wrapped in the `FileInput` class, along with various configuration options.

#### Construct a Deidentify File request
```python
from skyflow.error import SkyflowError
from skyflow.utils.enums import DetectEntities, MaskingMethod, DetectOutputTranscriptions
from skyflow.vault.detect import DeidentifyFileRequest, TokenFormat, Transformations, Bleep
from skyflow.vault.detect import DeidentifyFileRequest, TokenFormat, Transformations, Bleep, FileInput
"""
This example demonstrates how to deidentify a file, along with the corresponding DeidentifyFileRequest schema.
"""
try:
# Initialize Skyflow client
# Step 1: Open file for deidentification
file = open('<FILE_PATH>', 'rb') # Open the file in read-binary mode
file_path="<FILE_PATH>"
file = open(file_path, 'rb') # Open the file in read-binary mode
# Step 2: Create deidentify file request
request = DeidentifyFileRequest(
file=file, # File object to deidentify
file=FileInput(file), # File to de-identify (can also provide a file path)
entities=[DetectEntities.SSN, DetectEntities.CREDIT_CARD], # Entities to detect

# Token format configuration
Expand Down Expand Up @@ -1971,7 +1972,7 @@ except Exception as error:
```python
from skyflow.error import SkyflowError
from skyflow.utils.enums import DetectEntities, MaskingMethod, DetectOutputTranscriptions
from skyflow.vault.detect import DeidentifyFileRequest, TokenFormat, Bleep
from skyflow.vault.detect import DeidentifyFileRequest, TokenFormat, Bleep, FileInput
"""
* Skyflow Deidentify File Example
*
Expand All @@ -1985,7 +1986,7 @@ try:
file = open('sensitive_document.txt', 'rb') # Open the file in read-binary mode
# Step 2: Create deidentify file request
request = DeidentifyFileRequest(
file=file, # File object to deidentify
file=FileInput(file), # File to de-identify (can also provide a file path)
entities=[
DetectEntities.SSN,
DetectEntities.CREDIT_CARD
Expand Down Expand Up @@ -2038,7 +2039,6 @@ DeidentifyFileResponse(
],
run_id='83abcdef-2b61-4a83-a4e0-cbc71ffabffd',
status='SUCCESS',
errors=[]
)
```

Expand Down Expand Up @@ -2121,7 +2121,7 @@ except Exception as error:
print('Unexpected Error:', error) # Print the stack trace for debugging purposes
```

Sample Response
Sample Response:
```python
DeidentifyFileResponse(
file='TXkgY2FyZCBudW1iZXIgaXMgW0NSRURJVF9DQVJEXQpteSBzZWNvbmQ…', # Base64 encoded file content
Expand All @@ -2142,7 +2142,26 @@ DeidentifyFileResponse(
],
run_id='48ec05ba-96ec-4641-a8e2-35e066afef95',
status='SUCCESS',
errors=[]
)
```

In case of an invalid or expired run ID:

```python
DeidentifyFileResponse(
file_base64=None,
file=None,
type='UNKNOWN',
extension=None,
word_count=None,
char_count=None,
size_in_kb=0.0,
duration_in_seconds=None,
page_count=None,
slide_count=None,
entities=[],
run_id='1e9f321f-dd51-4ab1-a014-21212fsdfsd',
status='UNKNOWN'
)
```

Expand Down
4 changes: 2 additions & 2 deletions samples/detect_api/deidentify_file.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from skyflow.error import SkyflowError
from skyflow import Env, Skyflow, LogLevel
from skyflow.utils.enums import DetectEntities, MaskingMethod, DetectOutputTranscriptions
from skyflow.vault.detect import DeidentifyFileRequest, TokenFormat, Transformations, DateTransformation, Bleep
from skyflow.vault.detect import DeidentifyFileRequest, TokenFormat, Transformations, DateTransformation, Bleep, FileInput

"""
* Skyflow Deidentify File Example
Expand Down Expand Up @@ -39,7 +39,7 @@ def perform_file_deidentification():
file = open(file_path, 'rb')
# Step 5: Configure Deidentify File Request with all options
deidentify_request = DeidentifyFileRequest(
file=file, # File object to deidentify
file=FileInput(file), # File to de-identify (can also provide a file path)
entities=[DetectEntities.SSN, DetectEntities.CREDIT_CARD], # Entities to detect
allow_regex_list=['<YOUR_REGEX_PATTERN>'], # Optional: Patterns to allow
restrict_regex_list=['<YOUR_REGEX_PATTERN>'], # Optional: Patterns to restrict
Expand Down
2 changes: 1 addition & 1 deletion samples/vault_api/insert_byot.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def perform_secure_data_insertion_with_byot():
]

insert_request = InsertRequest(
table_name=table_name,
table=table_name,
values=insert_data,
token_mode=TokenMode.ENABLE, # Enable Bring Your Own Token (BYOT)
tokens=tokens, # Specify tokens to use for BYOT
Expand Down
2 changes: 1 addition & 1 deletion samples/vault_api/insert_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def perform_secure_data_insertion():

# Step 5: Create Insert Request
insert_request = InsertRequest(
table_name=table_name,
table=table_name,
values=insert_data,
return_tokens=True, # Optional: Get tokens for inserted data
continue_on_error=True # Optional: Continue on partial errors
Expand Down
1 change: 1 addition & 0 deletions skyflow/utils/_skyflow_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,7 @@ class ErrorLogs(Enum):
DEIDENTIFY_FILE_REQUEST_REJECTED = f"{ERROR}: [{error_prefix}] Deidentify file resulted in failure."
DETECT_RUN_REQUEST_REJECTED = f"{ERROR}: [{error_prefix}] Detect get run resulted in failure."
DEIDENTIFY_TEXT_REQUEST_REJECTED = f"{ERROR}: [{error_prefix}] Deidentify text resulted in failure."
SAVING_DEIDENTIFY_FILE_FAILED = f"{ERROR}: [{error_prefix}] Error while saving deidentified file to output directory."
REIDENTIFY_TEXT_REQUEST_REJECTED = f"{ERROR}: [{error_prefix}] Reidentify text resulted in failure."
DETECT_FILE_REQUEST_REJECTED = f"{ERROR}: [{error_prefix}] Deidentify file resulted in failure."

Expand Down
3 changes: 3 additions & 0 deletions skyflow/utils/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,3 +447,6 @@ def encode_column_values(get_request):
encoded_column_values.append(quote(column))

return encoded_column_values

def get_attribute(obj, camel_case, snake_case):
    """Return an attribute of ``obj`` that may be named in camelCase or snake_case.

    The camelCase name is tried first; the snake_case name is used only when
    the camelCase attribute is missing or is ``None``.

    Args:
        obj: Object to read the attribute from.
        camel_case: Attribute name in camelCase (e.g. ``'processedFile'``).
        snake_case: Attribute name in snake_case (e.g. ``'processed_file'``).

    Returns:
        The first non-``None`` value found, or ``None`` when neither
        attribute is present.
    """
    value = getattr(obj, camel_case, None)
    # Compare against None rather than relying on truthiness, so legitimate
    # falsy values (0, "", False, empty containers) are returned instead of
    # being mistaken for a missing attribute.
    if value is None:
        value = getattr(obj, snake_case, None)
    return value
6 changes: 3 additions & 3 deletions skyflow/utils/validations/_validations.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ def validate_file_from_request(file_input: FileInput):
raise SkyflowError(SkyflowMessages.Error.INVALID_FILE_TYPE.value, invalid_input_error_code)

# Validate file name
file_name = os.path.splitext(file.name)[0]
file_name, _ = os.path.splitext(os.path.basename(file.name))
if not file_name or not file_name.strip():
raise SkyflowError(SkyflowMessages.Error.INVALID_FILE_NAME.value, invalid_input_error_code)

Expand Down Expand Up @@ -393,10 +393,10 @@ def validate_deidentify_file_request(logger, request: DeidentifyFileRequest):
raise SkyflowError(SkyflowMessages.Error.WAIT_TIME_GREATER_THEN_64.value, invalid_input_error_code)

def validate_insert_request(logger, request):
if not isinstance(request.table_name, str):
if not isinstance(request.table, str):
log_error_log(SkyflowMessages.ErrorLogs.TABLE_IS_REQUIRED.value.format("INSERT"), logger = logger)
raise SkyflowError(SkyflowMessages.Error.INVALID_TABLE_NAME_IN_INSERT.value, invalid_input_error_code)
if not request.table_name.strip():
if not request.table.strip():
log_error_log(SkyflowMessages.ErrorLogs.EMPTY_TABLE_NAME.value.format("INSERT"), logger = logger)
raise SkyflowError(SkyflowMessages.Error.MISSING_TABLE_NAME_IN_INSERT.value, invalid_input_error_code)

Expand Down
57 changes: 47 additions & 10 deletions skyflow/vault/controller/_detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
from skyflow.generated.rest import DeidentifyTextRequestFile, DeidentifyAudioRequestFile, DeidentifyPdfRequestFile, \
DeidentifyImageRequestFile, DeidentifyPresentationRequestFile, DeidentifySpreadsheetRequestFile, \
DeidentifyDocumentRequestFile, DeidentifyFileRequestFile
from skyflow.generated.rest.types.deidentify_status_response import DeidentifyStatusResponse
from skyflow.utils._skyflow_messages import SkyflowMessages
from skyflow.utils._utils import get_metrics, handle_exception, parse_deidentify_text_response, parse_reidentify_text_response
from skyflow.utils._utils import get_attribute, get_metrics, handle_exception, parse_deidentify_text_response, parse_reidentify_text_response
from skyflow.utils.constants import SKY_META_DATA_HEADER
from skyflow.utils.logger import log_info, log_error_log
from skyflow.utils.validations import validate_deidentify_file_request, validate_get_detect_run_request
Expand Down Expand Up @@ -83,6 +84,43 @@ def __poll_for_processed_file(self, run_id, max_wait_time=64):
except Exception as e:
raise e

def __save_deidentify_file_response_output(self, response: DeidentifyStatusResponse, output_directory: str, original_file_name: str, name_without_ext: str):
    """Decode every processed file in ``response.output`` and write it to disk.

    Each output entry is read through ``get_attribute`` so that both the
    camelCase and snake_case attribute spellings are accepted. The base64
    payload is decoded and written to ``output_directory`` as
    ``processed-<base name>.<processed extension>``; when an entry carries no
    extension the original file name (with its own extension) is used instead.

    Nothing is written when the response has no output, when
    ``output_directory`` is empty, or when the directory does not exist.

    Args:
        response: Deidentify status response whose ``output`` list is saved.
        output_directory: Existing directory that receives the files.
        original_file_name: Name or path of the uploaded file; only its base
            name is used to build output names.
        name_without_ext: Not used by this method; kept so existing callers
            keep working.
    """
    if not response or not hasattr(response, 'output') or not response.output or not output_directory:
        return

    if not os.path.exists(output_directory):
        return

    deidentify_file_prefix = "processed-"

    # Strip any directory part so output always lands inside output_directory.
    base_original_filename = os.path.basename(original_file_name)
    base_name_without_ext = os.path.splitext(base_original_filename)[0]

    for output in response.output:
        try:
            processed_file = get_attribute(output, 'processedFile', 'processed_file')
            processed_file_extension = get_attribute(output, 'processedFileExtension', 'processed_file_extension')

            # Entries without a payload have nothing to save.
            if not processed_file:
                continue

            decoded_data = base64.b64decode(processed_file)

            if processed_file_extension:
                output_file_name = os.path.join(output_directory, f"{deidentify_file_prefix}{base_name_without_ext}.{processed_file_extension}")
            else:
                # No extension reported: fall back to the original file name
                # rather than formatting None into the name ("processed-x.None").
                output_file_name = os.path.join(output_directory, deidentify_file_prefix + base_original_filename)

            with open(output_file_name, 'wb') as f:
                f.write(decoded_data)
        except Exception as e:
            log_error_log(SkyflowMessages.ErrorLogs.SAVING_DEIDENTIFY_FILE_FAILED.value, self.__vault_client.get_logger())
            # NOTE(review): handle_exception presumably raises a SkyflowError,
            # which would stop the remaining outputs from being saved - confirm.
            handle_exception(e, self.__vault_client.get_logger())

def __parse_deidentify_file_response(self, data, run_id=None, status=None):
output = getattr(data, "output", [])
status_val = getattr(data, "status", None) or status
Expand Down Expand Up @@ -141,8 +179,8 @@ def output_to_dict_list(output):

return DeidentifyFileResponse(
file_base64=base64_string,
file=file_obj, # File class will be instantiated in DeidentifyFileResponse
type=first_output.get("type", None),
file=file_obj,
type=first_output.get("type", "UNKNOWN"),
extension=extension,
word_count=word_count,
char_count=char_count,
Expand All @@ -153,7 +191,6 @@ def output_to_dict_list(output):
entities=entities,
run_id=run_id_val,
status=status_val,
errors=None
)

def __get_token_format(self, request):
Expand Down Expand Up @@ -396,12 +433,11 @@ def deidentify_file(self, request: DeidentifyFileRequest):
run_id = getattr(api_response.data, 'run_id', None)

processed_response = self.__poll_for_processed_file(run_id, request.wait_time)
parsed_response = self.__parse_deidentify_file_response(processed_response, run_id)
if request.output_directory and processed_response.status == 'SUCCESS':
file_name_only = 'processed-'+os.path.basename(file_name)
output_file_path = f"{request.output_directory}/{file_name_only}"
with open(output_file_path, 'wb') as output_file:
output_file.write(base64.b64decode(parsed_response.file_base64))
name_without_ext, _ = os.path.splitext(file_name)
self.__save_deidentify_file_response_output(processed_response, request.output_directory, file_name, name_without_ext)

parsed_response = self.__parse_deidentify_file_response(processed_response, run_id)
log_info(SkyflowMessages.Info.DETECT_FILE_SUCCESS.value, self.__vault_client.get_logger())
return parsed_response

Expand All @@ -411,9 +447,9 @@ def deidentify_file(self, request: DeidentifyFileRequest):
handle_exception(e, self.__vault_client.get_logger())

def get_detect_run(self, request: GetDetectRunRequest):
log_info(SkyflowMessages.Info.GET_DETECT_RUN_TRIGGERED.value,self.__vault_client.get_logger())
log_info(SkyflowMessages.Info.VALIDATING_GET_DETECT_RUN_INPUT.value, self.__vault_client.get_logger())
validate_get_detect_run_request(self.__vault_client.get_logger(), request)
log_info(SkyflowMessages.Info.DEIDENTIFY_TEXT_REQUEST_RESOLVED.value, self.__vault_client.get_logger())
self.__initialize()

files_api = self.__vault_client.get_detect_file_api().with_raw_response
Expand All @@ -428,6 +464,7 @@ def get_detect_run(self, request: GetDetectRunRequest):
parsed_response = self.__parse_deidentify_file_response(DeidentifyFileResponse(run_id=run_id, status='IN_PROGRESS'))
else:
parsed_response = self.__parse_deidentify_file_response(response.data, run_id, response.data.status)
log_info(SkyflowMessages.Info.GET_DETECT_RUN_SUCCESS.value,self.__vault_client.get_logger())
return parsed_response
except Exception as e:
log_error_log(SkyflowMessages.ErrorLogs.DETECT_FILE_REQUEST_REJECTED.value,
Expand Down
7 changes: 4 additions & 3 deletions skyflow/vault/controller/_vault.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
parse_tokenize_response, parse_query_response, parse_get_response, encode_column_values, get_metrics
from skyflow.utils.constants import SKY_META_DATA_HEADER
from skyflow.utils.enums import RequestMethod
from skyflow.utils.enums.redaction_type import RedactionType
from skyflow.utils.logger import log_info, log_error_log
from skyflow.utils.validations import validate_insert_request, validate_delete_request, validate_query_request, \
validate_get_request, validate_update_request, validate_detokenize_request, validate_tokenize_request
Expand Down Expand Up @@ -53,7 +54,7 @@ def __build_insert_body(self, request: InsertRequest):
records_list = self.__build_batch_field_records(
request.values,
request.tokens,
request.table_name,
request.table,
request.return_tokens,
request.upsert
)
Expand Down Expand Up @@ -85,7 +86,7 @@ def insert(self, request: InsertRequest):

else:
api_response = records_api.record_service_insert_record(self.__vault_client.get_vault_id(),
request.table_name, records=insert_body,tokenization= request.return_tokens, upsert=request.upsert, homogeneous=request.homogeneous, byot=request.token_mode.value, request_options=self.__get_headers())
request.table, records=insert_body,tokenization= request.return_tokens, upsert=request.upsert, homogeneous=request.homogeneous, byot=request.token_mode.value, request_options=self.__get_headers())

insert_response = parse_insert_response(api_response, request.continue_on_error)
log_info(SkyflowMessages.Info.INSERT_SUCCESS.value, self.__vault_client.get_logger())
Expand Down Expand Up @@ -201,7 +202,7 @@ def detokenize(self, request: DetokenizeRequest):
tokens_list = [
V1DetokenizeRecordRequest(
token=item.get('token'),
redaction=item.get('redaction', None)
redaction=item.get('redaction', RedactionType.DEFAULT)
)
for item in request.data
]
Expand Down
4 changes: 2 additions & 2 deletions skyflow/vault/data/_insert_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@

class InsertRequest:
def __init__(self,
table_name,
table,
values,
tokens = None,
upsert = None,
homogeneous = False,
token_mode = TokenMode.DISABLE,
return_tokens = True,
continue_on_error = False):
self.table_name = table_name
self.table = table
self.values = values
self.tokens = tokens
self.upsert = upsert
Expand Down
Loading
Loading