diff --git a/backend/audit/intake_to_dissemination.py b/backend/audit/intake_to_dissemination.py index 38330d407b..4be6a92b46 100644 --- a/backend/audit/intake_to_dissemination.py +++ b/backend/audit/intake_to_dissemination.py @@ -1,7 +1,11 @@ import logging import pytz from django.db import IntegrityError +from hashlib import sha1 +from dateutil.parser import parse +from datetime import datetime +from django.forms.models import model_to_dict from audit.intakelib.transforms.xform_resize_award_references import _format_reference from audit.models.constants import RESUBMISSION_STATUS from audit.utils import Util @@ -20,6 +24,7 @@ SecondaryAuditor, ) +from dissemination.summary_reports import field_name_ordered logger = logging.getLogger(__name__) @@ -29,6 +34,61 @@ def omit(remove, d) -> dict: return {k: d[k] for k in d if k not in remove} +# Date-type things need to be converted from datetimes to dates. +def convert_to_string(o): + if isinstance(o, datetime): + return f"{o.date()}" + if o is None: + return "" + else: + return f"{o}" + + +def hashable_types(o): + logger.info(f"{type(o)} {o}") + return o + + +# This is used to calculate a hash of the data for both internal and external integrity. +def hash_dissemination_object(obj): + # Given a hash, alpha sort the keys. We do this by taking + # the object to a list of tuples, and then sorting + # the resulting list on the first element of the tuple. + # + # See https://stackoverflow.com/a/22003440 + # for reference. It isn't obvious how to do this well, and in particular, + # while leaving the JSON object keys out of the hash. + + # 1. Get the fields we're going to hash from the object + fields_to_hash = obj.HASH_FIELDS + # 2. We are given a Django object. Convert it to a dictionary. + d = model_to_dict(obj) + # 3. Dictionary to tuples + tupes = list(d.items()) + # 4. Tuples sorted by key + sorted_tupes = sorted(tupes, key=lambda k: k[0]) + # 5. 
Get rid of fields that we're not hashing + filtered_sorted = list(filter(lambda t: t[0] in fields_to_hash, sorted_tupes)) + # 6. Strip the keys + # Why strip the keys? We don't want our field names to impact + # the hashing value. We want to make sure the values in the object, in a consistent sort + # order, are what get hashed. If we change field names, yes, the hash will change. But + # our object field names are very consistent. + # It is unclear if we're going to get consistent, cross-language hashing here. + # It depends on how Python chooses to reprseent values as strings. If we don't quite get this right + # the first time, it will have to be improved, and the full dataset re-disseminated. + # p[0] is the key, p[1] is the value in the tuple list. + # Strings must be encoded to bytes before hashing. + just_values = list(map(lambda p: convert_to_string(p[1]), filtered_sorted)) + # 7. Append the values with no spaces. + smooshed = "".join(just_values).strip().encode("ascii", "ignore") + # This is now hashable. Run a SHA1. + shaobj = sha1() + shaobj.update(smooshed) + digest = shaobj.hexdigest() + return digest + + class IntakeToDissemination(object): DISSEMINATION = "dissemination" PRE_CERTIFICATION_REVIEW = "pre_certification_review" @@ -75,6 +135,10 @@ def save_dissemination_objects(self): for key, object_list in self.loaded_objects.items(): try: if object_list: + # # Add the hashes at the last possible moment. 
+ for obj in object_list: + sha = hash_dissemination_object(obj) + obj.hash = sha model_class = type(object_list[0]) model_class.objects.bulk_create(object_list) except IntegrityError as e: diff --git a/backend/audit/migrations/0030_singleauditchecklist_hash_singleauditreportfile_hash.py b/backend/audit/migrations/0030_singleauditchecklist_hash_singleauditreportfile_hash.py new file mode 100644 index 0000000000..f7547eba9d --- /dev/null +++ b/backend/audit/migrations/0030_singleauditchecklist_hash_singleauditreportfile_hash.py @@ -0,0 +1,27 @@ +# Generated by Django 5.2.3 on 2025-09-12 11:12 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("audit", "0029_alter_submissionevent_event"), + ] + + operations = [ + migrations.AddField( + model_name="singleauditchecklist", + name="hash", + field=models.CharField( + blank=True, help_text="A hash of the row", null=True + ), + ), + migrations.AddField( + model_name="singleauditreportfile", + name="hash", + field=models.CharField( + blank=True, help_text="A hash of the report", null=True + ), + ), + ] diff --git a/backend/audit/models/files.py b/backend/audit/models/files.py index 1c05254e6c..bf480c37cb 100644 --- a/backend/audit/models/files.py +++ b/backend/audit/models/files.py @@ -104,6 +104,18 @@ class SingleAuditReportFile(models.Model): component_page_numbers = models.JSONField( blank=True, null=True, validators=[validate_component_page_numbers] ) + # TODO: This value probably wants to be calculated at the point that the file is uploaded. + # If we do it on save(), it means we have to pull the object from S3 for hashing. It might + # be that we want to use the S3 checksum, however, instead of computing it ourselves: + # https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html + # That would have the benefit of storying the S3 hash in our application, and being something + # we can then verify between the DB and the store. 
However, it would be good if users could verify + # the checksum on the PDFs as well. + hash = models.CharField( + help_text="A hash of the report", + blank=True, + null=True, + ) def save(self, *args, **kwargs): report_id = self.sac.report_id diff --git a/backend/audit/models/models.py b/backend/audit/models/models.py index 1f3ad6efb7..050f3a4158 100644 --- a/backend/audit/models/models.py +++ b/backend/audit/models/models.py @@ -255,20 +255,24 @@ def disseminate(self): intake_to_dissem.load_all() intake_to_dissem.save_dissemination_objects() if intake_to_dissem.errors: + logger.info(f"I2D ERRORS: {intake_to_dissem.errors}") return {"errors": intake_to_dissem.errors} except TransactionManagementError as err: # We want to re-raise this to catch at the view level because we # think it's due to a race condition where the user's submission # has been disseminated successfully; see # https://github.com/GSA-TTS/FAC/issues/3347 + logger.error("Possible race in disseminate") raise err # TODO: figure out what narrower exceptions to catch here except Exception as err: + logger.error(f"Unknown error in disseminate: {err}") return {"errors": [err]} return None def redisseminate(self): + named_models = { "AdditionalEins": AdditionalEin, "AdditionalUeis": AdditionalUei, @@ -585,6 +589,21 @@ def get_statuses(self) -> type[STATUS]: blank=True, null=True, help_text="Resubmission JSON structure" ) + # Data hash for integrity + # This can be empty/null while data is being created, but it must be not-null + # at the point of dissemination. It should be the case that it is a hash of data, not fields, + # and the hash should match when calculated both on the internal table/data as well as the external data. + # That is, it should be possible to verify the hash via the API. Therefore, we compute this after + # submission, or as part of intake->dissemination. We store it in the internal table because it is then + # something that we expect to NOT CHANGE over time. 
(Resubmission metadata is not part of the hash.) Why? + # Because it is internal/administrative data tracking the connectedness of audits, not the audit data itself. + # (A later group/team may decide this is an incorrect decision.) + hash = models.CharField( + help_text="A hash of the row", + blank=True, + null=True, + ) + def validate_full(self): """ Full validation, intended for use when the user indicates that the diff --git a/backend/audit/views/submission_progress_view.py b/backend/audit/views/submission_progress_view.py index 549e8afea9..eb64948f48 100644 --- a/backend/audit/views/submission_progress_view.py +++ b/backend/audit/views/submission_progress_view.py @@ -145,7 +145,10 @@ def get(self, request, *args, **kwargs): shaped_audit = None audit_subcheck = None - _compare_progress_check(subcheck, audit_subcheck) + # MCJ 20250911 this should have been gated behind an `if`. + if audit: + _compare_progress_check(subcheck, audit_subcheck) + # Update with the view-specific info from SECTIONS_BASE: for key, value in SECTIONS_BASE.items(): subcheck[key] = subcheck[key] | value diff --git a/backend/dissemination/api/api_v1_1_0/create_views.sql b/backend/dissemination/api/api_v1_1_0/create_views.sql index 98dc3ba23a..95da0459da 100644 --- a/backend/dissemination/api/api_v1_1_0/create_views.sql +++ b/backend/dissemination/api/api_v1_1_0/create_views.sql @@ -8,9 +8,12 @@ create view api_v1_1_0.findings_text as gen.report_id, gen.auditee_uei, gen.audit_year, + gen.fac_accepted_date, + --- ft.finding_ref_number, ft.contains_chart_or_table, - ft.finding_text + ft.finding_text, + ft.hash from dissemination_findingtext ft, dissemination_general gen @@ -30,8 +33,10 @@ create view api_v1_1_0.additional_ueis as gen.report_id, gen.auditee_uei, gen.audit_year, + gen.fac_accepted_date, --- - uei.additional_uei + uei.additional_uei, + uei.hash from dissemination_general gen, dissemination_additionaluei uei @@ -48,6 +53,8 @@ create view api_v1_1_0.findings as gen.report_id, 
gen.auditee_uei, gen.audit_year, + gen.fac_accepted_date, + --- finding.award_reference, finding.reference_number, finding.is_material_weakness, @@ -58,7 +65,8 @@ create view api_v1_1_0.findings as finding.is_questioned_costs, finding.is_repeat_finding, finding.is_significant_deficiency, - finding.type_requirement + finding.type_requirement, + finding.hash from dissemination_finding finding, dissemination_general gen @@ -75,6 +83,7 @@ create view api_v1_1_0.federal_awards as award.report_id, gen.auditee_uei, gen.audit_year, + gen.fac_accepted_date, --- award.award_reference, award.federal_agency_prefix, @@ -94,7 +103,8 @@ create view api_v1_1_0.federal_awards as award.audit_report_type, award.findings_count, award.is_passthrough_award, - award.passthrough_amount + award.passthrough_amount, + award.hash from dissemination_federalaward award, dissemination_general gen @@ -112,10 +122,12 @@ create view api_v1_1_0.corrective_action_plans as gen.report_id, gen.auditee_uei, gen.audit_year, + gen.fac_accepted_date, --- ct.finding_ref_number, ct.contains_chart_or_table, - ct.planned_action + ct.planned_action, + ct.hash from dissemination_CAPText ct, dissemination_General gen @@ -135,13 +147,15 @@ create view api_v1_1_0.notes_to_sefa as gen.report_id, gen.auditee_uei, gen.audit_year, + gen.fac_accepted_date, --- note.note_title as title, note.accounting_policies, note.is_minimis_rate_used, note.rate_explained, note.content, - note.contains_chart_or_table + note.contains_chart_or_table, + note.hash from dissemination_general gen, dissemination_note note @@ -161,10 +175,12 @@ create view api_v1_1_0.passthrough as gen.report_id, gen.auditee_uei, gen.audit_year, + gen.fac_accepted_date, --- pass.award_reference, pass.passthrough_id, - pass.passthrough_name + pass.passthrough_name, + pass.hash from dissemination_general as gen, dissemination_passthrough as pass @@ -254,7 +270,8 @@ create view api_v1_1_0.general as CASE EXISTS(SELECT aud.report_id FROM 
dissemination_secondaryauditor aud WHERE aud.report_id = gen.report_id) WHEN FALSE THEN 'No' ELSE 'Yes' - END AS is_secondary_auditors + END AS is_secondary_auditors, + gen.hash from dissemination_general gen order by gen.id @@ -268,6 +285,7 @@ create view api_v1_1_0.secondary_auditors as gen.report_id, gen.auditee_uei, gen.audit_year, + gen.fac_accepted_date, --- sa.auditor_ein, sa.auditor_name, @@ -278,7 +296,8 @@ create view api_v1_1_0.secondary_auditors as sa.address_street, sa.address_city, sa.address_state, - sa.address_zipcode + sa.address_zipcode, + sa.hash from dissemination_General gen, dissemination_SecondaryAuditor sa @@ -292,8 +311,10 @@ create view api_v1_1_0.additional_eins as gen.report_id, gen.auditee_uei, gen.audit_year, + gen.fac_accepted_date, --- - ein.additional_ein + ein.additional_ein, + ein.hash from dissemination_general gen, dissemination_additionalein ein @@ -307,11 +328,13 @@ create view api_v1_1_0.resubmission as gen.report_id, gen.auditee_uei, gen.audit_year, + gen.fac_accepted_date, --- resub.version, resub.status, resub.previous_report_id, - resub.next_report_id + resub.next_report_id, + resub.hash from dissemination_general gen, dissemination_resubmission resub diff --git a/backend/dissemination/management/commands/delete_and_regenerate_dissemination_from_intake.py b/backend/dissemination/management/commands/delete_and_regenerate_dissemination_from_intake.py index 3972115354..bafc158a09 100644 --- a/backend/dissemination/management/commands/delete_and_regenerate_dissemination_from_intake.py +++ b/backend/dissemination/management/commands/delete_and_regenerate_dissemination_from_intake.py @@ -56,7 +56,6 @@ def handle(self, *args, **_kwargs): try: sac = SingleAuditChecklist.objects.get(report_id=report_id) sac.redisseminate() - logger.info(f"Redisseminated: {report_id}") exit(0) except SingleAuditChecklist.DoesNotExist: logger.info(f"No report with report_id found: {report_id}") diff --git 
a/backend/dissemination/migrations/0023_additionalein_hash_additionaluei_hash_captext_hash_and_more.py b/backend/dissemination/migrations/0023_additionalein_hash_additionaluei_hash_captext_hash_and_more.py new file mode 100644 index 0000000000..f98ff246ef --- /dev/null +++ b/backend/dissemination/migrations/0023_additionalein_hash_additionaluei_hash_captext_hash_and_more.py @@ -0,0 +1,90 @@ +# Generated by Django 5.2.3 on 2025-09-12 11:12 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("dissemination", "0022_resubmission"), + ] + + operations = [ + migrations.AddField( + model_name="additionalein", + name="hash", + field=models.CharField( + blank=True, help_text="A hash of the row", null=True + ), + ), + migrations.AddField( + model_name="additionaluei", + name="hash", + field=models.CharField( + blank=True, help_text="A hash of the row", null=True + ), + ), + migrations.AddField( + model_name="captext", + name="hash", + field=models.CharField( + blank=True, help_text="A hash of the row", null=True + ), + ), + migrations.AddField( + model_name="federalaward", + name="hash", + field=models.CharField( + blank=True, help_text="A hash of the row", null=True + ), + ), + migrations.AddField( + model_name="finding", + name="hash", + field=models.CharField( + blank=True, help_text="A hash of the row", null=True + ), + ), + migrations.AddField( + model_name="findingtext", + name="hash", + field=models.CharField( + blank=True, help_text="A hash of the row", null=True + ), + ), + migrations.AddField( + model_name="general", + name="hash", + field=models.CharField( + blank=True, help_text="A hash of the row", null=True + ), + ), + migrations.AddField( + model_name="note", + name="hash", + field=models.CharField( + blank=True, help_text="A hash of the row", null=True + ), + ), + migrations.AddField( + model_name="passthrough", + name="hash", + field=models.CharField( + blank=True, help_text="A hash of the row", 
null=True + ), + ), + migrations.AddField( + model_name="resubmission", + name="hash", + field=models.CharField( + blank=True, help_text="A hash of the row", null=True + ), + ), + migrations.AddField( + model_name="secondaryauditor", + name="hash", + field=models.CharField( + blank=True, help_text="A hash of the row", null=True + ), + ), + ] diff --git a/backend/dissemination/models/additionalein.py b/backend/dissemination/models/additionalein.py index dd6ec7231f..b6a46277a3 100644 --- a/backend/dissemination/models/additionalein.py +++ b/backend/dissemination/models/additionalein.py @@ -5,6 +5,8 @@ class AdditionalEin(models.Model): """Additional EINs for this audit.""" + HASH_FIELDS = ["report_id", "additional_ein"] + report_id = models.ForeignKey( "General", help_text=REPORT_ID_FK_HELP_TEXT, @@ -13,3 +15,8 @@ class AdditionalEin(models.Model): db_column="report_id", ) additional_ein = models.TextField() + hash = models.CharField( + help_text="A hash of the row", + blank=True, + null=True, + ) diff --git a/backend/dissemination/models/additionaluei.py b/backend/dissemination/models/additionaluei.py index f873ed5378..a429725984 100644 --- a/backend/dissemination/models/additionaluei.py +++ b/backend/dissemination/models/additionaluei.py @@ -5,6 +5,8 @@ class AdditionalUei(models.Model): """Additional UEIs for this audit.""" + HASH_FIELDS = ["report_id", "additional_uei"] + report_id = models.ForeignKey( "General", help_text=REPORT_ID_FK_HELP_TEXT, @@ -13,3 +15,8 @@ class AdditionalUei(models.Model): db_column="report_id", ) additional_uei = models.TextField() + hash = models.CharField( + help_text="A hash of the row", + blank=True, + null=True, + ) diff --git a/backend/dissemination/models/captext.py b/backend/dissemination/models/captext.py index d20bfa936b..aa51faf952 100644 --- a/backend/dissemination/models/captext.py +++ b/backend/dissemination/models/captext.py @@ -6,6 +6,13 @@ class CapText(models.Model): """Corrective action plan text. 
Referebces General""" + HASH_FIELDS = [ + "report_id", + "finding_ref_number", + "planned_action", + "contains_chart_or_table", + ] + contains_chart_or_table = models.TextField( "Indicates whether or not the text contained charts or tables that could not be entered due to formatting restrictions", help_text=docs.charts_tables_captext, @@ -25,3 +32,8 @@ class CapText(models.Model): to_field="report_id", db_column="report_id", ) + hash = models.CharField( + help_text="A hash of the row", + blank=True, + null=True, + ) diff --git a/backend/dissemination/models/federalaward.py b/backend/dissemination/models/federalaward.py index 2b75a8880a..f7135b90a4 100644 --- a/backend/dissemination/models/federalaward.py +++ b/backend/dissemination/models/federalaward.py @@ -19,6 +19,29 @@ class FederalAward(models.Model): # "findings_count" # ]), # ] + HASH_FIELDS = [ + "report_id", + "award_reference", + "federal_agency_prefix", + "federal_award_extension", + "aln", + "findings_count", + "additional_award_identification", + "federal_program_name", + "amount_expended", + "federal_program_total", + "cluster_name", + "state_cluster_name", + "other_cluster_name", + "cluster_total", + "is_direct", + "is_passthrough_award", + "passthrough_amount", + "is_major", + "audit_report_type", + "is_loan", + "loan_balance", + ] additional_award_identification = models.TextField( "Other data used to identify the award which is not a CFDA number (e.g., program year, contract number)", @@ -101,3 +124,8 @@ class FederalAward(models.Model): to_field="report_id", db_column="report_id", ) + hash = models.CharField( + help_text="A hash of the row", + blank=True, + null=True, + ) diff --git a/backend/dissemination/models/finding.py b/backend/dissemination/models/finding.py index 82260889fb..3ab9b79853 100644 --- a/backend/dissemination/models/finding.py +++ b/backend/dissemination/models/finding.py @@ -6,6 +6,24 @@ class Finding(models.Model): """A finding from the audit. 
References FederalAward and FindingText""" + HASH_FIELDS = [ + "report_id", + "federal_agency_prefix", + "federal_award_extension", + "aln", + "award_reference", + "reference_number", + "type_requirement", + "is_modified_opinion", + "is_other_findings", + "is_material_weakness", + "is_significant_deficiency", + "is_other_matters", + "is_questioned_costs", + "is_repeat_finding", + "prior_finding_ref_numbers", + ] + award_reference = models.TextField( "Order that the award line was reported in Award", ) @@ -53,3 +71,8 @@ class Finding(models.Model): "Type Requirement Failure", help_text=docs.type_requirement_findings, ) + hash = models.CharField( + help_text="A hash of the row", + blank=True, + null=True, + ) diff --git a/backend/dissemination/models/findingtext.py b/backend/dissemination/models/findingtext.py index 183b41d966..f1e0334f12 100644 --- a/backend/dissemination/models/findingtext.py +++ b/backend/dissemination/models/findingtext.py @@ -6,6 +6,14 @@ class FindingText(models.Model): """Specific findings details. References General""" + HASH_FIELDS = [ + "id", + "report_id", + "finding_ref_number", + "contains_chart_or_table", + "finding_text", + ] + report_id = models.ForeignKey( "General", help_text=REPORT_ID_FK_HELP_TEXT, @@ -25,3 +33,8 @@ class FindingText(models.Model): "Content of the finding text", help_text=docs.text_findingstext, ) + hash = models.CharField( + help_text="A hash of the row", + blank=True, + null=True, + ) diff --git a/backend/dissemination/models/general.py b/backend/dissemination/models/general.py index d5efcd71ae..2b79153701 100644 --- a/backend/dissemination/models/general.py +++ b/backend/dissemination/models/general.py @@ -16,6 +16,75 @@ class General(models.Model): # ] + # NOTE: This list was pulled from the `summary_reports.py` file. It is the list of fields we + # export as part of the SF-SAC XLSX download. 
+ HASH_FIELDS = [ + "agencies_with_prior_findings", + "audit_period_covered", + "audit_type", + "audit_year", + "auditee_address_line_1", + "auditee_certified_date", + "auditee_certify_name", + "auditee_certify_title", + "auditee_city", + "auditee_contact_name", + "auditee_contact_title", + "auditee_ein", + "auditee_email", + "auditee_name", + "auditee_phone", + "auditee_state", + "auditee_uei", + "auditee_zip", + "auditor_address_line_1", + "auditor_certified_date", + "auditor_certify_name", + "auditor_certify_title", + "auditor_city", + "auditor_contact_name", + "auditor_contact_title", + "auditor_country", + "auditor_ein", + "auditor_email", + "auditor_firm_name", + "auditor_foreign_address", + "auditor_phone", + "auditor_state", + "auditor_zip", + "cognizant_agency", + "data_source", + "dollar_threshold", + "entity_type", + "fac_accepted_date", + "fy_end_date", + "fy_start_date", + "gaap_results", + "is_additional_ueis", + "is_aicpa_audit_guide_included", + "is_going_concern_included", + "is_internal_control_deficiency_disclosed", + "is_internal_control_material_weakness_disclosed", + "is_low_risk_auditee", + "is_material_noncompliance_disclosed", + "is_public", + "is_sp_framework_required", + "number_months", + "oversight_agency", + "report_id", + "sp_framework_basis", + "sp_framework_opinions", + "total_amount_expended", + "type_audit_code", + # 20250912 MCJ: Because of the off-by-one issues in timezones, it might be best + # to leave these out of the hash until those issues are resolved. Or, figure out what is + # going on that the hashing is happening before the timezone issue, because (somehow) + # the hash is being computed before the data changes and hits the dissem tables. 
+ # "date_created", + # "ready_for_certification_date", + # "submitted_date", + ] + report_id = models.TextField( "Report ID", help_text=REPORT_ID_FK_HELP_TEXT, @@ -256,6 +325,11 @@ class General(models.Model): default=0, help_text="Version counter of how many times this SAC was resubmitted.", ) + hash = models.CharField( + help_text="A hash of the row", + blank=True, + null=True, + ) class Meta: unique_together = (("report_id",),) diff --git a/backend/dissemination/models/note.py b/backend/dissemination/models/note.py index 84830ec087..22a8ec0a4b 100644 --- a/backend/dissemination/models/note.py +++ b/backend/dissemination/models/note.py @@ -6,6 +6,17 @@ class Note(models.Model): """Note to Schedule of Expenditures of Federal Awards (SEFA)""" + HASH_FIELDS = [ + "id", + "report_id", + "note_title", + "accounting_policies", + "rate_explained", + "is_minimis_rate_used", + "content", + "contains_chart_or_table", + ] + accounting_policies = models.TextField( "A description of the significant accounting policies used in preparing the SEFA (2 CFR 200.510(b)(6))", ) @@ -24,3 +35,8 @@ class Note(models.Model): "Indicates whether or not the text contained charts or tables that could not be entered due to formatting restrictions", help_text=docs.charts_tables_note, ) + hash = models.CharField( + help_text="A hash of the row", + blank=True, + null=True, + ) diff --git a/backend/dissemination/models/passthrough.py b/backend/dissemination/models/passthrough.py index 306bb596b6..e1ccdef729 100644 --- a/backend/dissemination/models/passthrough.py +++ b/backend/dissemination/models/passthrough.py @@ -6,6 +6,13 @@ class Passthrough(models.Model): """The pass-through entity information, when it is not a direct federal award""" + HASH_FIELDS = [ + "report_id", + "award_reference", + "passthrough_name", + "passthrough_id", + ] + award_reference = models.TextField( "Order that the award line was reported", ) @@ -24,3 +31,8 @@ class Passthrough(models.Model): "Name of Pass-through 
Entity", help_text=docs.passthrough_name, ) + hash = models.CharField( + help_text="A hash of the row", + blank=True, + null=True, + ) diff --git a/backend/dissemination/models/resubmission.py b/backend/dissemination/models/resubmission.py index a95e2b53e7..f69e82ade5 100644 --- a/backend/dissemination/models/resubmission.py +++ b/backend/dissemination/models/resubmission.py @@ -12,6 +12,14 @@ class Resubmission(models.Model): and next versions, if they exist. No default values - all fields are assumed to be filled or NULL. """ + HASH_FIELDS = [ + "report_id", + "version", + "status", + "previous_report_id", + "next_report_id", + ] + # Foreign key links to all the other parts of the record. Unique in this table. report_id = models.ForeignKey( "General", @@ -44,10 +52,17 @@ class Resubmission(models.Model): # help_text=docs.next_report_id, # "The report_id of the next version. Points up the chain from a deprecated record." null=True, ) + hash = models.CharField( + help_text="A hash of the row", + blank=True, + null=True, + ) # Eventually: # resubmission_justification. Either a TextField provided by the user, or a CharField with choices for predetermined justifications. # changed_fields. A TextField with a string of comma separated field names. i.e. "one_field, two_field, red_field, blue_field". 
def __str__(self): - return f"report_id:{self.report_id} Version:{self.resubmission_version}, Status:{self.resubmission_status}" + return ( + f"report_id:{self.report_id} Version:{self.version}, Status:{self.status}" + ) diff --git a/backend/dissemination/models/secondaryauditor.py b/backend/dissemination/models/secondaryauditor.py index cb20bf9986..2c74fb5017 100644 --- a/backend/dissemination/models/secondaryauditor.py +++ b/backend/dissemination/models/secondaryauditor.py @@ -4,6 +4,20 @@ class SecondaryAuditor(models.Model): + HASH_FIELDS = [ + "report_id", + "auditor_name", + "auditor_ein", + "address_street", + "address_city", + "address_state", + "address_zipcode", + "contact_name", + "contact_title", + "contact_email", + "contact_phone", + ] + address_city = models.TextField( "CPA City", help_text=docs.auditor_city, @@ -50,3 +64,8 @@ class SecondaryAuditor(models.Model): to_field="report_id", db_column="report_id", ) + hash = models.CharField( + help_text="A hash of the row", + blank=True, + null=True, + ) diff --git a/backend/dissemination/summary_reports.py b/backend/dissemination/summary_reports.py index dc49394e13..2c42d82f57 100644 --- a/backend/dissemination/summary_reports.py +++ b/backend/dissemination/summary_reports.py @@ -257,6 +257,7 @@ "contact_email", "contact_phone", ], + # "resubmission": ["report_id", "version", "status", "previous_report_id", "next_report_id"], } restricted_model_names = ["captext", "findingtext", "note"] diff --git a/backend/util/hash_compare/compare.py b/backend/util/hash_compare/compare.py new file mode 100644 index 0000000000..d410731c90 --- /dev/null +++ b/backend/util/hash_compare/compare.py @@ -0,0 +1,284 @@ +from requests import get +import os +from hashlib import sha1 +import logging +from datetime import datetime +import sys + +logger = logging.getLogger(__name__) +logging.basicConfig(encoding="utf-8", level=logging.INFO) + +field_name_ordered = { + "general": [ + "agencies_with_prior_findings", + 
"audit_period_covered", + "audit_type", + "audit_year", + "auditee_address_line_1", + "auditee_certified_date", + "auditee_certify_name", + "auditee_certify_title", + "auditee_city", + "auditee_contact_name", + "auditee_contact_title", + "auditee_ein", + "auditee_email", + "auditee_name", + "auditee_phone", + "auditee_state", + "auditee_uei", + "auditee_zip", + "auditor_address_line_1", + "auditor_certified_date", + "auditor_certify_name", + "auditor_certify_title", + "auditor_city", + "auditor_contact_name", + "auditor_contact_title", + "auditor_country", + "auditor_ein", + "auditor_email", + "auditor_firm_name", + "auditor_foreign_address", + "auditor_phone", + "auditor_state", + "auditor_zip", + "cognizant_agency", + "data_source", + "dollar_threshold", + "entity_type", + "fac_accepted_date", + "fy_end_date", + "fy_start_date", + "gaap_results", + "is_additional_ueis", + "is_aicpa_audit_guide_included", + "is_going_concern_included", + "is_internal_control_deficiency_disclosed", + "is_internal_control_material_weakness_disclosed", + "is_low_risk_auditee", + "is_material_noncompliance_disclosed", + "is_public", + "is_sp_framework_required", + "number_months", + "oversight_agency", + "report_id", + "sp_framework_basis", + "sp_framework_opinions", + "total_amount_expended", + "type_audit_code", + # 20250912 MCJ: Because of the off-by-one issues in timezones, it might be best + # to leave these out of the hash until those issues are resolved. Or, figure out what is + # going on that the hashing is happening before the timezone issue, because (somehow) + # the hash is being computed before the data changes and hits the dissem tables. 
+ # "date_created", + # "ready_for_certification_date", + # "submitted_date", + ], + "federal_awards": [ + "report_id", + "award_reference", + "federal_agency_prefix", + "federal_award_extension", + "aln", + "findings_count", + "additional_award_identification", + "federal_program_name", + "amount_expended", + "federal_program_total", + "cluster_name", + "state_cluster_name", + "other_cluster_name", + "cluster_total", + "is_direct", + "is_passthrough_award", + "passthrough_amount", + "is_major", + "audit_report_type", + "is_loan", + "loan_balance", + ], + "findings": [ + "report_id", + "federal_agency_prefix", + "federal_award_extension", + "aln", + "award_reference", + "reference_number", + "type_requirement", + "is_modified_opinion", + "is_other_findings", + "is_material_weakness", + "is_significant_deficiency", + "is_other_matters", + "is_questioned_costs", + "is_repeat_finding", + "prior_finding_ref_numbers", + ], + "findings_text": [ + "id", + "report_id", + "finding_ref_number", + "contains_chart_or_table", + "finding_text", + ], + "notes_to_sefa": [ + "id", + "report_id", + "note_title", + "accounting_policies", + "rate_explained", + "is_minimis_rate_used", + "content", + "contains_chart_or_table", + ], + "corrective_action_plans": [ + "report_id", + "finding_ref_number", + "planned_action", + "contains_chart_or_table", + ], + "additional_eins": ["report_id", "additional_ein"], + "additional_ueis": ["report_id", "additional_uei"], + "passthrough": [ + "report_id", + "award_reference", + "passthrough_name", + "passthrough_id", + ], + "secondary_auditors": [ + "report_id", + "auditor_name", + "auditor_ein", + "address_street", + "address_city", + "address_state", + "address_zipcode", + "contact_name", + "contact_title", + "contact_email", + "contact_phone", + ], +} + +from dateutil.parser import parse + + +def is_date(string, fuzzy=False): + """ + Return whether the string can be interpreted as a date. 
+ + :param string: str, string to check for date + :param fuzzy: bool, ignore unknown tokens in string if True + """ + try: + parse(string, fuzzy=fuzzy) + return True + except Exception: + return False + + +def convert_to_string(o): + if isinstance(o, str) and len(o) == 10 and is_date(o): + return f"{o}" + if o is None: + return "" + else: + return f"{o}" + + +# Ideally, this code would be *identical* to the code used inside of the FAC. +# It should be published, at first as part of an ADR. +def hash_dissemination_object(endpoint, d): + # Given a hash, alpha sort the keys. We do this by taking + # the object to a list of tuples, and then sorting + # the resulting list on the first element of the tuple. + # + # See https://stackoverflow.com/a/22003440 + # for reference. It isn't obvious how to do this well, and in particular, + # while leaving the JSON object keys out of the hash. + + # -1. Get the fields we're going to hash from the object + fields_to_hash = field_name_ordered[endpoint] + + # 1. Dictionary to tuples + tupes = list(d.items()) + # 2. Tuples sorted by key + sorted_tupes = sorted(tupes, key=lambda k: k[0]) + # 2b. Get rid of fields that we're not hashing + filtered_sorted = list(filter(lambda t: t[0] in fields_to_hash, sorted_tupes)) + # logger.info(filtered_sorted) + # logger.info(list(map(lambda p: p[0], filtered_sorted))) + + # 3. Strip the keys + # Why strip the keys? We don't want our field names to impact + # the hashing value. We want to make sure the values in the object, in a consistent sort + # order, are what get hashed. If we change field names, yes, the hash will change. But + # our object field names are very consistent. + # It is unclear if we're going to get consistent, cross-language hashing here. + # It depends on how Python chooses to reprseent values as strings. If we don't quite get this right + # the first time, it will have to be improved, and the full dataset re-disseminated. 
+ # p[0] is the key, p[1] is the value in the tuple list. + # Strings must be encoded to bytes before hashing. + just_values = list(map(lambda p: convert_to_string(p[1]), filtered_sorted)) + # 4. Append the values with no spaces. + # logger.info(f"just_values: {just_values}") + smooshed = "".join(just_values).strip().encode("ascii", "ignore") + # logger.info(f"smooshed: {smooshed}") + # This is now hashable. Run a SHA1. + shaobj = sha1() + shaobj.update(smooshed) + digest = shaobj.hexdigest() + # logger.info(f"[SHA] {digest}") + + return (digest, smooshed) + + +import time + + +def main(): + headers = { + # THE MAGIC AUTH BEARER STRING... Use the cypress local testing values here. + "authorization": "Bearer " + os.getenv("CYPRESS_API_GOV_JWT"), + "x-api-key": os.getenv("CYPRESS_API_GOV_USER_ID"), + "accept_profile": "api_v1_1_0", + } + + if len(sys.argv) > 1: + reports = [sys.argv[1]] + print(f"Checking report: {reports}") + else: + url = f"http://localhost:3000/general?hash=neq.NOHASH&hash=not.is.null" + res = get(url, headers=headers) + reports = [o["report_id"] for o in res.json()] + + for report_id in reports: + # logger.info(f"Checking {report_id}") + for ep in ["general", "federal_awards"]: + url = f"http://localhost:3000/{ep}?report_id=eq.{report_id}" + # logger.info(f"Query: {url}") + res = get(url, headers=headers) + objs = res.json() + # logger.info(f"Found {len(objs)} objects for {ep}:{report_id}") + for d in objs: + # Save the existing hash + current_hash = d["hash"] + (computed_hash, smooshed) = hash_dissemination_object(ep, d) + rid = d["report_id"] + if current_hash == computed_hash: + # print(f"SAME {rid}") + pass + else: + print( + f"DIFFERENT {ep} {rid} dissem {current_hash} computed {computed_hash}" + ) + logger.info(f"{d['report_id']}: {computed_hash} {smooshed}") + print(smooshed) + + +if __name__ in "__main__": + main() + +# b'66,47,15,11annualsingle-audit2016P.O. 
BOX 5752017-09-29SOUTHWEST WETLANDS INTERPRETIVE ASSOCIATIONADMINISTRATIVE OFFICERIMPERIAL BEACHDEBRA CAREYADMINISTRATIVE OFFICER953488027SWIA_DCAREY@ATT.NETSOUTHWEST WETLANDS INTERPRETIVE ASSOCIATION6195750550CAGSA_MIGRATION919332170 SOUTH EL CAMINO REAL, STE. 2132017-09-29ROLLIE MUNGERPRESIDENTOCEANSIDEROLLIE MUNGERPRESIDENTUSA473342732ROLLIE@ROLLIEMUNGERCPA.COMMUNGER & COMPANY, CPAS7607308020CA92054CENSUS2024-06-22750000non-profit2017-09-282016-12-312016-01-01unmodified_opinionNoYesNoNoNoYesNoTrue112017-09-292016-12-CENSUS-00002337332017-09-28895472UG' +# b'66,47,15,11annualsingle-audit2016P.O. BOX 5752017-09-29SOUTHWEST WETLANDS INTERPRETIVE ASSOCIATIONADMINISTRATIVE OFFICERIMPERIAL BEACHDEBRA CAREYADMINISTRATIVE OFFICER953488027SWIA_DCAREY@ATT.NETSOUTHWEST WETLANDS INTERPRETIVE ASSOCIATION6195750550CAGSA_MIGRATION919332170 SOUTH EL CAMINO REAL, STE. 2132017-09-29ROLLIE MUNGERPRESIDENTOCEANSIDEROLLIE MUNGERPRESIDENTUSA473342732ROLLIE@ROLLIEMUNGERCPA.COMMUNGER & COMPANY, CPAS7607308020CA92054CENSUS2024-06-22750000non-profit2017-09-282016-12-312016-01-01unmodified_opinionNoYesNoNoNoYesNoTrue112017-09-282016-12-CENSUS-00002337332017-09-28895472UG' diff --git a/docs/architecture/decisions/0046-signing-data.md b/docs/architecture/decisions/0046-signing-data.md new file mode 100644 index 0000000000..8af2a81c3a --- /dev/null +++ b/docs/architecture/decisions/0046-signing-data.md @@ -0,0 +1,99 @@ +# 46. Digitally signing data + +Date: 2025-09-12 + +## Status + +Accepted + +## Areas of impact + +* Compliance +* Engineering +* Policy + +## Related resources + +* https://en.wikipedia.org/wiki/Hash_function +* https://www.codecademy.com/resources/blog/what-is-hashing + +## Context + +The FAC holds nearly $9T in audit records and financial history of federal spending. However, we do not sign, or hash, any of this data. While we trust ourselves and our security processes (e.g. NIST controls, encryption at rest, etc.), this is still a problem. 
+
+Although it is difficult to engineer a system that is robust against changes made by owners of the system, the FAC has the benefit of all of its data being public. Therefore, if we cryptographically sign our data, we are asserting (mathematically) the state of the data at a single point in time. By publishing those signatures, others can verify that the data they have matches what it was at some point in the past. And, if the data ever changes, one of two things will be true:
+
+1. The data changes, and the signature does not. This means the data was tampered with or changed without a new signature being calculated. It could be an honest mistake, or it could be a sign that data in the FAC was tampered with.
+2. The data changes, and the signature changes. This could be because the data legitimately changed, and the signature had to be recalculated. Or, it could have been tampered with, and the signature updated to make the change look legitimate.
+
+In the case of #1, consumers of the data are able to compare the data they downloaded in the past with the new data, and determine exactly what changed. The fact that the signatures did *not* change serves as a red flag, and encourages them to look closely at the contents of (say) a row of data or a PDF report.
+
+In the case of #2, the FAC can easily post updates publicly when all of the signatures are going to change. For example, if a new field is added to the data, the signatures will change. In this example, it should be the case that the *prior* fields would still hash to the same value, meaning that consumers of the data could determine that nothing in their prior downloads changed. They could, in a word, trust the new signatures.
+
+However, it could also be that (in the case of #2), there were changes to the data such that consumers of our data see things that no longer align with the past record. In this case, they have a path to enquire. 
There are many reasons this might happen; for example, we know we have "off-by-one" errors in some of our dates (see fac.gov/data). When this is fixed, those signatures will change. But regardless of the reason, the change of the signatures is a clear indicator that *something* of import has changed.
+
+Fundamentally, this is basic data security and integrity work. Further, it provides a way for downstream users of the data to quickly and reliably assess the correctness of their own data downloads. It is a light lift, engineering-wise, with powerful and positive impacts to the data integrity and quality in the FAC.
+
+## Decision
+
+1. Add row-level hashing to all internal and external tables in the FAC.
+    1. Specifically, to `SingleAuditChecklist` and `SingleAuditReportFile` internally, and all `dissemination_` tables externally.
+2. Add a hash of the Single Audit Report PDF table (internally). Include that hash in the `general` table in dissemination.
+
+The algorithm used is (as of this draft):
+
+1. Take a list of fields to compute over/hash from the data object (e.g. a `General` object).
+2. We convert the object to a dictionary.
+3. Convert the dictionary (of the shape `{"key": "value"}`) to a list of tuples of the shape `[("key", "value")]`.
+4. Sort the list of tuples alphabetically by the key (the 0th element of the tuples).
+5. Filter the list of tuples so we only keep values associated with hashable fields.
+6. Get rid of the field names, so we have a list of only the values.
+7. Combine all values into a single string without adding spaces. Any existing spaces in the data are left as-is. Strip any whitespace from the beginning and end of the line (including newlines, etc.). Convert that to a UTF-8-encoded byte array.
+8. Run a SHA1 on the resulting byte array.
+9. Return the hex digest of the SHA1.
+
+```
+def hash_dissemination_object(obj):
+    # Given a hash, alpha sort the keys. 
We do this by taking
+    # the object to a list of tuples, and then sorting
+    # the resulting list on the first element of the tuple.
+    #
+    # See https://stackoverflow.com/a/22003440
+    # for reference. It isn't obvious how to do this well, and in particular,
+    # while leaving the JSON object keys out of the hash.
+
+    # 1. Get the fields we're going to hash from the object
+    fields_to_hash = obj.HASH_FIELDS
+    # 2. We are given a Django object. Convert it to a dictionary.
+    d = model_to_dict(obj)
+    # 3. Dictionary to tuples
+    tupes = list(d.items())
+    # 4. Tuples sorted by key
+    sorted_tupes = sorted(tupes, key=lambda k: k[0])
+    # 5. Get rid of fields that we're not hashing
+    filtered_sorted = list(filter(lambda t: t[0] in fields_to_hash, sorted_tupes))
+    # 6. Strip the keys
+    # Why strip the keys? We don't want our field names to impact
+    # the hashing value. We want to make sure the values in the object, in a consistent sort
+    # order, are what get hashed. If we change field names, yes, the hash will change. But
+    # our object field names are very consistent.
+    # It is unclear if we're going to get consistent, cross-language hashing here.
+    # It depends on how Python chooses to represent values as strings. If we don't quite get this right
+    # the first time, it will have to be improved, and the full dataset re-disseminated.
+    # p[0] is the key, p[1] is the value in the tuple list.
+    # Strings must be encoded to bytes before hashing.
+    just_values = list(map(lambda p: convert_to_string(p[1]), filtered_sorted))
+    # 7. Append the values with no spaces.
+    smooshed = "".join(just_values).strip().encode("utf-8")
+    # This is now hashable. Run a SHA1.
+    shaobj = sha1()
+    shaobj.update(smooshed)
+    digest = shaobj.hexdigest()
+    return digest
+```
+
+as found in `intake_to_dissemination.py`.
+
+## Consequences
+
+The consequences are all net-positive. 
We will be able to write more validations that assert the consistency of our data, and our users will similarly have more confidence, over time, in the data we publish.