scripts-traitement-sircom/12-verify_data_integrity.py at main · Alexmacapple/scripts-traitement-sircom · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Script de vérification de l'intégrité des données
Compare le fichier Excel source avec le CSV final
pour s'assurer qu'aucune ligne n'a été décalée
"""

import pandas as pd
import sys

def _normalize_id(value):
    """Return the canonical string form of a record ID.

    Spreadsheet readers frequently hand back whole-number IDs as floats
    (``123.0``). Those are rendered without the fractional part so that the
    Excel side and the CSV side produce the same key. Non-integral floats and
    every other value are stringified unchanged (no silent truncation).
    """
    if isinstance(value, int):
        return str(int(value))
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def _truncate(text, width=30):
    """Shorten *text* to at most *width* characters, ending with '...' if cut."""
    return text[:width - 3] + '...' if len(text) > width else text


def _load_excel_records(df_excel):
    """Index the Excel rows by normalized ID.

    Columns are addressed by position: F (index 5) = ID, G (6) = product,
    H (7) = company, Y (24) = photo. Rows whose ID is missing, empty or
    '#N/A' are skipped. If the same ID appears several times, the last row
    wins.

    Returns:
        dict mapping ID string -> dict with keys 'entreprise', 'produit',
        'image_source' and 'position_excel' (sheet line number, the header
        being line 1).
    """
    columns = df_excel.columns
    col_id = columns[5]
    col_produit = columns[6]
    col_entreprise = columns[7]
    col_image = columns[24]

    def cell(row, column):
        value = row[column]
        return str(value) if pd.notna(value) else '#N/A'

    records = {}
    # enumerate() gives the physical row position even if the index is not
    # a plain 0..n-1 range.
    for position, (_, row) in enumerate(df_excel.iterrows()):
        id_val = row[col_id]
        if pd.isna(id_val) or str(id_val) in ('#N/A', ''):
            continue
        records[_normalize_id(id_val)] = {
            'entreprise': cell(row, col_entreprise),
            'produit': cell(row, col_produit),
            'image_source': cell(row, col_image),
            # +2: one for 1-based numbering, one for the header line, so the
            # value is directly comparable with the CSV line numbers below.
            'position_excel': position + 2,
        }
    return records


def verify_data_integrity(excel_file='Sircom.xlsx',
                          csv_file='9-final-sircom-indesign-utf16.csv'):
    """Check that the final CSV is consistent with the Excel source.

    For every CSV row the function verifies that its ID exists in the Excel
    source and that its ``imageid`` equals ``dossier-<normalized id>.jpg``.
    Rows whose line number differs from the Excel one are reported as
    re-ordered (a warning, not an error). A report is printed to stdout.

    Args:
        excel_file: Path of the Excel source workbook.
        csv_file: Path of the UTF-16 encoded final CSV.

    Returns:
        True when no error was detected, False when at least one error was
        found or when the files could not be read / processed.
    """
    print('=== VÉRIFICATION DE LA COHÉRENCE DES DONNÉES ===')
    print()

    errors = []
    warnings = []

    try:
        # 1. Read the Excel source.
        df_excel = pd.read_excel(excel_file)
        print(f'✅ Excel source lu : {len(df_excel)} lignes (hors en-tête)')

        # 2. Read the final CSV.
        df_csv = pd.read_csv(csv_file, encoding='utf-16')
        print(f'✅ CSV final lu : {len(df_csv)} lignes (hors en-tête)')
        print()

        # 3. Build the lookup tables used for the comparison.
        excel_data = _load_excel_records(df_excel)
        # CSV IDs go through the same normalization as the Excel ones so a
        # numeric column read as float ("123.0") still matches "123".
        csv_id_series = df_csv['f_id'].map(_normalize_id)
        csv_ids = set(csv_id_series)

        print('📊 TABLEAU DE VÉRIFICATION DES CORRESPONDANCES :')
        print('='*120)
        print(f"{'Pos CSV':<8} {'ID':<10} {'Image ID':<25} {'Entreprise':<30} {'Produit':<30} {'Status':<15}")
        print('-'*120)

        # 4. Check every CSV row.
        for position, (_, row) in enumerate(df_csv.iterrows()):
            line_no = position + 2  # file line number, header is line 1
            csv_id = _normalize_id(row['f_id'])
            csv_image = str(row['imageid'])
            csv_entreprise = str(row['h_entrepri']) if 'h_entrepri' in row else '#N/A'
            csv_produit = str(row['g_denomina']) if 'g_denomina' in row else '#N/A'

            entreprise_display = _truncate(csv_entreprise)
            produit_display = _truncate(csv_produit)

            excel_info = excel_data.get(csv_id)
            if excel_info is None:
                status = '❌ ID ABSENT'
                errors.append(f"Ligne {line_no}: ID {csv_id} non trouvé dans Excel source")
            else:
                # IDs may be alphanumeric: the image name uses the lowercase
                # ID with dots and spaces removed.
                normalized_id = csv_id.lower().replace('.', '').replace(' ', '')
                expected_image = f"dossier-{normalized_id}.jpg"
                if csv_image != expected_image:
                    status = '❌ IMG ERR'
                    errors.append(f"Ligne {line_no}: Image attendue {expected_image}, trouvée {csv_image}")
                elif excel_info['position_excel'] != line_no:
                    # Row was moved by sorting; both positions are reported
                    # as file line numbers.
                    status = '🔄 TRI OK'
                    warnings.append(f"ID {csv_id} déplacé de ligne {excel_info['position_excel']} à {line_no}")
                else:
                    status = '✅ OK'

            print(f"{line_no:<8} {csv_id:<10} {csv_image:<25} {entreprise_display:<30} {produit_display:<30} {status}")

        print()
        print('='*120)
        print('📈 RÉSUMÉ DE LA VÉRIFICATION :')
        print()

        # Statistics.
        print("📊 Statistiques :")
        print(f"  - Lignes dans Excel source : {len(excel_data)}")
        print(f"  - Lignes dans CSV final : {len(df_csv)}")
        print(f"  - Différence : {len(excel_data) - len(df_csv)} lignes")
        print()

        # IDs present in Excel but missing from the CSV.
        missing_ids = set(excel_data) - csv_ids
        if missing_ids:
            print("⚠️  IDs présents dans Excel mais absents du CSV final :")
            for mid in sorted(missing_ids):
                print(f"    - ID {mid}: {excel_data[mid]['entreprise'][:50]}")

        print()

        # Final result (at most 5 errors are listed).
        if errors:
            print(f"❌ {len(errors)} ERREURS DÉTECTÉES :")
            for err in errors[:5]:
                print(f"    {err}")
            if len(errors) > 5:
                print(f"    ... et {len(errors)-5} autres erreurs")
        else:
            print("✅ AUCUNE ERREUR DÉTECTÉE")

        if warnings:
            print(f"\n🔄 {len(warnings)} lignes ont été réorganisées (tri par région/département)")
            # Show a few examples only.
            for warn in warnings[:3]:
                print(f"    {warn}")
            if len(warnings) > 3:
                print(f"    ... et {len(warnings)-3} autres déplacements")

        print()

        # Spot-check of image/record associations on known alphanumeric IDs.
        print("🖼️  VÉRIFICATION DES ASSOCIATIONS IMAGE/DOSSIER :")
        print()

        verif_samples = [
            ('ara072025-hgv', 'Photo produit HGV', 'HGV'),
            ('ara432025-rochefreres', 'Photo produit Roche Frères', 'Roche Frères'),
            ('corse2a2025-neutralvision', 'Photo produit Neutral Vision', 'Neutral Vision'),
        ]

        for check_id, _expected_source, description in verif_samples:
            if check_id not in excel_data:
                continue
            csv_row = df_csv[csv_id_series == check_id]
            if csv_row.empty:
                continue
            csv_image = csv_row.iloc[0]['imageid']
            print(f"  ID {check_id} ({description}):")
            print(f"    - Image source Excel : {excel_data[check_id]['image_source'][:50]}")
            print(f"    - Image CSV : {csv_image}")
            if csv_image == f"dossier-{check_id}.jpg":
                print("    - ✅ Correct")
            else:
                print("    - ❌ ERREUR")

        print()
        print('='*120)

        if not errors:
            print("🎉 VALIDATION RÉUSSIE - Le CSV est cohérent avec l'Excel source !")
            print("   Les données ont été triées par région mais les associations ID/données sont correctes.")
        else:
            print("⚠️  ATTENTION - Des incohérences ont été détectées !")

    except Exception as e:
        # Top-level boundary of the script: any read/processing failure is
        # reported and turned into a failed verification.
        print(f"❌ Erreur lors de la vérification : {e}")
        return False

    return not errors

if __name__ == "__main__":
    # Process exit status mirrors the verification outcome:
    # 0 when the check succeeded, 1 otherwise.
    exit_code = 0 if verify_data_integrity() else 1
    sys.exit(exit_code)