From 013083176e2796988907ec364e7f713454a74eb6 Mon Sep 17 00:00:00 2001 From: 951095 Date: Tue, 23 Sep 2025 09:39:48 +0200 Subject: [PATCH] script de nettoyage des csv et securisation des variables d'environnement --- docker-compose.dev.yml | 9 +++++---- docker-compose.yml | 10 +++++----- makefile | 5 ++--- scripts/clean_csv.py | 23 ++++++++++++----------- verify.log | 40 ++++++++++++++++++++-------------------- 5 files changed, 44 insertions(+), 43 deletions(-) diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 3f5b8ff..e32d46b 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -9,13 +9,14 @@ services: restart: unless-stopped environment: POSTGRES_USER: ${POSTGRES_USER:-admin} - POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-admin123} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} POSTGRES_DB: ${POSTGRES_DB:-ptitpas_db} ports: - "5433:5432" volumes: - # Scripts de migration (ordre important → sync_enums, init, indexes, checks, triggers, import…) - - ./migrations/00_sync_enums.sql:/docker-entrypoint-initdb.d/00_sync_enums.sql + # Scripts de migration (init first). NOTE: `00_sync_enums.sql` is intentionally + # NOT mounted here because it must be executed after `01_init.sql` (it alters + # existing enum types). The local pipeline applies it explicitly after init. 
- ./migrations/01_init.sql:/docker-entrypoint-initdb.d/01_init.sql - ./migrations/02_indexes.sql:/docker-entrypoint-initdb.d/02_indexes.sql - ./migrations/03_checks.sql:/docker-entrypoint-initdb.d/03_checks.sql @@ -39,7 +40,7 @@ services: restart: unless-stopped environment: PGADMIN_DEFAULT_EMAIL: ${PGADMIN_DEFAULT_EMAIL:-admin@ptits-pas.fr} - PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_DEFAULT_PASSWORD:-admin123} + PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_DEFAULT_PASSWORD} ports: - "8081:80" depends_on: diff --git a/docker-compose.yml b/docker-compose.yml index d6babdc..71e7ebc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,9 +4,9 @@ services: container_name: ynov-postgres restart: unless-stopped environment: - POSTGRES_USER: admin - POSTGRES_PASSWORD: admin123 - POSTGRES_DB: ptitpas_db + POSTGRES_USER: ${POSTGRES_USER:-admin} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + POSTGRES_DB: ${POSTGRES_DB:-ptitpas_db} volumes: - ./migrations/01_init.sql:/docker-entrypoint-initdb.d/01_init.sql - postgres_data:/var/lib/postgresql/data @@ -18,8 +18,8 @@ services: container_name: ynov-pgadmin restart: unless-stopped environment: - PGADMIN_DEFAULT_EMAIL: admin@ptits-pas.fr - PGADMIN_DEFAULT_PASSWORD: admin123 + PGADMIN_DEFAULT_EMAIL: ${PGADMIN_DEFAULT_EMAIL:-admin@ptits-pas.fr} + PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_DEFAULT_PASSWORD} depends_on: - db labels: diff --git a/makefile b/makefile index 0fe435d..447b9f3 100644 --- a/makefile +++ b/makefile @@ -75,12 +75,11 @@ logs: psql: docker exec -it $(PG_CONTAINER) psql -U $(PG_USER) -d $(PG_DB) -stop: - docker compose -f docker-compose.dev.yml down stop: docker compose -f docker-compose.dev.yml down sync-enums: @echo "🔁 Génération de migrations/00_sync_enums.sql depuis les CSV" @python3 scripts/sync_enums.py > migrations/00_sync_enums.sql - @echo "✅ migrations/00_sync_enums.sql générée. Relis le fichier avant de l'appliquer." \ No newline at end of file + @echo "✅ migrations/00_sync_enums.sql générée. 
Relis le fichier avant de l'appliquer." + diff --git a/scripts/clean_csv.py b/scripts/clean_csv.py index 9f1b01a..aa5e30f 100755 --- a/scripts/clean_csv.py +++ b/scripts/clean_csv.py @@ -2,10 +2,11 @@ """scripts/clean_csv.py Usage: - python3 scripts/clean_csv.py [input_dir] [--out-dir OUT] [--dry-run] + python3 scripts/clean_csv.py [input_dir] [--out-dir OUT] [--dry-run] -This script cleans CSV files (trim, remove 'NULL', fix column count) and -performs simple validations (UUID-looking columns, date columns). +Ce script nettoie les fichiers CSV (trim, suppression de 'NULL', correction du +nombre de colonnes) et réalise des validations simples (colonnes ressemblant +à des UUID, colonnes date). """ import csv import sys @@ -46,7 +47,7 @@ def clean_csv_file(file_path: Path, out_path: Path = None, dry_run: bool = False nb_cols = len(header) cleaned_rows.append([h.strip() for h in header]) - # Heuristics for validations + # Heuristiques pour les validations uuid_cols = [i for i, h in enumerate(header) if h.lower() == 'id' or h.lower().endswith('_id')] date_cols = [i for i, h in enumerate(header) if any(k in h.lower() for k in ('date', '_le', '_at'))] @@ -63,15 +64,15 @@ def clean_csv_file(file_path: Path, out_path: Path = None, dry_run: bool = False for ci in uuid_cols: if ci < len(row) and row[ci]: if not is_uuid(row[ci]): - errors.append(f"Line {i}: column {header[ci]} not UUID-like: {row[ci]!r}") + errors.append(f"Ligne {i} : colonne {header[ci]} ne ressemble pas à un UUID : {row[ci]!r}") for ci in date_cols: if ci < len(row) and row[ci]: if not looks_like_date(row[ci]): - errors.append(f"Line {i}: column {header[ci]} not ISO-like date: {row[ci]!r}") + errors.append(f"Ligne {i} : colonne {header[ci]} n'est pas une date ISO-like : {row[ci]!r}") cleaned_rows.append(row) - # Write output if not dry-run + # Écrire la sortie si non --dry-run if not dry_run: target = out_path if out_path else file_path with target.open('w', encoding='utf-8', newline='') as outfile: @@ 
-99,11 +100,11 @@ def main(): res = clean_csv_file(file_path, out_path, args.dry_run) results.append(res) - # Report + # Rapport any_errors = False for r in results: if 'error' in r: - print(f"[WARN] {r['file']}: {r['error']}") + print(f"[AVERT] {r['file']}: {r['error']}") if r.get('errors'): any_errors = True print(f"[ERR] {r['file']} ->") @@ -113,10 +114,10 @@ def main(): print(f"[OK] {r['file']}") if any_errors: - print('\nSome files have validation issues. Fix them or run with --dry-run to inspect.') + print('\nCertains fichiers présentent des problèmes de validation. Corrigez-les ou lancez avec --dry-run pour inspecter.') sys.exit(2) else: - print('\nAll files cleaned successfully.') + print('\nTous les fichiers ont été nettoyés avec succès.') if __name__ == '__main__': diff --git a/verify.log b/verify.log index f859211..d2a220e 100644 --- a/verify.log +++ b/verify.log @@ -6,7 +6,7 @@ executed_at ------------------------------- - 2025-09-19 09:15:01.473058+00 + 2025-09-22 08:30:37.786643+00 (1 row) === 1) Comptes & répartition par rôle ========================== @@ -126,49 +126,49 @@ === 13) Performance : EXPLAIN sur requêtes clés =============== QUERY PLAN ---------------------------------------------------------------------------------------------------------------------------------------------------- - Limit (cost=11.31..11.31 rows=3 width=89) (actual time=0.035..0.036 rows=0 loops=1) - -> Sort (cost=11.31..11.31 rows=3 width=89) (actual time=0.032..0.033 rows=0 loops=1) + Limit (cost=11.31..11.31 rows=3 width=89) (actual time=0.020..0.021 rows=0 loops=1) + -> Sort (cost=11.31..11.31 rows=3 width=89) (actual time=0.019..0.019 rows=0 loops=1) Sort Key: cree_le DESC Sort Method: quicksort Memory: 25kB - -> Bitmap Heap Scan on messages m (cost=4.17..11.28 rows=3 width=89) (actual time=0.023..0.024 rows=0 loops=1) + -> Bitmap Heap Scan on messages m (cost=4.17..11.28 rows=3 width=89) (actual time=0.013..0.014 rows=0 loops=1) Recheck Cond: (id_dossier = 
'dddddddd-dddd-dddd-dddd-dddddddddddd'::uuid) - -> Bitmap Index Scan on idx_messages_id_dossier_cree_le (cost=0.00..4.17 rows=3 width=0) (actual time=0.012..0.013 rows=0 loops=1) + -> Bitmap Index Scan on idx_messages_id_dossier_cree_le (cost=0.00..4.17 rows=3 width=0) (actual time=0.006..0.007 rows=0 loops=1) Index Cond: (id_dossier = 'dddddddd-dddd-dddd-dddd-dddddddddddd'::uuid) - Planning Time: 1.468 ms - Execution Time: 0.521 ms + Planning Time: 0.106 ms + Execution Time: 0.105 ms (10 rows) QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------------------------------- - Index Scan using idx_evenements_id_enfant_date_debut on evenements ev (cost=0.15..8.17 rows=1 width=161) (actual time=0.006..0.007 rows=0 loops=1) + Index Scan using idx_evenements_id_enfant_date_debut on evenements ev (cost=0.15..8.17 rows=1 width=161) (actual time=0.021..0.021 rows=0 loops=1) Index Cond: ((id_enfant = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'::uuid) AND (date_debut >= '2025-01-01 00:00:00+00'::timestamp with time zone)) - Planning Time: 0.117 ms - Execution Time: 0.025 ms + Planning Time: 0.064 ms + Execution Time: 0.032 ms (4 rows) QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------------------------ - Limit (cost=9.52..9.53 rows=2 width=73) (actual time=0.012..0.013 rows=0 loops=1) - -> Sort (cost=9.52..9.53 rows=2 width=73) (actual time=0.011..0.012 rows=0 loops=1) + Limit (cost=9.52..9.53 rows=2 width=73) (actual time=0.018..0.019 rows=0 loops=1) + -> Sort (cost=9.52..9.53 rows=2 width=73) (actual time=0.017..0.018 rows=0 loops=1) Sort Key: cree_le DESC Sort Method: quicksort Memory: 25kB - -> Bitmap Heap Scan on notifications n (cost=4.17..9.51 rows=2 width=73) (actual time=0.007..0.008 rows=0 loops=1) + -> Bitmap Heap Scan on notifications n (cost=4.17..9.51 rows=2 width=73) (actual time=0.014..0.014 
rows=0 loops=1) Recheck Cond: ((id_utilisateur = '33333333-3333-3333-3333-333333333333'::uuid) AND (NOT lu)) - -> Bitmap Index Scan on idx_notifications_user_lu_cree_le (cost=0.00..4.17 rows=2 width=0) (actual time=0.004..0.005 rows=0 loops=1) + -> Bitmap Index Scan on idx_notifications_user_lu_cree_le (cost=0.00..4.17 rows=2 width=0) (actual time=0.012..0.012 rows=0 loops=1) Index Cond: ((id_utilisateur = '33333333-3333-3333-3333-333333333333'::uuid) AND (lu = false)) - Planning Time: 0.087 ms - Execution Time: 0.027 ms + Planning Time: 0.059 ms + Execution Time: 0.141 ms (10 rows) QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------------------------------------------- - Sort (cost=8.18..8.18 rows=1 width=267) (actual time=0.017..0.018 rows=0 loops=1) + Sort (cost=8.18..8.18 rows=1 width=267) (actual time=0.043..0.043 rows=0 loops=1) Sort Key: cree_le DESC Sort Method: quicksort Memory: 25kB - -> Index Scan using idx_dossiers_id_parent_enfant_statut_cree_le on dossiers d (cost=0.15..8.17 rows=1 width=267) (actual time=0.012..0.012 rows=0 loops=1) + -> Index Scan using idx_dossiers_id_parent_enfant_statut_cree_le on dossiers d (cost=0.15..8.17 rows=1 width=267) (actual time=0.038..0.038 rows=0 loops=1) Index Cond: ((id_parent = '33333333-3333-3333-3333-333333333333'::uuid) AND (id_enfant = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'::uuid)) - Planning Time: 0.062 ms - Execution Time: 0.032 ms + Planning Time: 0.103 ms + Execution Time: 0.067 ms (7 rows) === 14) JSONB : exemples de filtrage ===========================