modification du clean csv et du makefile

This commit is contained in:
951095 2025-09-19 12:44:06 +02:00
parent 55d4b4ff85
commit eea1bd8c4b
2 changed files with 99 additions and 27 deletions

View File

@@ -36,10 +36,18 @@ import:
@until docker exec -i $(PG_CONTAINER) pg_isready -U $(PG_USER) -d $(PG_DB); do \ @until docker exec -i $(PG_CONTAINER) pg_isready -U $(PG_USER) -d $(PG_DB); do \
sleep 2; \ sleep 2; \
done done
@echo "🧹 Nettoyage des CSV avant import (local -> in-place)..."
@python3 scripts/clean_csv.py
@if [ $$? -ne 0 ]; then echo "❌ Erreurs lors du nettoyage des CSV - corrige les fichiers avant import"; exit 1; fi
@echo "📥 Import des données de démo..." @echo "📥 Import des données de démo..."
@docker exec -i $(PG_CONTAINER) psql -U $(PG_USER) -d $(PG_DB) -f $(IMPORT_SQL) @docker exec -i $(PG_CONTAINER) psql -U $(PG_USER) -d $(PG_DB) -f $(IMPORT_SQL)
@echo "✅ Données importées." @echo "✅ Données importées."
set-dev-admin-pw:
@echo "🔐 Définir mot de passe dev pour admin@ptits-pas.fr (admin123)"
@docker exec -i $(PG_CONTAINER) psql -U $(PG_USER) -d $(PG_DB) -c "UPDATE utilisateurs SET password = crypt('admin123', gen_salt('bf')) WHERE email = 'admin@ptits-pas.fr';"
@echo "✅ Mot de passe admin dev mis à jour."
verify: verify:
@echo "⏳ Attente que Postgres démarre pour vérifier..." @echo "⏳ Attente que Postgres démarre pour vérifier..."
@until docker exec -i $(PG_CONTAINER) pg_isready -U $(PG_USER) -d $(PG_DB); do \ @until docker exec -i $(PG_CONTAINER) pg_isready -U $(PG_USER) -d $(PG_DB); do \

View File

@@ -1,59 +1,123 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
"""scripts/clean_csv.py
Usage:
python3 scripts/clean_csv.py [input_dir] [--out-dir OUT] [--dry-run]
This script cleans CSV files (trim, remove 'NULL', fix column count) and
performs simple validations (UUID-looking columns, date columns).
"""
import csv import csv
import sys import sys
from pathlib import Path from pathlib import Path
from datetime import datetime
import argparse
import re
def clean_csv_file(file_path: Path): UUID_RE = re.compile(r'^[0-9a-fA-F-]{36}$')
"""
Nettoie un CSV directement en place :
- remplace "NULL" par "" def is_uuid(s: str) -> bool:
- supprime les espaces parasites return bool(UUID_RE.match(s))
- force le même nombre de colonnes que l'en-tête
"""
def looks_like_date(s: str) -> bool:
if not s:
return False
# try ISO-like detection
try:
datetime.fromisoformat(s)
return True
except Exception:
return False
def clean_csv_file(file_path: Path, out_path: Path = None, dry_run: bool = False):
cleaned_rows = [] cleaned_rows = []
errors = []
with open(file_path, "r", encoding="utf-8-sig", newline="") as infile: with file_path.open("r", encoding="utf-8-sig", newline="") as infile:
reader = csv.reader(infile) reader = csv.reader(infile)
try: try:
header = next(reader) header = next(reader)
except StopIteration: except StopIteration:
print(f"[⚠️] Fichier vide : {file_path}") return {'file': str(file_path), 'error': 'empty', 'errors': []}
return
nb_cols = len(header) nb_cols = len(header)
cleaned_rows.append([h.strip() for h in header]) cleaned_rows.append([h.strip() for h in header])
# Heuristics for validations
uuid_cols = [i for i, h in enumerate(header) if h.lower() == 'id' or h.lower().endswith('_id')]
date_cols = [i for i, h in enumerate(header) if any(k in h.lower() for k in ('date', '_le', '_at'))]
for i, row in enumerate(reader, start=2): for i, row in enumerate(reader, start=2):
# Ajuste le nombre de colonnes
if len(row) < nb_cols: if len(row) < nb_cols:
row.extend([""] * (nb_cols - len(row))) row.extend([""] * (nb_cols - len(row)))
elif len(row) > nb_cols: elif len(row) > nb_cols:
row = row[:nb_cols] row = row[:nb_cols]
# Nettoyage cellule par cellule # Nettoyage cellule par cellule
row = [cell.strip().replace("NULL", "") for cell in row] row = [cell.strip().replace('NULL', '') for cell in row]
# Validations simples
for ci in uuid_cols:
if ci < len(row) and row[ci]:
if not is_uuid(row[ci]):
errors.append(f"Line {i}: column {header[ci]} not UUID-like: {row[ci]!r}")
for ci in date_cols:
if ci < len(row) and row[ci]:
if not looks_like_date(row[ci]):
errors.append(f"Line {i}: column {header[ci]} not ISO-like date: {row[ci]!r}")
cleaned_rows.append(row) cleaned_rows.append(row)
# Réécriture dans le même fichier # Write output if not dry-run
with open(file_path, "w", encoding="utf-8", newline="") as outfile: if not dry_run:
target = out_path if out_path else file_path
with target.open('w', encoding='utf-8', newline='') as outfile:
writer = csv.writer(outfile) writer = csv.writer(outfile)
writer.writerows(cleaned_rows) writer.writerows(cleaned_rows)
print(f"[✔] Nettoyé : {file_path}") return {'file': str(file_path), 'errors': errors}
def main(): def main():
if len(sys.argv) > 1: parser = argparse.ArgumentParser()
base_dir = Path(sys.argv[1]) parser.add_argument('input_dir', nargs='?', default='bdd/data_test')
parser.add_argument('--out-dir', '-o', help='Optional output directory for cleaned files')
parser.add_argument('--dry-run', action='store_true', help='Do not write files, just report')
args = parser.parse_args()
base_dir = Path(args.input_dir)
out_dir = Path(args.out_dir) if args.out_dir else None
if out_dir and not out_dir.exists():
out_dir.mkdir(parents=True)
results = []
for file_path in sorted(base_dir.glob('*.csv')):
out_path = out_dir / file_path.name if out_dir else None
res = clean_csv_file(file_path, out_path, args.dry_run)
results.append(res)
# Report
any_errors = False
for r in results:
if 'error' in r:
print(f"[WARN] {r['file']}: {r['error']}")
if r.get('errors'):
any_errors = True
print(f"[ERR] {r['file']} ->")
for e in r['errors'][:20]:
print(' -', e)
else: else:
base_dir = Path("bdd/data_test") print(f"[OK] {r['file']}")
print(f"🔎 Nettoyage des CSV dans : {base_dir}") if any_errors:
for file_path in base_dir.glob("*.csv"): print('\nSome files have validation issues. Fix them or run with --dry-run to inspect.')
clean_csv_file(file_path) sys.exit(2)
else:
print(f"✅ Terminé. Les fichiers CSV ont été écrasés (nettoyés en place).") print('\nAll files cleaned successfully.')
if __name__ == "__main__": if __name__ == '__main__':
main() main()