Modified the clean_csv script and the makefile
This commit is contained in:
parent 55d4b4ff85
commit eea1bd8c4b
makefile | 8
makefile
@@ -36,10 +36,18 @@ import:
 	@until docker exec -i $(PG_CONTAINER) pg_isready -U $(PG_USER) -d $(PG_DB); do \
 		sleep 2; \
 	done
+	@echo "🧹 Nettoyage des CSV avant import (local -> in-place)..."
+	@python3 scripts/clean_csv.py
+	@if [ $$? -ne 0 ]; then echo "❌ Erreurs lors du nettoyage des CSV - corrige les fichiers avant import"; exit 1; fi
 	@echo "📥 Import des données de démo..."
 	@docker exec -i $(PG_CONTAINER) psql -U $(PG_USER) -d $(PG_DB) -f $(IMPORT_SQL)
 	@echo "✅ Données importées."
 
+set-dev-admin-pw:
+	@echo "🔐 Définir mot de passe dev pour admin@ptits-pas.fr (admin123)"
+	@docker exec -i $(PG_CONTAINER) psql -U $(PG_USER) -d $(PG_DB) -c "UPDATE utilisateurs SET password = crypt('admin123', gen_salt('bf')) WHERE email = 'admin@ptits-pas.fr';"
+	@echo "✅ Mot de passe admin dev mis à jour."
+
 verify:
 	@echo "⏳ Attente que Postgres démarre pour vérifier..."
 	@until docker exec -i $(PG_CONTAINER) pg_isready -U $(PG_USER) -d $(PG_DB); do \
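A note on the `import` target above: make runs each recipe line in a separate shell, so the `@if [ $$? -ne 0 ]` line never sees the exit status of the `@python3 scripts/clean_csv.py` line (a fresh shell starts with `$?` at 0). In practice make already aborts when a recipe line exits non-zero, so the guard as committed is inert; a minimal sketch of how the custom message could still fire, folding both steps into one recipe line:

	@python3 scripts/clean_csv.py || { echo "❌ Erreurs lors du nettoyage des CSV - corrige les fichiers avant import"; exit 1; }

Also, the `crypt()` and `gen_salt()` functions used by `set-dev-admin-pw` come from PostgreSQL's pgcrypto extension; if the schema does not already create it, a recipe line along these lines (same variables as above) would need to run first:

	@docker exec -i $(PG_CONTAINER) psql -U $(PG_USER) -d $(PG_DB) -c "CREATE EXTENSION IF NOT EXISTS pgcrypto;"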
scripts/clean_csv.py
@@ -1,59 +1,123 @@
 #!/usr/bin/env python3
+"""scripts/clean_csv.py
+
+Usage:
+    python3 scripts/clean_csv.py [input_dir] [--out-dir OUT] [--dry-run]
+
+This script cleans CSV files (trim, remove 'NULL', fix column count) and
+performs simple validations (UUID-looking columns, date columns).
+"""
 import csv
 import sys
 from pathlib import Path
+from datetime import datetime
+import argparse
+import re
 
-def clean_csv_file(file_path: Path):
-    """
-    Nettoie un CSV directement en place :
-    - remplace "NULL" par ""
-    - supprime les espaces parasites
-    - force le même nombre de colonnes que l'en-tête
-    """
+UUID_RE = re.compile(r'^[0-9a-fA-F-]{36}$')
+
+
+def is_uuid(s: str) -> bool:
+    return bool(UUID_RE.match(s))
+
+
+def looks_like_date(s: str) -> bool:
+    if not s:
+        return False
+    # try ISO-like detection
+    try:
+        datetime.fromisoformat(s)
+        return True
+    except Exception:
+        return False
+
+
+def clean_csv_file(file_path: Path, out_path: Path = None, dry_run: bool = False):
     cleaned_rows = []
+    errors = []
 
-    with open(file_path, "r", encoding="utf-8-sig", newline="") as infile:
+    with file_path.open("r", encoding="utf-8-sig", newline="") as infile:
         reader = csv.reader(infile)
         try:
             header = next(reader)
         except StopIteration:
-            print(f"[⚠️] Fichier vide : {file_path}")
-            return
+            return {'file': str(file_path), 'error': 'empty', 'errors': []}
 
         nb_cols = len(header)
         cleaned_rows.append([h.strip() for h in header])
 
+        # Heuristics for validations
+        uuid_cols = [i for i, h in enumerate(header) if h.lower() == 'id' or h.lower().endswith('_id')]
+        date_cols = [i for i, h in enumerate(header) if any(k in h.lower() for k in ('date', '_le', '_at'))]
+
         for i, row in enumerate(reader, start=2):
-            # Ajuste le nombre de colonnes
             if len(row) < nb_cols:
                 row.extend([""] * (nb_cols - len(row)))
             elif len(row) > nb_cols:
                 row = row[:nb_cols]
 
             # Nettoyage cellule par cellule
-            row = [cell.strip().replace("NULL", "") for cell in row]
+            row = [cell.strip().replace('NULL', '') for cell in row]
+
+            # Validations simples
+            for ci in uuid_cols:
+                if ci < len(row) and row[ci]:
+                    if not is_uuid(row[ci]):
+                        errors.append(f"Line {i}: column {header[ci]} not UUID-like: {row[ci]!r}")
+            for ci in date_cols:
+                if ci < len(row) and row[ci]:
+                    if not looks_like_date(row[ci]):
+                        errors.append(f"Line {i}: column {header[ci]} not ISO-like date: {row[ci]!r}")
 
             cleaned_rows.append(row)
 
-    # Réécriture dans le même fichier
-    with open(file_path, "w", encoding="utf-8", newline="") as outfile:
-        writer = csv.writer(outfile)
-        writer.writerows(cleaned_rows)
+    # Write output if not dry-run
+    if not dry_run:
+        target = out_path if out_path else file_path
+        with target.open('w', encoding='utf-8', newline='') as outfile:
+            writer = csv.writer(outfile)
+            writer.writerows(cleaned_rows)
 
-    print(f"[✔] Nettoyé : {file_path}")
+    return {'file': str(file_path), 'errors': errors}
 
 
 def main():
-    if len(sys.argv) > 1:
-        base_dir = Path(sys.argv[1])
-    else:
-        base_dir = Path("bdd/data_test")
-
-    print(f"🔎 Nettoyage des CSV dans : {base_dir}")
-    for file_path in base_dir.glob("*.csv"):
-        clean_csv_file(file_path)
-
-    print(f"✅ Terminé. Les fichiers CSV ont été écrasés (nettoyés en place).")
+    parser = argparse.ArgumentParser()
+    parser.add_argument('input_dir', nargs='?', default='bdd/data_test')
+    parser.add_argument('--out-dir', '-o', help='Optional output directory for cleaned files')
+    parser.add_argument('--dry-run', action='store_true', help='Do not write files, just report')
+    args = parser.parse_args()
+
+    base_dir = Path(args.input_dir)
+    out_dir = Path(args.out_dir) if args.out_dir else None
+    if out_dir and not out_dir.exists():
+        out_dir.mkdir(parents=True)
+
+    results = []
+    for file_path in sorted(base_dir.glob('*.csv')):
+        out_path = out_dir / file_path.name if out_dir else None
+        res = clean_csv_file(file_path, out_path, args.dry_run)
+        results.append(res)
+
+    # Report
+    any_errors = False
+    for r in results:
+        if 'error' in r:
+            print(f"[WARN] {r['file']}: {r['error']}")
+        if r.get('errors'):
+            any_errors = True
+            print(f"[ERR] {r['file']} ->")
+            for e in r['errors'][:20]:
+                print(' -', e)
+        else:
+            print(f"[OK] {r['file']}")
+
+    if any_errors:
+        print('\nSome files have validation issues. Fix them or run with --dry-run to inspect.')
+        sys.exit(2)
+    else:
+        print('\nAll files cleaned successfully.')
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     main()
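For reference, the new command-line interface defined by the argparse block above can be exercised as follows (the `bdd/data_test` default and the exit code 2 on validation errors both come from the script itself):

	# report validation issues without rewriting anything
	python3 scripts/clean_csv.py --dry-run

	# write cleaned copies to a separate directory instead of in place
	python3 scripts/clean_csv.py bdd/data_test --out-dir /tmp/cleaned

Two caveats worth keeping in mind: `UUID_RE` accepts any 36-character mix of hex digits and dashes, so a stricter pattern such as `^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$` may be preferable; and `cell.strip().replace('NULL', '')` removes the substring NULL anywhere inside a cell, not only cells that are exactly NULL.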