clean.py 1.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273
  1. import polars as pl
  2. import os
  3. def transform_data(df: pl.DataFrame) -> pl.DataFrame:
  4. """
  5. Transform data according to the specified OpenRefine operations
  6. Args:
  7. df: Input DataFrame with the original structure
  8. Returns:
  9. Transformed DataFrame
  10. """
  11. df_cleaned = df.cast({"Betrag": pl.Float32})
  12. # Filter out rows where 'Betrag' is 0 or empty
  13. df_cleaned = df_cleaned.filter(pl.col("Betrag") != 0.0).filter(
  14. pl.col("Betrag") is not None
  15. )
  16. return df_cleaned
  17. def process_file(input_file: str, inplace: bool = True) -> None:
  18. """
  19. Process a single CSV file and save the transformed version
  20. Args:
  21. input_file: Path to the input CSV file
  22. """
  23. # Generate output filename
  24. if inplace:
  25. output_file = input_file
  26. else:
  27. filename, ext = os.path.splitext(input_file)
  28. output_file = f"{filename}_clean{ext}"
  29. print(f"Processing {input_file}...")
  30. try:
  31. # Read the CSV file
  32. df = pl.read_csv(input_file)
  33. # Transform the data
  34. cleaned_df = transform_data(df)
  35. # Save the transformed data
  36. cleaned_df.write_csv(output_file)
  37. print(f"Successfully saved cleaned data to {output_file}")
  38. except Exception as e:
  39. print(f"Error processing {input_file}: {str(e)}")
  40. def main():
  41. # List of files to process
  42. files_to_process = [
  43. "nat.csv",
  44. "juristische.csv",
  45. "afd_nat.csv",
  46. "afd_juristische.csv",
  47. ]
  48. # Process each file
  49. for file in files_to_process:
  50. process_file(file)
  51. if __name__ == "__main__":
  52. main()