12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273 |
- import polars as pl
- import os
def transform_data(df: pl.DataFrame) -> pl.DataFrame:
    """
    Transform data according to the specified OpenRefine operations.

    The ``Betrag`` column is cast to ``pl.Float32`` and every row whose
    ``Betrag`` is null or exactly ``0.0`` is removed.

    Args:
        df: Input DataFrame with the original structure; it must contain a
            ``Betrag`` column that can be cast to ``pl.Float32``.

    Returns:
        Transformed DataFrame containing only rows with a non-null,
        non-zero ``Betrag``.
    """
    # NOTE(review): Float32 carries only about 7 significant decimal digits,
    # so large values may be rounded by this cast -- confirm that is acceptable
    # for this column, otherwise consider pl.Float64.
    df_cleaned = df.cast({"Betrag": pl.Float32})

    amount = pl.col("Betrag")
    # The null check must be the expression method `is_not_null()`. Writing
    # `amount is not None` is a plain Python identity test on the expression
    # object itself: it is always True and never looks at the column values.
    return df_cleaned.filter(amount.is_not_null() & (amount != 0.0))
def process_file(input_file: str, inplace: bool = True) -> None:
    """
    Read one CSV file, clean it with ``transform_data`` and write it back out.

    Args:
        input_file: Path to the input CSV file.
        inplace: When True (the default) the cleaned data overwrites
            ``input_file``. When False it is written to a sibling path built
            by inserting ``_clean`` before the file extension.

    Any exception raised while reading, transforming or writing is caught
    and reported with ``print``; nothing is re-raised to the caller.
    """
    # Default to overwriting the input; only derive a new name when asked to.
    target_path = input_file
    if not inplace:
        stem, suffix = os.path.splitext(input_file)
        target_path = f"{stem}_clean{suffix}"

    print(f"Processing {input_file}...")
    try:
        raw_frame = pl.read_csv(input_file)
        transform_data(raw_frame).write_csv(target_path)
        print(f"Successfully saved cleaned data to {target_path}")
    except Exception as exc:
        # Best-effort batch step: report the failure and let the caller go on.
        print(f"Error processing {input_file}: {exc!s}")
def main():
    """Run ``process_file`` with its default settings on each listed CSV file."""
    # Fixed set of file names, resolved relative to the current working directory.
    csv_names = (
        "nat.csv",
        "juristische.csv",
        "afd_nat.csv",
        "afd_juristische.csv",
    )

    for csv_name in csv_names:
        process_file(csv_name)


# Only run the batch when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|