12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273 |
- import polars as pl
- import os
def transform_data(df: pl.DataFrame) -> pl.DataFrame:
    """
    Drop rows whose 'Betrag' value is zero or missing.

    The 'Betrag' column is cast to Float32 first so the comparison below is
    numeric.

    Args:
        df: Input DataFrame; expected to contain a 'Betrag' column that can
            be cast to a float.

    Returns:
        DataFrame with 'Betrag' as Float32, keeping only rows whose amount
        is neither null nor 0.
    """
    # NOTE(review): Float32 carries roughly 7 significant digits; if 'Betrag'
    # can hold large amounts, confirm this precision is acceptable.
    df_cleaned = df.cast({"Betrag": pl.Float32})

    betrag = pl.col("Betrag")
    # A Python-level `betrag is not None` is an identity test on the
    # expression object and is always True, so it never looks at the data.
    # The null check has to be expressed with polars' own `is_not_null()`.
    return df_cleaned.filter(betrag.is_not_null() & (betrag != 0.0))
def process_file(input_file: str, inplace: bool = True) -> None:
    """
    Clean one CSV file and write the result back to disk.

    Args:
        input_file: Path to the CSV file to read.
        inplace: When True (the default) the cleaned data overwrites
            ``input_file``; otherwise it is written to a sibling path with
            ``_clean`` inserted before the file extension.
    """
    # Start from the overwrite case and only derive a new name on request.
    output_file = input_file
    if not inplace:
        stem, suffix = os.path.splitext(input_file)
        output_file = f"{stem}_clean{suffix}"

    print(f"Processing {input_file}...")
    try:
        # Read, clean and persist as a single pipeline.
        transform_data(pl.read_csv(input_file)).write_csv(output_file)
        print(f"Successfully saved cleaned data to {output_file}")
    except Exception as err:
        # Failures are reported and swallowed so the caller can carry on
        # with the remaining files.
        print(f"Error processing {input_file}: {err!s}")
def main():
    """Run the cleaning step over the fixed set of CSV files, in order."""
    csv_paths = (
        "nat.csv",
        "juristische.csv",
        "afd_nat.csv",
        "afd_juristische.csv",
    )
    for csv_path in csv_paths:
        # Uses process_file's default, i.e. each file is overwritten in place.
        process_file(csv_path)


# Only run the batch when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|