utf8fy 965 B

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. #!/bin/sh
  # Print a two-line usage summary on stderr.
  # The program name is derived from $0 with any leading directory stripped.
  # Always writes to stderr, so it never pollutes stdout; returns the status
  # of the last printf (0 unless stderr is unwritable).
  usage () {
    printf 'Usage: %s FILES...\n' "${0##*/}" >&2
    printf '%s\n' 'Convert all "badly" encoded text files to UTF-8/LF without BOM.' >&2
  }
  # Argument handling.
  #   no arguments -> show usage, exit 1 (nothing to do is treated as an error)
  #   -h           -> show usage, exit 0
  #   --           -> end-of-options marker; drop it so the remaining
  #                   arguments are all treated as file names
  # The exits are unconditional: they must not depend on whether `usage`
  # managed to write its message.
  if [ "$#" -eq 0 ]; then
    usage
    exit 1
  fi
  case $1 in
    -h)
      usage
      exit 0
      ;;
    --)
      shift
      ;;
  esac
  # Replace the contents of file $1 with the output of a filter command.
  # Arguments: $1 - file to rewrite; remaining arguments - the filter command,
  #            run with the file on stdin and a temporary file on stdout.
  # The original is only overwritten after the filter succeeded, so a failing
  # filter leaves the file untouched. The result is copied back with cat
  # (rather than mv) so that the target keeps its inode and permissions.
  # Returns 0 on success, 1 on any failure.
  rewrite_in_place () {
    _rip_file=$1
    shift
    _rip_tmp=$(mktemp) || return 1
    if "$@" <"$_rip_file" >"$_rip_tmp" && cat "$_rip_tmp" >"$_rip_file"; then
      _rip_status=0
    else
      _rip_status=1
    fi
    rm -f -- "$_rip_tmp"
    return "$_rip_status"
  }

  # Succeed if file $1 starts with the UTF-8 byte order mark (EF BB BF).
  # Looking at the bytes directly avoids depending on the wording of the
  # description printed by file(1).
  has_utf8_bom () {
    [ "$(od -An -tx1 -N3 <"$1" | tr -d ' \t\n')" = efbbbf ]
  }

  # Process every remaining argument: skip anything that is not a regular
  # text file, then convert to UTF-8, drop the BOM and drop carriage returns
  # as needed. Each step announces itself on stdout as "FILE: ACTION".
  for i; do
    [ -f "$i" ] || continue

    # `--` keeps file names that start with a dash from being read as options.
    mimetype=$(file -bi -- "$i")
    description=$(file -b -- "$i")

    # Only text/* files are touched.
    case $mimetype in
      text/*) ;;
      *) continue ;;
    esac

    # The MIME string is expected to look like "text/plain; charset=iso-8859-1".
    case $mimetype in
      *charset=*) charset=${mimetype##*charset=} ;;
      *) charset= ;;
    esac

    case $charset in
      utf-8 | us-ascii)
        # Already valid UTF-8 (ASCII is a subset): nothing to convert.
        ;;
      '')
        echo "$i: Unknown charset, not converted" >&2
        ;;
      *)
        echo "$i: Convert to UTF-8"
        # Convert through a temporary file: using the same path as iconv's
        # input and output can truncate the input before it has been read.
        if ! rewrite_in_place "$i" iconv -f "$charset" -t UTF-8; then
          echo "$i: Conversion from $charset failed, file left unchanged" >&2
          continue
        fi
        ;;
    esac

    # Checked after conversion so that a BOM carried over by iconv is
    # removed as well.
    if has_utf8_bom "$i"; then
      echo "$i: Remove BOM"
      # The BOM is exactly 3 bytes, so output starts at byte 4.
      # Equivalent alternative: dd bs=1 skip=3 if=FILE of=TEMP
      rewrite_in_place "$i" tail -c +4
    fi

    # file(1) reports "with CRLF line terminators" for DOS-style text.
    if printf '%s\n' "$description" | grep -q 'CRLF'; then
      echo "$i: Remove CR"
      rewrite_in_place "$i" tr -d '\r'
    fi
  done