#!/bin/sh
# usage: print the command synopsis and a one-line description.
#
# Output goes to stderr so it never mixes with the per-file progress
# messages printed on stdout. Returns the exit status of cat (normally 0),
# which callers rely on when chaining `usage && exit`.
usage () {
  cat <<EOF >&2
Usage: ${0##*/} FILES...
Convert all "badly" encoded text files to UTF-8/LF without BOM.
EOF
}
# ---------------------------------------------------------------------------
# Helpers. All in-place edits go through a temporary file: tools such as
# iconv may truncate their input when asked to write onto the file they are
# still reading.
# ---------------------------------------------------------------------------

# Path of the temporary file currently in use (empty when none).
fip_tmp=''
# Remove a leftover temporary file on any exit path, including signals.
trap '[ -z "${fip_tmp:-}" ] || rm -f -- "$fip_tmp"' EXIT
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM

# filter_in_place FILE CMD [ARGS...]
# Run CMD with FILE on stdin and replace FILE's content with CMD's stdout.
# FILE is only overwritten when CMD succeeds; the content is copied back with
# cat so the original inode, permissions and hard links are preserved.
# Returns 0 on success, 1 on any failure.
filter_in_place () {
  fip_target=$1
  shift
  fip_tmp=$(mktemp) || return 1
  if "$@" < "$fip_target" > "$fip_tmp" && cat "$fip_tmp" > "$fip_target"; then
    fip_rc=0
  else
    fip_rc=1
  fi
  rm -f -- "$fip_tmp"
  fip_tmp=''
  return "$fip_rc"
}

# has_bom FILE: succeed when FILE starts with the UTF-8 BOM (EF BB BF).
# A byte comparison is used instead of parsing the description printed by
# file(1), whose wording differs between versions.
has_bom () {
  [ "$(dd if="$1" bs=3 count=1 2>/dev/null | od -An -tx1 | tr -d ' \n')" = efbbbf ]
}

# has_crlf FILE: succeed when at least one line of FILE ends in CR LF.
has_crlf () {
  LC_ALL=C grep -q -- "$(printf '\r')\$" "$1"
}

# strip_bom FILE: drop the first three bytes (the UTF-8 BOM) of FILE.
# Byte-based, so it does not depend on the current locale.
strip_bom () {
  filter_in_place "$1" tail -c +4
}

# strip_cr FILE: turn CR LF line endings into LF. Only a CR directly before
# the line end is removed, so CR characters elsewhere are left alone.
strip_cr () {
  filter_in_place "$1" sed "s/$(printf '\r')\$//"
}

# ---------------------------------------------------------------------------
# Argument handling: no arguments -> usage and failure; -h -> usage and
# success; a leading -- only marks the end of options.
# ---------------------------------------------------------------------------
[ $# -eq 0 ] && usage && exit 1
[ "$1" = "-h" ] && usage && exit
[ "$1" = "--" ] && shift

# Becomes 1 as soon as any file could not be processed.
failed=0

for i; do
  # Only regular files are candidates.
  [ -f "$i" ] || continue

  # Expected form: "text/plain; charset=iso-8859-1".
  mimetype=$(file -bi -- "$i") || continue
  type=${mimetype%%/*}
  [ "$type" = text ] || continue
  charset=${mimetype##*charset=}

  case $charset in
    utf-8 | us-ascii)
      # Already valid UTF-8 (ASCII is a strict subset): nothing to convert.
      ;;
    *)
      echo "$i: Convert to UTF-8"
      if ! filter_in_place "$i" iconv -f "$charset" -t UTF-8; then
        # Do not strip bytes from a file whose encoding is still unknown.
        printf '%s: conversion from %s failed, file left unchanged\n' \
          "$i" "$charset" >&2
        failed=1
        continue
      fi
      ;;
  esac

  # Checked after the conversion: a BOM in e.g. UTF-16 input may survive as a
  # UTF-8 BOM.
  if has_bom "$i"; then
    echo "$i: Remove BOM"
    strip_bom "$i" || failed=1
  fi

  if has_crlf "$i"; then
    echo "$i: Remove CR"
    strip_cr "$i" || failed=1
  fi
done

# Report failure to the caller if any file could not be processed.
[ "$failed" -eq 0 ] || exit 1
|