Skip to main content

excel-charset.py (Source)

# Names of the charsets that Python 3.13 understands. The list is abbreviated
# here: the trailing ``...`` is a placeholder (Python's ``Ellipsis`` object)
# standing in for the remaining names, which must be filled in before running.
charsets = ['ascii', 'big5', 'big5hkscs', 'cp037', ...]

# Load the whole CSV file as raw bytes; decoding is deliberately deferred so
# that individual bytes can be tried against each candidate charset.
with open('registration.csv', mode='rb') as csv_file:
    d = csv_file.read()
# Try to read character ``i`` from the file using the given charset.
def try_read(charset, i):
    """Decode the single byte at offset ``i`` of the file data with ``charset``.

    Exactly one byte of the module-level ``d`` buffer is examined, so a
    charset only "matches" when that lone byte is a complete character in it.

    Args:
        charset: Codec name to try, as accepted by ``bytes.decode``.
        i: Zero-based byte offset into ``d``.

    Returns:
        The decoded text, or ``None`` if the byte is not valid in ``charset``
        or the codec cannot be used on this interpreter.
    """
    # Slice rather than index so the result is ``bytes`` (indexing gives int).
    s = d[i:i + 1]
    try:
        return s.decode(charset)
    except (UnicodeDecodeError, LookupError):
        # LookupError covers codec names that are unknown, platform-specific
        # (e.g. ``mbcs`` outside Windows) or not text encodings. Treat those
        # as "no match" instead of letting one bad name abort the whole scan.
        return None
# Positions of characters that have caused problems, paired with the
# character each one should decode to.
strings = [(4071, 'å'), (6270, 'š')]

# Work through each of the problems and narrow down the list of charsets:
# after every step only the charsets that decode the byte at that position to
# the expected character remain. The survivors are printed after each step.
sets = charsets[:]  # copy, so the master list is left untouched
for offset, expected in strings:
    # The loop variable is named ``expected`` (not ``str``) so the builtin
    # ``str`` type is not rebound at module level.
    sets = [name for name in sets if try_read(name, offset) == expected]
    print(offset, expected, sets)