UTF-8 characters in Python

See: About the Unicode Character Database

# Anglo-Saxon code points 16A0 16A2 16A6 16A8 16A8 16A9 16B1 16B3 16B7 16B9 16BB 16EB 16C1 16C4 16C7 16C8 16C9 16CB 16CF 16D2 16D6 16D7 16DA 16DD 16DE 16DF 16AA 16AB 16A3 16E0 16E3 16B8 16E4 16E1 16E2 16E5 16EB 16EC 16ED

#!/usr/bin/python3
# ==========================================================
# demonstrate the number of bytes and bits in characters
# ==========================================================


# ----------------------------------------------------------
# ---- string length in bytes
# ----------------------------------------------------------
def utf8len(s: str) -> int:
    """Return the number of bytes in the UTF-8 encoding of *s*.

    This differs from ``len(s)``, which counts code points: a
    non-ASCII character encodes to more than one byte.
    """
    return len(s.encode('utf-8'))


# ----------------------------------------------------------
# ---- convert each byte in a string into a string of bits
# ----------------------------------------------------------
def bit_string(s: str) -> str:
    """Return the UTF-8 bytes of *s* as space-separated 8-bit groups.

    Each byte is rendered as exactly eight binary digits (zero padded),
    and the groups are joined with a single space. An empty string
    yields an empty string.
    """
    # Iterating over a bytes object yields ints, so each one can be
    # formatted directly with the '08b' format spec.
    return ' '.join(f'{byte:08b}' for byte in s.encode('utf-8'))


# ----------------------------------------------------------
# ---- display a string's bytes and bits
# ----------------------------------------------------------
def display_string_bytes_and_bits(s: str) -> None:
    """Print the character count, UTF-8 byte count and bits of *s*."""
    print()
    # Adjacent f-string literals are concatenated by the parser, so no
    # '+' or backslash continuation is needed.
    print(f'str="{s}" len={len(s)} (char) '
          f'sizeof={utf8len(s)} (bytes)')
    print()
    print(f'bit string is {bit_string(s)}')


# ----------------------------------------------------------
# ---- main
# ----------------------------------------------------------
def main() -> None:
    """Run the demo: an ASCII letter, a single rune, and a mix of both."""
    print()
    print('---------- single character ASCII')
    display_string_bytes_and_bits('A')

    print()
    print('---------- single character UTF-8')
    display_string_bytes_and_bits('\u16A0')

    print()
    print('---------- multiple characters')
    display_string_bytes_and_bits('A\u16A0B')


# Only run the demo when executed as a script, so the helper functions
# can be imported without producing output.
if __name__ == '__main__':
    main()