UTF-8 characters in Python

#!/usr/bin/python3
# ================================================================
# Unicode Anglo-Saxon runes, etc.
# 1. display runes
# 2. user enters a Unicode code point as a string of hex digits;
#    display its character info
# ----------------------------------------------------------------
# Project: generalize to any type of integer input
#          (bin, oct, dec, hex)
# ================================================================

import unicodedata as ud

# NOTE: the project-local `user_interface` module is imported inside
# main(), the only place it is used, so the helper functions below
# can be imported without it.

# ---- Unicode code points (Anglo-Saxon runes)
# NOTE(review): "16A8" is listed twice and "16EB" appears both
# mid-list and near the end; these may be typos - confirm against
# the intended rune table before relying on this list.

code_points = [
    "16A0", "16A2", "16A6", "16A8", "16A8", "16A9", "16B1", "16B3",
    "16B7", "16B9", "16BB", "16EB", "16C1", "16C4", "16C7", "16C8",
    "16C9", "16CB", "16CF", "16D2", "16D6", "16D7", "16DA", "16DD",
    "16DE", "16DF", "16AA", "16Ab", "16A3", "16E0", "16E3", "16B8",
    "16E4", "16E1", "16E2", "16E5", "16EB", "16EC", "16ED",
]


# ----------------------------------------------------------------
# ---- Function: string length in bytes
# ----------------------------------------------------------------

def utf8len(s: str) -> int:
    """Return the number of bytes in the UTF-8 encoding of *s*.

    Raises UnicodeEncodeError if *s* cannot be encoded as UTF-8
    (e.g. it contains a lone surrogate).
    """
    return len(s.encode('utf-8'))


# -----------------------------------------------------------------
# ---- Function: convert each byte in a string
# ----           into a string of bits
# -----------------------------------------------------------------

def bit_string(s: str) -> str:
    """Return the UTF-8 bytes of *s* as space-separated 8-bit groups.

    Each byte is rendered as exactly eight binary digits; an empty
    string yields an empty string.
    """
    return ' '.join(f'{byte:08b}' for byte in s.encode('utf-8'))


# ----------------------------------------------------------------
# ---- Function: code point a valid Unicode character?
# ----           Cn - not assigned
# ----           Cs - surrogate
# ----           Co - private use
# ---- Note: some valid code points are unprintable characters
# ----       i.e. category Cc (displayed as a ' ' character)
# ----------------------------------------------------------------

def is_valid(code_point: int) -> bool:
    """Return True if *code_point* is an assigned, usable character.

    Values outside 0..0x10FFFF are rejected up front, because chr()
    raises ValueError for them (this includes negative numbers, which
    int(s, 16) happily produces for input such as "-1").
    """
    if not 0 <= code_point <= 0x10FFFF:
        return False
    return ud.category(chr(code_point)) not in ('Cn', 'Cs', 'Co')


# ----------------------------------------------------------------
# ---- Function: convert a string of hex digits to an integer
# ----------------------------------------------------------------

def is_hex_integer(s: str) -> tuple[bool, int]:
    """Parse *s* as a base-16 integer.

    Returns (True, value) on success. On failure prints the offending
    input and the error as diagnostics and returns (False, 0).
    """
    try:
        n = int(s, 16)
    except (ValueError, TypeError) as e:
        # ValueError: not hex digits; TypeError: s is not a string
        print(f's = {s}')
        print(f'e = {e}')
        return (False, 0)
    return (True, n)


# ----------------------------------------------------------------
# ---- Function: display Unicode characters
# ----           (assumes a list of valid hex code point strings)
# ----------------------------------------------------------------

def display_characters(code_points: list[str]) -> None:
    """Print each hex code point string next to its character.

    Stops at the first entry that is not a hex integer string.
    Entries are assumed to be in chr() range; this is not checked.
    """
    print()
    for s in code_points:
        tf, i = is_hex_integer(s)
        if not tf:
            break
        print(f'{s} is {chr(i)}')


# -----------------------------------------------------------------
# ---- Function: display a character's bytes and bits
# -----------------------------------------------------------------

def display_character_bytes_and_bits(s: str) -> None:
    """Print *s*, its length in characters and UTF-8 bytes, and its bits."""
    print()
    print(f'chr="{s}" len={len(s)} (char) '
          f'sizeof={utf8len(s)} (bytes)')
    print()
    print(f'bit string is {bit_string(s)}')


# ----------------------------------------------------------------
# ---- main
# ----------------------------------------------------------------

def main() -> None:
    """Prompt for hex code points until the prompt returns nothing."""
    import user_interface as ui

    # ---- disabled demo: display the rune table
    ##print()
    ##print(f'{len(code_points)} Anglo-Saxon rune characters')
    ##display_characters(code_points)

    while True:
        print()
        s = ui.get_user_input('Enter code point (hex): ')
        if not s:
            break

        tf, i = is_hex_integer(s)
        if not tf:
            print()
            print(f'input ({s}) is not a hex integer string')
            continue

        if not is_valid(i):
            print()
            print(f'input ({s}) is not a valid code point')
            continue

        display_character_bytes_and_bits(chr(i))


if __name__ == '__main__':
    main()