Convert UTF-8 Character Encoding to the Character Code Point (ORD) Value

#!/usr/bin/python3
# ====================================================================
# Convert a UTF-8 encoded character to the character's ord value
# (its Unicode code point).
#
# The input is the character's UTF-8 bytes packed into one integer,
# e.g. 0xc2a3 for the two bytes C2 A3.
#
# Note: for well-formed UTF-8 the standard library does this directly:
#           ord(bytes.fromhex('c2a3').decode('utf-8'))
#       The bit-level version below shows how the encoding works.
# ====================================================================
# Shown are the UTF-8 byte formats and how many bits are
# available for the code point value in each of them.
#
# 0_xxx_xxxx                                   7 bits
# 110x_xxxx 10xx_xxxx                         11 bits
# 1110_xxxx 10xx_xxxx 10xx_xxxx               16 bits
# 11110_xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx     21 bits
# ====================================================================

import re   # not used by the code below; kept in case other code relies on it
import sys

# ---- marker bits that must start the first byte, keyed by byte count
_LEAD_BYTE_PREFIX = {1: '0', 2: '110', 3: '1110', 4: '11110'}

# ---- marker bits that must start every byte after the first
_CONTINUATION_PREFIX = '10'


# --------------------------------------------------------------------
# ---- convert a bit string ('1's and '0's) to an integer
# --------------------------------------------------------------------

def bits_to_int(bits, val=0):
    """Append the bits in `bits` to `val` and return the new integer.

    Args:
        bits: iterable of the characters '0' and '1', most significant
            bit first.
        val: starting integer value; each bit shifts it left by one.
            Passing the result of a previous call lets several bit
            groups be accumulated into one value.

    Returns:
        The accumulated integer.

    Exits through error_exit() when a character other than '0' or '1'
    is found.
    """
    for b in bits:
        if b not in ('0', '1'):
            error_exit(f'Illegal bit character ({b})')
        val = val * 2 + (1 if b == '1' else 0)
    return val


# --------------------------------------------------------------------
# ---- display error message then exit
# --------------------------------------------------------------------

def error_exit(msg, title=None):
    """Print a title line and `msg`, then terminate the program.

    Args:
        msg: the error message.
        title: optional heading printed before the message; a default
            heading is used when it is empty or None.

    Raises:
        SystemExit: always, with exit status 1 so that a calling shell
            can tell the run failed.
    """
    print(title if title else 'Converting utf-8 code point to ord')
    print(msg)
    sys.exit(1)


# --------------------------------------------------------------------
# ---- pad ASCII character's bit string to fill it to 8 bits
# ---- note: the bin function truncates leading zeros
# --------------------------------------------------------------------

def pad_ascii_bits(bits):
    """Left-pad `bits` with '0' characters to a length of 8.

    Strings that are already 8 characters or longer are returned
    unchanged.
    """
    return bits.rjust(8, '0')


# --------------------------------------------------------------------
# ---- convert a utf-8 encoded character to an ord value
# --------------------------------------------------------------------

def convert_utf8_to_ord(utf8_code_point):
    """Decode one UTF-8 encoded character to its code point.

    Args:
        utf8_code_point: non-negative integer holding the 1 to 4 bytes
            of the UTF-8 encoding, first byte most significant
            (e.g. 0xc2a3).

    Returns:
        A tuple (val, bits): val is the decoded code point and bits is
        the bit string of the input, zero-padded to at least 8 bits.

    Exits through error_exit() when the input is negative, its bit
    length is not a whole number of bytes, it is longer than 4 bytes,
    or a byte does not start with the marker bits required for its
    position. Overlong encodings are not rejected.

    Prints a 'process N byte utf-8' progress line to stdout.
    """
    if utf8_code_point < 0:
        error_exit(f'negative value ({utf8_code_point})')

    # ---- bin() drops leading zeros, so a 1 byte (ASCII) value
    # ---- comes back shorter than 8 bits and must be padded
    bits = pad_ascii_bits(bin(utf8_code_point)[2:])

    # ---- length of utf-8 bit string must be a whole number of bytes
    bit_count = len(bits)
    byte_count, leftover_bits = divmod(bit_count, 8)

    if leftover_bits > 0:
        error_exit(f'incorrect byte count ({bit_count/8})')

    # ---- valid byte count?
    if not 1 <= byte_count <= 4:
        error_exit(f'byte count error ({byte_count})')

    print(f'\nprocess {byte_count} byte utf-8')

    # ---- first byte: check its marker bits, keep the rest as payload
    lead_prefix = _LEAD_BYTE_PREFIX[byte_count]
    lead_byte = bits[:8]

    if not lead_byte.startswith(lead_prefix):
        error_exit(f"{byte_count} byte utf-8 did not start "
                   f"with '{lead_prefix}'")

    val = bits_to_int(lead_byte[len(lead_prefix):])

    # ---- remaining bytes: each must start with '10' and
    # ---- contributes its low 6 bits
    for i in range(1, byte_count):
        utf_byte = bits[i*8:(i*8)+8]

        if not utf_byte.startswith(_CONTINUATION_PREFIX):
            error_exit(f"byte {i} of {byte_count} byte utf-8 "
                       f"did not start with '{_CONTINUATION_PREFIX}'")

        val = bits_to_int(utf_byte[len(_CONTINUATION_PREFIX):], val)

    return (val, bits)


# --------------------------------------------------------------------
# ---- main
# --------------------------------------------------------------------

if __name__ == '__main__':

    utf8_code_points = [
        0x41,        # 1 byte utf-8 ascii 'A'
        0x61,        # 1 byte utf-8 ascii 'a'
        0xc2a3,      # 2 byte utf-8
        0xc2b1,      # 2 byte utf-8
        0xe090af,    # 3 byte utf-8 Cyrillic character
                     # NOTE: overlong form of U+042F, whose shortest
                     # encoding is 0xd0af; accepted here because this
                     # decoder does not reject overlong encodings
        0xecbfb4,    # 3 byte utf-8
        0xf09f9881,  # 4 byte utf-8 emoji
        0xf09380b0,  # 4 byte utf-8 Egyptian hieroglyph
    ]

    for cp in utf8_code_points:

        val, bits = convert_utf8_to_ord(cp)

        # ---- when all is said and done, what did we end up with?
        # ---- chr() raises ValueError for a value outside the Unicode
        # ---- range, and printing a character the output encoding
        # ---- cannot represent raises UnicodeEncodeError (a ValueError)
        try:
            print(f'cp  = 0x{cp:x}  (utf-8 code point)')
            print(f'val = {val}')
            print(f'bits= {bits}')
            print(f'bin = {bin(val)}')
            print(f'hex = {hex(val)}')
            print(f'chr = {chr(val)}')
        except ValueError as e:
            print(str(e))