Convert UTF-8 Character Encoding to Character Code Point (ORD) Value
A better way?

#!/usr/bin/python3
# ====================================================================
# convert a character's utf-8 encoding to the character's
# code point (ord value)
# it does this by manipulating bits and bytes
# ====================================================================
# UTF-8 code point formats and how many bits are available
# for ord values defining unique characters
#   0xxx_xxxx                                   7 bits
#   110x_xxxx 10xx_xxxx                        11 bits
#   1110_xxxx 10xx_xxxx 10xx_xxxx              16 bits
#   1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx    21 bits
# ====================================================================

import sys

# --------------------------------------------------------------------
# ---- one entry per UTF-8 form, keyed by the total number of bytes:
# ---- (lead byte mask, lead byte marker, lead byte payload mask,
# ----  smallest code point that needs this many bytes)
# --------------------------------------------------------------------

UTF8_FORMS = {
    1: (0b10000000, 0b00000000, 0b01111111, 0x0),
    2: (0b11100000, 0b11000000, 0b00011111, 0x80),
    3: (0b11110000, 0b11100000, 0b00001111, 0x800),
    4: (0b11111000, 0b11110000, 0b00000111, 0x10000),
}

# ---- continuation bytes all look like 10xx_xxxx
CONT_MASK = 0b11000000
CONT_MARKER = 0b10000000
CONT_PAYLOAD = 0b00111111

MAX_CODE_POINT = 0x10ffff
SURROGATE_FIRST = 0xd800
SURROGATE_LAST = 0xdfff


# --------------------------------------------------------------------
# ---- convert byte to string-of-bits, no leading '0's trimmed
# --------------------------------------------------------------------

def my_bin(byt, size=8):
    """Return the binary digits of ``byt``, zero padded to ``size``.

    Values that need more than ``size`` digits are not truncated.
    """
    return format(byt, 'b').zfill(size)


# --------------------------------------------------------------------
# ---- get a list of bytes from an integer
# --------------------------------------------------------------------

def get_bytes(integer):
    """Split a non-negative integer into its bytes, high byte first.

    Zero yields ``[0]``.  Raises ValueError for a negative integer
    (shifting a negative value right never reaches zero).
    """
    if integer < 0:
        raise ValueError(f'integer must not be negative ({integer})')
    length = max(1, (integer.bit_length() + 7) // 8)
    return list(integer.to_bytes(length, 'big'))


# --------------------------------------------------------------------
# ---- test if a bit pattern matches a byte
# --------------------------------------------------------------------

def bit_pattern_match(byt, pat, mask=None, verbose=True):
    """Test a byte against a bit pattern.

    Without ``mask`` the test is True when every 1 bit of ``pat`` is
    also set in ``byt``; bits that are 0 in ``pat`` are not checked.
    With ``mask`` the test is exact: the bits of ``byt`` selected by
    ``mask`` must equal ``pat``.  ``verbose`` prints a trace line.
    """
    if mask is None:
        if verbose:
            print(f'bit_pattern_match(byt={my_bin(byt)},pat={my_bin(pat)})')
        return (byt & pat) == pat

    if verbose:
        print(f'bit_pattern_match(byt={my_bin(byt)},pat={my_bin(pat)},'
              f'mask={my_bin(mask)})')
    return (byt & mask) == pat


# --------------------------------------------------------------------
# ---- get an integer from a part of a byte
# --------------------------------------------------------------------

def get_bits_from_byte(byt, pat, verbose=True):
    """Return the bits of ``byt`` selected by the 1 bits of ``pat``.

    ``verbose`` prints a trace line.
    """
    if verbose:
        print(f'get_bits_from_byte(byt={my_bin(byt)},pat={my_bin(pat)})')
    return byt & pat


# --------------------------------------------------------------------
# ---- convert string-of-bits to an integer
# --------------------------------------------------------------------

def bits_to_int(bits, val=0):
    """Fold a string of '0'/'1' characters into ``val`` and return it.

    Leading '0's are skipped, so they do not shift a non-zero ``val``.
    Any other character prints a message and exits the program with
    a non-zero status (SystemExit).
    """
    skip_0 = True
    for b in bits:
        # ---- skip leading '0's
        if skip_0 and b == '0':
            continue
        skip_0 = False

        if b not in ('0', '1'):
            print(f'illegal bit character ({b})')
            sys.exit(1)

        val = val * 2 + (1 if b == '1' else 0)
    return val


# --------------------------------------------------------------------
# ---- convert a utf-8 code point (integer) to its ord value
# --------------------------------------------------------------------

def utf8_to_ord(utf8_char, verbose=False):
    """Return the code point encoded by the UTF-8 bytes in ``utf8_char``.

    ``utf8_char`` is the 1 to 4 encoded bytes packed into one integer,
    high byte first (for example 0xc2a3).  Raises ValueError when the
    bytes are not a well formed UTF-8 encoding: wrong byte count for
    the lead byte, bad continuation byte, overlong form, surrogate,
    or a value above U+10FFFF.  ``verbose`` traces every bit test.
    """
    byts = get_bytes(utf8_char)
    form = UTF8_FORMS.get(len(byts))
    if form is None:
        raise ValueError(f'expected 1 to 4 bytes, got {len(byts)}')
    lead_mask, lead_marker, lead_payload, smallest = form

    # ---- the lead byte must announce exactly this many bytes
    if not bit_pattern_match(byts[0], lead_marker, lead_mask, verbose):
        raise ValueError(f'0x{byts[0]:02x} is not the lead byte of a '
                         f'{len(byts)} byte utf-8 code point')
    ord_int = get_bits_from_byte(byts[0], lead_payload, verbose)

    # ---- every following byte adds 6 more low order bits
    for byt in byts[1:]:
        if not bit_pattern_match(byt, CONT_MARKER, CONT_MASK, verbose):
            raise ValueError(f'0x{byt:02x} is not a continuation byte')
        ord_int = (ord_int << 6) | get_bits_from_byte(byt, CONT_PAYLOAD,
                                                      verbose)

    # ---- a value that fits a shorter form must use the shorter form
    if ord_int < smallest:
        raise ValueError(f'overlong encoding of U+{ord_int:04X}')
    if SURROGATE_FIRST <= ord_int <= SURROGATE_LAST:
        raise ValueError(f'U+{ord_int:04X} is a surrogate')
    if ord_int > MAX_CODE_POINT:
        raise ValueError(f'0x{ord_int:x} is above U+10FFFF')
    return ord_int


# --------------------------------------------------------------------
# ---- main
# --------------------------------------------------------------------

def main():
    """Convert one sample of each UTF-8 form and print a trace."""
    utf8_chars = [
        0x61,         # 1 byte utf-8
        0xc2a3,       # 2 byte utf-8
        0xecbfb4,     # 3 byte utf-8
        0xf09380b0,   # 4 byte utf-8
    ]

    # ---- test/convert utf-8 code points (integers)
    for utf8_char in utf8_chars:
        print()
        print('-'*60)
        print(f'utf8_chr is {hex(utf8_char)} 0b{my_bin(utf8_char)}')

        byts = get_bytes(utf8_char)
        print('bytes =' + ''.join(f' 0x{b:02x}' for b in byts))

        # ---- extract ord bits from utf-8 code point bytes
        print('--- get ord value ' + '- '*15)
        try:
            ord_int = utf8_to_ord(utf8_char, verbose=True)
        except ValueError as err:
            print(f'not a valid utf-8 code point: {err}')
            print('-'*60)
            continue
        print(f'ord_int={ord_int} type={type(ord_int)}')

        # ---- print character associated to ord value
        print(f'chr = {chr(ord_int)}')
        print('-'*60)


if __name__ == '__main__':
    main()