#!/usr/bin/python3
# ====================================================================
# convert a character's UTF-8 encoding (its bytes packed into one
# integer) to the character's ord value (its Unicode code point)
# Note: there has got to be a better way to do this.
# ====================================================================
# Shown are UTF-8 code point formats and how many bits are
# available for code point values that define characters.
#
# 0_xxx_xxxx 7 bits
# 110x_xxxx 10xx_xxxx 11 bits
# 1110_xxxx 10xx_xxxx 10xx_xxxx 16 bits
# 11110_xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 21 bits
# ====================================================================
import re
import sys
# --------------------------------------------------------------------
# ---- convert a bit string ('1's and '0's) to an integer
# ---- val is the starting integer value
# --------------------------------------------------------------------
def bits_to_int(bits,val=0):
    """Fold a string of '0'/'1' characters into an integer.

    The bits are read most-significant first.  Each bit shifts the
    running value left by one place and adds the bit, so passing a
    non-zero ``val`` appends the new bits to an already decoded prefix.

    Args:
        bits: iterable of the characters '0' and '1'.
        val:  starting integer value (default 0).

    Returns:
        The accumulated integer; ``val`` unchanged when ``bits`` is empty.

    Any other character is reported through error_exit(), which
    terminates the program.
    """
    for b in bits:
        if b not in ('0', '1'):
            # show the offending character in the message
            error_exit(f'Illegal bit character ({b})')
        val = val * 2 + (1 if b == '1' else 0)
    return val
# --------------------------------------------------------------------
# ---- display error message then exit
# --------------------------------------------------------------------
def error_exit(msg,title=None):
    """Print a title line and an error message, then terminate.

    Args:
        msg:   the error description, printed on the second line.
        title: optional heading; when missing or empty the default
               heading 'Converting utf-8 code point to ord' is used.

    Raises:
        SystemExit: always, with exit status 1 so that the shell or a
        calling process can tell the run failed (a bare sys.exit()
        would report status 0, i.e. success).
    """
    heading = title if title else 'Converting utf-8 code point to ord'
    print(heading)
    print(msg)
    sys.exit(1)
# --------------------------------------------------------------------
# ---- pad ASCII character's bit string to fill it to 8 bits
# ---- note: the bin function truncates leading zeros
# --------------------------------------------------------------------
def pad_ascii_bits(bits):
    """Left-pad a bit string with '0' characters to a width of 8.

    bin() drops leading zeros, so a single-byte value can come back
    with fewer than 8 digits; this restores the full byte.

    Args:
        bits: string of bit characters.

    Returns:
        ``bits`` padded on the left to 8 characters.  Strings that are
        already 8 characters or longer are returned unchanged.
    """
    # rjust never truncates, and it also pads the empty string
    return bits.rjust(8, '0')
# --------------------------------------------------------------------
# ---- convert a utf-8 code point to an ord value
# ---- Note: UTF-8 code point is an integer
# --------------------------------------------------------------------
def convert_utf8_to_ord(utf8_code_point):
    """Decode one UTF-8 encoded character, given as an integer.

    The integer holds the character's UTF-8 bytes packed together
    (e.g. 0xc2a3 for the two bytes C2 A3).  The marker bits of each
    byte are checked and stripped, and the remaining payload bits are
    joined into the character's ord value.

    Args:
        utf8_code_point: integer form of a 1 to 4 byte UTF-8 sequence.

    Returns:
        A tuple ``(val, bits)`` where ``val`` is the decoded ord value
        and ``bits`` is the input as a bit string (a multiple of 8
        characters long).

    A wrong bit length, byte count or marker is reported through
    error_exit(), which terminates the program.  A 'process N byte
    utf-8' progress line is printed before decoding.
    """
    bits = bin(utf8_code_point)[2:]

    # ---- bin() drops leading zeros, so a 1 byte (ASCII) value is
    # ---- shorter than 8 bits; pad it back to a full byte
    if len(bits) < 8:
        bits = pad_ascii_bits(bits)

    # ---- the bit string must be a whole number of bytes
    bit_count = len(bits)
    byte_count_int, byte_count_frac = divmod(bit_count, 8)
    if byte_count_frac > 0:
        error_exit(f'incorrect byte count ({bit_count/8})')

    # ---- valid byte count?
    if byte_count_int < 1 or byte_count_int > 4:
        error_exit(f'byte count error ({byte_count_int})')

    # ---- marker bits that open the first byte for each sequence length
    lead_markers = {1: '0', 2: '110', 3: '1110', 4: '11110'}
    marker = lead_markers[byte_count_int]

    print(f'\nprocess {byte_count_int} byte utf-8')

    # ---- first byte: verify the marker, keep the bits after it
    lead_byte = bits[0:8]
    if not lead_byte.startswith(marker):
        msg = f"{byte_count_int} byte utf-8 did not start with '{marker}'"
        error_exit(msg)
    val = bits_to_int(lead_byte[len(marker):])

    # ---- continuation bytes: each must start with '10' and
    # ---- contributes its low 6 bits (i counts from 0 at the first byte)
    for i in range(1, byte_count_int):
        utf_byte = bits[i*8:(i*8)+8]
        if not utf_byte.startswith('10'):
            msg = f'byte {i} of {byte_count_int} byte utf-8 ' +\
                  "did not start with '10'"
            error_exit(msg)
        val = bits_to_int(utf_byte[2:], val)

    return (val, bits)
# --------------------------------------------------------------------
# ---- main
# --------------------------------------------------------------------
if __name__ == '__main__':
    # ---- sample inputs: each integer is one character's UTF-8 bytes
    samples = (
        0x41,        # 1 byte utf-8  ascii 'A'
        0x61,        # 1 byte utf-8  ascii 'a'
        0xc2a3,      # 2 byte utf-8
        0xc2b1,      # 2 byte utf-8
        0xe090af,    # 3 byte utf-8  Cyrillic character (decodes to
                     #               0x42f; non-shortest 3 byte form)
        0xecbfb4,    # 3 byte utf-8
        0xf09f9881,  # 4 byte utf-8  emoji
        0xf09380b0,  # 4 byte utf-8  Egyptian hieroglyph
    )
    for encoded in samples:
        ord_value, bit_string = convert_utf8_to_ord(encoded)
        # ---- report the result; a failure while formatting or
        # ---- printing (e.g. in chr()) is shown and the loop goes on
        try:
            print('cp = ' + hex(encoded) + ' (utf-8 code point)')
            print('val = ' + str(ord_value))
            print('bits= ' + bit_string)
            print(f'bin = {ord_value:#b}')
            print(f'hex = {ord_value:#x}')
            print('chr = ' + chr(ord_value))
        except Exception as err:
            print(str(err))