#!/usr/bin/python3
# ====================================================================
# convert a character's utf-8 encoding to the character's
# code point (ord value)
# it does this by manipulating bits and bytes
# ====================================================================
# UTF-8 encoding formats and how many bits are available
# for the ord value (code point) that identifies each character
# 0xxx_xxxx 7 bits
# 110x_xxxx 10xx_xxxx 11 bits
# 1110_xxxx 10xx_xxxx 10xx_xxxx 16 bits
# 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 21 bits
# ====================================================================
# THIS CODE IS INCOMPLETE. IT ONLY CONVERTS 4 BYTE UTF-8 CODE POINTS.
# MODIFY IT TO CONVERT ALL 4 TYPES OF UTF-8 CODE POINTS.
# ====================================================================
import sys
# --------------------------------------------------------------------
# ---- convert byte to string-of-bits, no leading '0's trimmed
# --------------------------------------------------------------------
def my_bin(byt, size=8):
    """Return the binary digits of ``byt`` left-padded with '0' to ``size``.

    Args:
        byt: integer to render; intended for non-negative values (the
            '0b' prefix is removed by slicing, which does not account
            for a sign).
        size: minimum length of the result. Values needing more digits
            are returned in full, never truncated.

    Returns:
        A string made of '0' and '1' characters.
    """
    digits = bin(byt)[2:]                       # drop the '0b' prefix
    padding = '0' * (size - len(digits))        # empty when already wide enough
    return padding + digits
# --------------------------------------------------------------------
# ---- get a list of bytes from an integer
# --------------------------------------------------------------------
def get_bytes(integer):
    """Split a non-negative integer into its bytes, most significant first.

    Args:
        integer: non-negative integer, e.g. 0xc2a3.

    Returns:
        A list of ints in the range 0..255. Zero yields ``[0]``; leading
        zero bytes are never produced for other values.

    Raises:
        ValueError: if ``integer`` is negative. Right-shifting a negative
            int never reaches 0, so the extraction loop could not end.
    """
    if integer < 0:
        raise ValueError(f'integer must be non-negative, got {integer}')
    byts = []
    # ---- collect least significant byte first, then reverse once
    # ---- (avoids shifting the whole list on every insert at index 0)
    while True:
        byts.append(integer & 0xff)
        integer = integer >> 8
        if integer == 0x0:
            break
    byts.reverse()
    return byts
# --------------------------------------------------------------------
# ---- test if a bit pattern matches a byte
# --------------------------------------------------------------------
def bit_pattern_match(byt, pat):
    """Tell whether every bit set in ``pat`` is also set in ``byt``.

    Bits that are 0 in ``pat`` are not examined, so a byte with extra
    bits set still matches. A trace line showing both arguments in
    binary is printed on every call.

    Args:
        byt: integer byte value to test.
        pat: integer whose 1 bits must all be present in ``byt``.

    Returns:
        True when all 1 bits of ``pat`` are set in ``byt``, else False.
    """
    print(f'bit_pattern_match(byt={my_bin(byt)},pat={my_bin(pat)})')
    # ---- masking with pat leaves pat unchanged only if no bit is missing
    return (byt & pat) == pat
# --------------------------------------------------------------------
# ---- get an integer from a part of a byte
# --------------------------------------------------------------------
def get_bits_from_byte(byt, pat):
    """Return the bits of ``byt`` selected by the mask ``pat``.

    A trace line showing both arguments in binary is printed on every
    call.

    Args:
        byt: integer byte value to read from.
        pat: integer mask; only bits that are 1 here are kept.

    Returns:
        The integer ``byt & pat`` (the kept bits are not shifted).
    """
    trace = f'get_bits_from_byte(byt={my_bin(byt)},pat={my_bin(pat)})'
    print(trace)
    return byt & pat
# --------------------------------------------------------------------
# ---- convert string-of-bits to an integer
# --------------------------------------------------------------------
def bits_to_int(bits, val=0):
    """Convert a sequence of '0'/'1' characters to an integer.

    Characters are consumed most significant bit first. Leading '0'
    characters are skipped entirely; after that each character doubles
    the running value and a '1' adds one.

    NOTE: the leading '0's are skipped even when ``val`` is non-zero, so
    in that case they do not shift the starting value.

    Args:
        bits: iterable of single characters, each '0' or '1'.
        val: starting value of the accumulator.

    Returns:
        The accumulated integer (``val`` unchanged for empty or all-'0'
        input).

    Any other character prints a message and ends the program through
    ``sys.exit()``.
    """
    started = False                 # becomes True at the first kept character
    for ch in bits:
        if ch == '0' and not started:
            continue                # still inside the run of leading '0's
        started = True
        if ch == '1':
            val = val * 2 + 1
        elif ch == '0':
            val = val * 2
        else:
            print(f'illegal bit character ({ch})')
            sys.exit()
    return val
# --------------------------------------------------------------------
# ---- main
# --------------------------------------------------------------------
if __name__ == '__main__':

    # ---- demo: decode each sample UTF-8 encoding (given as one integer
    # ---- whose bytes are the encoded bytes) to its ord value and print
    # ---- the character. All four UTF-8 lengths (1-4 bytes) are handled.
    # ---- Overlong encodings and surrogate values are not rejected.

    utf8_chars = [
        0x61,          # 1 byte utf-8
        0xc2a3,        # 2 byte utf-8
        0xecbfb4,      # 3 byte utf-8
        0xf09380b0,    # 4 byte utf-8
    ]

    # ---- multi-byte lead byte formats, longest first.
    # ---- bit_pattern_match() only checks that every 1 bit of the
    # ---- pattern is set, so 1111_0xxx must be tried before 1110_xxxx
    # ---- and 110x_xxxx (a 4 byte lead would satisfy those too).
    # ---- (lead pattern, lead ord-bits mask, lead ord-bits count, bytes)
    lead_formats = [
        (0b11110000, 0b00000111, 3, 4),
        (0b11100000, 0b00001111, 4, 3),
        (0b11000000, 0b00011111, 5, 2),
    ]

    pattern2 = 0b00111111    # ord bits of a continuation byte 10xx_xxxx

    # ---- test/convert utf-8 code points (integers)

    for utf8_char in utf8_chars:

        print()
        print('-'*60)
        print(f'utf8_chr is {hex(utf8_char)} 0b{my_bin(utf8_char)}')

        byts = get_bytes(utf8_char)

        print('bytes =', end='')
        for b in byts:
            print(f' 0x{b:02x}', end='')
        print()

        # ---- classify the first byte

        lead = byts[0]
        fmt = None      # (lead ord-bits mask, lead ord-bits count, bytes)

        if not bit_pattern_match(lead, 0b10000000):
            # ---- 0xxx_xxxx: 1 byte, all 7 low bits are ord bits
            fmt = (0b01111111, 7, 1)
        elif not bit_pattern_match(lead, 0b11111000):
            # ---- 1111_1xxx is not a valid lead byte, so only get here
            # ---- when at least one of the top five bits is clear
            for pattern, mask, width, count in lead_formats:
                if bit_pattern_match(lead, pattern):
                    fmt = (mask, width, count)
                    break
            # ---- a lone 10xx_xxxx (continuation) byte matches nothing

        # ---- the byte count must agree with the lead byte and every
        # ---- following byte must look like 10xx_xxxx

        well_formed = False
        if fmt is not None:
            pattern1, lead_width, byte_count = fmt
            well_formed = (
                len(byts) == byte_count
                and all((b & 0b11000000) == 0b10000000 for b in byts[1:])
            )

        if not well_formed:
            print('pattern does not match')
            print('-'*60)
            continue

        print('pattern does match')
        print(f'{byte_count} byte utf-8 code point')

        # ---- extract ord bits from utf-8 code point bytes

        print('--- get ord value ' + '- '*15)

        print('byte 0')
        bits0 = get_bits_from_byte(lead, pattern1)
        print(f'returned {my_bin(bits0,lead_width)} type={type(bits0)}')
        ord_parts = [my_bin(bits0, lead_width)]

        for idx, byt in enumerate(byts[1:], start=1):
            print(f'byte {idx}')
            bits = get_bits_from_byte(byt, pattern2)
            print(f'returned {my_bin(bits,6)} type={type(bits)}')
            ord_parts.append(my_bin(bits, 6))

        ord_bits = ''.join(ord_parts)

        print(f'ord_bits={ord_bits} type={type(ord_bits)}')

        # ---- convert string-of-bits to an integer

        ord_int = bits_to_int(ord_bits)

        print(f'ord_int={ord_int} type={type(ord_int)}')

        # ---- print character associated to ord value

        print(f'chr = {chr(ord_int)}')
        print('-'*60)