#! /usr/bin/python3
# ==================================================================
# Use regexp to break apart lines in a file and put them back
# together modified. In particular, the lines (rows) of a
# html table.
#
# Every input line is stripped of surrounding whitespace. A line
# containing '</td>' is treated as a table row and must split into
# exactly four pieces around '</td><td>'. The second piece is
# rewritten as an HTML entity ('&' + text + ';') unless it already
# starts with '&'. All other lines are echoed unchanged (stripped).
# Output goes to stdout.
# ==================================================================

import re

IN = 'html_special_characters.html'    # default input file

pt = '</td><td>'                       # regexp pattern separating row cells

EXPECTED_CELLS = 4                     # pieces a well-formed row splits into

# ---- compiled once here instead of being looked up for every line
ROW_RE = re.compile(r'^.*?</td>')
SPLIT_RE = re.compile(pt)


def is_table_row(line):
    """Return True when '</td>' occurs in *line* (before any newline)."""
    return ROW_RE.match(line) is not None


def fix_entity(cell):
    """Return *cell* as an HTML entity reference.

    Text that already starts with '&' is returned untouched; anything
    else is wrapped as '&' + cell + ';'.
    """
    if cell.startswith('&'):
        return cell
    return '&' + cell + ';'


def fix_row(cells):
    """Re-join the four pieces of a split row, fixing the second one.

    cells -- sequence of exactly four strings, as produced by splitting
             a row on '</td><td>'.

    Returns the recombined row text.
    Raises ValueError if *cells* does not hold exactly four items.
    """
    first, entity, third, fourth = cells
    return pt.join((first, fix_entity(entity), third, fourth))


def main(path=IN):
    """Read *path* and print every line, with table rows rewritten.

    Processing stops at the first row that does not split into
    EXPECTED_CELLS pieces: the pieces and an error message are printed
    and the remaining input is not processed.
    """
    # ---- 'with' guarantees the file is closed even if a line raises
    with open(path, 'r') as in_file:
        for raw_line in in_file:
            line = raw_line.strip()

            # ---- output non-row lines as they are
            if not is_table_row(line):
                print(line)
                continue

            # ---- split the line
            cells = SPLIT_RE.split(line)

            # ---- make sure we have the correct number of elements
            if len(cells) != EXPECTED_CELLS:
                print(cells)
                print('Error: wrong number of elements {} found'.format(len(cells)))
                break

            # ---- output the recombined line
            print(fix_row(cells))


if __name__ == '__main__':
    main()