#! /usr/bin/python3
# modify_html_file.py
# ==================================================================
# Use regexps to break apart lines in a file and put them back
# together modified. In particular, the lines (rows) of an
# HTML table.
# ==================================================================

import re

IN = 'html_special_characters.html'

pt = '</td><td>'               # text between two adjacent table cells
CELLS_PER_ROW = 4              # pieces a well-formed row splits into

# ---- compile the patterns once instead of on every input line

ROW_RE = re.compile('</td>')           # a line containing this is a row
SPLIT_RE = re.compile(re.escape(pt))   # pt is literal text, so escape it


def is_table_row(line):
    """Return True if *line* contains a closing ``</td>`` tag."""
    return ROW_RE.search(line) is not None


def split_row(line):
    """Return the list of pieces of *line* found between ``pt`` separators."""
    return SPLIT_RE.split(line)


def rebuild_row(cells):
    """Join the pieces of a split row back together, fixing the second one.

    The second piece is kept as-is when it already starts with '&';
    otherwise it is wrapped as '&amp;' + piece + ';'.

    cells   -- the CELLS_PER_ROW strings produced by split_row()
    returns -- the recombined row text
    raises  -- ValueError if *cells* does not hold exactly four items
    """
    first, entity, third, fourth = cells
    if not entity.startswith('&'):
        entity = '&amp;' + entity + ';'
    return pt.join((first, entity, third, fourth))


def main(path=IN):
    """Print every line of *path* to stdout, rewriting table rows.

    Each line is stripped of surrounding whitespace. Lines that are
    table rows are split, checked, recombined and printed; all other
    lines are printed unchanged. Processing stops at the first row that
    does not split into CELLS_PER_ROW pieces, after printing the pieces
    and an error message.

    returns -- (input line count, recombined row count, output line count)
    """
    ic = 0                     # input line count
    mc = 0                     # modified line count
    oc = 0                     # output line count

    # ---- 'with' closes the file even if reading or printing fails

    with open(path, 'r') as in_file:

        for ic, raw in enumerate(in_file, start=1):

            line = raw.strip()

            if is_table_row(line):

                cells = split_row(line)

                # ---- make sure we have the correct number of elements

                if len(cells) != CELLS_PER_ROW:
                    print(cells)
                    print('Error: wrong number of elements {} found'
                          .format(len(cells)))
                    break      # the bad row is not counted as output

                print(rebuild_row(cells))
                mc += 1

            else:

                # ---- output non-row lines

                print(line)

            oc += 1

    return ic, mc, oc


if __name__ == '__main__':

    ic, mc, oc = main()

    # ---- optional summary (disabled so the output stays pure HTML)

    ##print('{} lines input'.format(ic))
    ##print('{} lines modified'.format(mc))
    ##print('{} lines output'.format(oc))