# build_index_html_file.py

#!/usr/bin/python3
# ===================================================================
# Build an HTML file (index.html) containing links to selected
# files in a directory. A simple HTML link is created.
# -------------------------------------------------------------------
# This script assumes that only a single directory will be
# searched/processed. Therefore, there will be no duplicate 
# file names. If the script is modified to search/process
# more than one directory, duplicate file names are possible.
# -------------------------------------------------------------------
# The web server may try to execute the file pointed to by the
# link created by this script. This simple script does nothing
# about this problem.
# ===================================================================

import datetime
import html
import os
import re
import urllib.parse


# -------------------------------------------------------------------
# global variables, constants, etc.
# -------------------------------------------------------------------

AUTHOR        = 'Tom Wolfe'      # HTML file author

CSSFILE       = 'xxxx.css'       # HTML CSS file

DIRECTORY     = './'             # directory to be searched/processed
                                 # ending '/' character required

# file name patterns - match regular expressions
# (raw strings, so the backslash reaches the regex engine untouched)

FILEMATCHPATS = [ r'\.html$',
                  r'\.pdf$',
                  r'\.png$',
                  r'\.py$',
                  r'\.txt$',
                  r'\.css$',
                  r'\.bat$' ]

# file name patterns - skip regular expressions
# the '.' is escaped so only the literal name 'index.html' is skipped
# (an unescaped '.' would also match names such as 'indexXhtml')

FILESKIPPATS  = [ r'^index\.html$' ]

OUTFILE       = './index.html'   # output file


# -------------------------------------------------------------------
# output start web page
# -------------------------------------------------------------------

def start_web_page(ofile,dir,author=None,stylesheet=None):
    """Write the opening part of the HTML page to ``ofile``.

    ofile       writable text file object (only ``write`` is used)
    dir         directory name shown in the page header
    author      optional value for the <meta name="author"> tag;
                the tag is omitted when None
    stylesheet  optional href for the <link rel="stylesheet"> tag;
                the tag is omitted when None

    The three values are HTML-escaped before being written so that
    characters such as '&', '<' or '"' cannot break the markup.
    The page is left open inside <div class="indent12">.
    """

    ofile.write('<!DOCTYPE html>\n')
    ofile.write('<html>\n')
    ofile.write('<head>\n')
    ofile.write('<meta charset="utf-8" />\n')

    if author is not None:
        # str() keeps the old behaviour of accepting any object
        ofile.write('<meta name="author" content="{}" />\n'.
        format(html.escape(str(author))))

    if stylesheet is not None:
        ofile.write('<link rel="stylesheet" href="{}" />\n'.
        format(html.escape(str(stylesheet))))

    ofile.write('</head>\n')
    ofile.write('<body>\n')
    ofile.write('<header>\n')
    ofile.write('<center>Dir: {}</center>\n'.format(html.escape(str(dir))))
    ofile.write('</header>\n')
    ofile.write('<div class="indent12">\n')


# -------------------------------------------------------------------
# output end of web page
# -------------------------------------------------------------------

def end_web_page(ofile):
    """Write the closing part of the HTML page to ``ofile``.

    Closes the content <div>, emits a footer carrying the current
    month and year (local time, '%B %Y'), and closes <body> and
    <html>. No newline is written after the final </html>.
    """

    stamp = datetime.datetime.now().strftime('%B %Y')

    ofile.writelines((
        '</div>\n',
        '<footer>\n',
        '<modate>Last Modified: {}</modate>\n'.format(stamp),
        '</footer>\n',
        '</body>\n',
        '</html>',
    ))


# -------------------------------------------------------------------
# test if a string matches one of a list of regular expressions
#
# Regular expressions use the backslash character ('\') to
# indicate special forms or to allow special characters to
# be used without invoking their special meaning. This collides
# with Python’s usage of the same character for the same purpose
# in string literals. The solution is to use Python’s raw string
# notation for regular expression patterns; backslashes are not
# handled in any special way in a string literal prefixed with 'r'.
# r"\n" is a two-character string containing '\' and 'n'.
#
# For example to match html files: r'\.html$' or '\\.html$'
# -------------------------------------------------------------------

def string_match_pattern(patterns,str):
    """Return True if ``str`` matches at least one regular expression.

    patterns  iterable of regular expression strings
    str       string to test

    Each pattern is applied with re.search (so it may match anywhere
    unless anchored) and case is ignored. An empty pattern list
    yields False.
    """

    return any(re.search(rx,str,re.IGNORECASE) is not None
               for rx in patterns)


# -------------------------------------------------------------------
# return a list (dictionary) of selected file names
#
# dir  directory to search/process
# mpat  list of file match regular expressions
# spat  list of file skip  regular expressions
# -------------------------------------------------------------------

def get_list_of_files(dir,mpat,spat):
    """Return a dictionary of the selected files found in ``dir``.

    dir   directory to search/process
    mpat  list of file match regular expressions
    spat  list of file skip  regular expressions

    Returns a dict where key = file name and value = path + file name.

    An entry is selected when it is not hidden (name does not start
    with '.'), is neither a symbolic link nor a directory, matches no
    skip pattern and matches at least one match pattern. Errors from
    os.listdir (for example a missing directory) are not caught.
    """

    # ---- path prefix for every entry
    # ---- add the separator if the caller left it off, otherwise
    # ---- 'docs' + 'a.txt' would become 'docsa.txt' and the link and
    # ---- directory tests below would look at the wrong path

    prefix = dir

    if prefix and not prefix.endswith(('/',os.sep)):
        prefix = prefix + '/'

    dct = {}                   # file dictionary (list of file)

    for f in os.listdir(dir):

        # ---- skip hidden files (file name starts with a '.')

        if f.startswith('.'):
            continue

        # ---- file path and name

        ff = prefix + f

        # ---- skip links and directories

        if os.path.islink(ff) or os.path.isdir(ff):
            continue

        # ---- skip the file name?

        if string_match_pattern(spat,f):
            continue

        # ---- match the file name?

        if not string_match_pattern(mpat,f):
            continue

        # ---- save the selected file name in the dictionary

        dct[f] = ff

    # ---- return the dictionary (list of file)

    return dct


# -------------------------------------------------------------------
# add links to the output web page
#
# ofile  output file
# dct    is a dictionary containing selected file names
#        dictionary key   = file name
#        dictionary value = path + file name
# -------------------------------------------------------------------

def create_web_page_links(ofile,dct):
    """Write one HTML link per selected file, inside a single <p>.

    ofile  output file (only ``write`` is used)
    dct    dictionary containing selected file names
           dictionary key   = file name       (used as the link text)
           dictionary value = path + file name (used as the href)

    Links are written in sorted key order and separated by <br>.
    The href is percent-encoded ('/' is kept) and the link text is
    HTML-escaped, so names containing spaces, '&', '#', '"' and the
    like still produce a valid, working link.
    """

    links = [ '<a href="{}">{}</a>\n'.format(
                  urllib.parse.quote(str(dct[k])),
                  html.escape(str(k)))
              for k in sorted(dct) ]

    ofile.write('<p>\n')

    # ---- '<br>' goes between links only, never before the first one

    ofile.write('<br>\n'.join(links))

    ofile.write('</p>\n')


# ===================================================================
# main
# ===================================================================

if __name__ == '__main__':

    # ---- does the directory to search/process exist?

    if not os.path.isdir(DIRECTORY):
        print()
        print('No directory found')
        print('Output file NOT created or modified')
        print('DIRECTORY   : {}'.format(DIRECTORY))
        print('OUTPUT FILE : {}'.format(OUTFILE))
        print()
        # SystemExit instead of quit(): quit() is only defined when
        # the site module is loaded; exit status stays 0 as before
        raise SystemExit

    # ---- fix the directory name string (if we need to)
    # ---- it must end in '/' or be empty
    # ---- (belt and suspenders - double check)

    if DIRECTORY and not DIRECTORY.endswith('/'):
        DIRECTORY = DIRECTORY + '/'

    # ---- get a dictionary of selected files

    dct = get_list_of_files(DIRECTORY,FILEMATCHPATS,FILESKIPPATS)

    # ---- any files found to process?

    if not dct:
        print()
        print('No files found in directory to process')
        print('Output file NOT created or modified')
        print('DIRECTORY   : {}'.format(DIRECTORY))
        print('OUTPUT FILE : {}'.format(OUTFILE))
        for p in FILESKIPPATS:
            print('FILE SKIP  PATTERN: {}'.format(p))
        for p in FILEMATCHPATS:
            print('FILE MATCH PATTERN: {}'.format(p))
        print()
        raise SystemExit

    # ---- create output file
    # ---- ('with' closes the file even if one of the writers raises)

    with open(OUTFILE,"w",encoding="utf-8") as ofile:

        start_web_page(ofile,DIRECTORY,AUTHOR,CSSFILE)

        create_web_page_links(ofile,dct)

        end_web_page(ofile)

    print()
    print('{} links written to file'.format(len(dct)))
    print()