dir_file_list.py

#! /usr/bin/python3
# ==================================================================
# get a list of specific files in a directory
# ==================================================================

import re
import os

# -------------------------------------------------------------------
# test if a string matches one of a list/tuple of regular expressions
#
# Regular expressions use the backslash character ('\') to
# indicate special forms or to allow special characters to
# be used without invoking their special meaning. This collides
# with Python's usage of the same character for the same purpose
# in string literals. The solution is to use Python's raw string
# notation for regular expression patterns; backslashes are not
# handled in any special way in a string literal prefixed with 'r'.
# r"\n" is a two-character string containing '\' and 'n'.
#
# For example to match html files: r'\.html$' or '\\.html$'
# -------------------------------------------------------------------

def StringMatchPattern(patterns,str):
    """
    Report whether a string matches at least one regular expression.

    Each pattern is tried in order with re.search (so it may match
    anywhere in the string unless anchored) and case is ignored.

    Args:

      patterns - list or tuple of regular expression patterns

      str      - the string to be tested

    Returns:

      True if any pattern matches, otherwise False
      (also False when patterns is empty)
    """

    # any() stops at the first pattern that matches
    return any(re.search(pattern, str, re.IGNORECASE) for pattern in patterns)

# -------------------------------------------------------------------
# get a list of regular files (not directories or links)
# that match a given regular expression
# -------------------------------------------------------------------

def GetListOfFiles(includefiles,dir,filelist,verbose=False):
    """
    Collect the regular files in a directory whose names match a pattern.

    Only the directory itself is scanned (no recursion). Hidden entries
    (names starting with a period), symbolic links and sub-directories
    are skipped. Matching ignores case and uses a regular expression
    search, so a pattern may match anywhere in the file name unless it
    is anchored.

    Args:

      includefiles - list or tuple of files to be captured.
                     They are RegExp patterns (strings); a file is
                     captured when its name matches at least one.

      dir          - directory to be searched
                     (a trailing '/' is optional)

      filelist     - list the captured files are appended to
                     (path + file name)

      verbose      - print messages describing what the code is doing

    Returns:

      True (always). A missing/unreadable directory or an invalid
      pattern is reported by the exception raised from os.listdir
      or re.compile.
    """

    if verbose:
        print("GetListOfFiles({})".format(dir))

    # --- dir must end in a '/'
    # --- (add one only when it is missing; an empty string is left
    # ---  alone so it is not silently turned into the root directory)

    if dir and not dir.endswith('/'):
        dir = dir + '/'

    if verbose:
        print("Searching dir  {}".format(dir))

    # --- compile regular expressions (once, not once per file)

    compiled = [re.compile(p, re.IGNORECASE) for p in includefiles]

    # --- get a list of entries in the directory

    files = os.listdir(dir)

    # --- add a file to the list

    for f in files:

        # ---- skip hidden files and directories
        # ---- note: they start with a period '.'

        if f.startswith('.'):
            continue

        if verbose:
            print("testing   file {}".format(f))

        ff = dir + f

        # ---- skip links and directories
        # ---- (islink is tested first because isdir follows links)

        if os.path.islink(ff) or os.path.isdir(ff):
            continue

        # ---- skip the file? (its name matches none of the patterns)

        if not any(p.search(f) for p in compiled):
            if verbose:
                print('skipping  file {}'.format(f))
            continue

        filelist.append(ff)

    return True

# ==================================================================
# main - testing
#
# includefiles - a list or a tuple of RegEx search patterns
# dir          - the directory to search for files
# ==================================================================

if __name__ == '__main__':

    # ---- file name patterns to report (html and python files)

    patterns = [ "\\.html$", r"\.py$" ]

    # ---- directory to search

    search_dir = '/var/www/html'

    # ---- GetListOfFiles appends the matching paths to this list

    found = []

    print('---- files -----------------------------------------')

    if not GetListOfFiles(patterns,search_dir,found):
        print("GetListOfFiles failed")
    elif not found:
        print("No files found in directory ({})".format(search_dir))
    else:
        # ---- report the paths in sorted order, then a count

        found.sort()

        for path in found:
            print(path)
        print()
        print("{} files found".format(len(found)))
        print()