# tree_dir_list.py

#! /usr/bin/python3
# ===================================================================
# get a list of directories in a directory tree
# ===================================================================

import re
import os

# -------------------------------------------------------------------
# test if a string matches one of a list/tuple of regular expressions
#
# Regular expressions use the backslash character ('\') to
# indicate special forms or to allow special characters to
# be used without invoking their special meaning. This collides
# with Python's usage of the same character for the same purpose
# in string literals. The solution is to use Python's raw string
# notation for regular expression patterns; backslashes are not
# handled in any special way in a string literal prefixed with 'r'.
# r"\n" is a two-character string containing '\' and 'n', while
# "\n" is a one-character string containing a newline.
#
# For example to match html files: r'\.html$' or '\\.html$'
# -------------------------------------------------------------------

def StringMatchPattern(patterns,str):
    '''
    Report whether a string matches at least one regular expression.

    Arguments:

      patterns - list or tuple of RegExp patterns; each one is tried
                 in turn with re.search, ignoring case

      str      - the string to test

    Returns True as soon as one pattern matches, otherwise False
    (an empty list of patterns therefore gives False).
    '''

    return any(re.search(regex,str,re.IGNORECASE) for regex in patterns)

# -------------------------------------------------------------------
# get a list of directories
# -------------------------------------------------------------------

def GetListOfDirs(skipdirs,treedir,dirlist,level=None,verbose=False):
    '''
    Create a list of directories in all or part of a directory tree.
    (no regular files or links)

    Hidden entries (names starting with a period), symbolic links and
    non-directories are neither captured nor searched.

    Arguments:

      skipdirs - list or tuple of the directories to not
                 capture. They are RegExp patterns and are tested
                 against the directory name only (not the path).
                 A skipped directory is not searched either.

      treedir  - current directory in the directory tree
                 being processed

      dirlist  - the list of captured directories
                 (path + directory name). It is appended to
                 in place.

      level    - The number of directory levels to capture
                 level = None -- capture all levels of directories
                 level < 0    -- capture nothing and return
                 level = 0    -- capture the directories found
                                 directly in treedir
                 level > 0    -- as level = 0, then search each
                                 captured directory with level - 1

      verbose  - print messages describing what the code is doing

    Returns:

      True in every case. Errors raised by os.listdir are not
      caught here.
    '''

    if verbose:
        print("GetListOfDirs({},level={},verbose={})".
            format(treedir,level,verbose))

    # --- have we captured enough directory levels?

    if level is not None:
        if level < 0:
            return True
        level = level - 1

    # --- treedir must end in '/'
    # --- (entry names are appended directly to it below)

    if not treedir.endswith('/'):
        treedir = treedir + '/'

    if verbose:
        print("searching dir       {}".format(treedir))

    # --- test every entry in the directory

    for d in os.listdir(treedir):

        # ---- skip hidden files and directories
        # ---- note: they start with a period '.'

        if d.startswith('.'):
            continue

        if verbose:
            print("testing   dir entry {}".format(d))

        dd = treedir + d

        # ---- skip links
        # ---- (tested first because isdir is also true for a
        # ---- link that points to a directory)

        if os.path.islink(dd):
            continue

        # ---- skip non-directories

        if not os.path.isdir(dd):
            continue

        # ---- skip the directory?

        if StringMatchPattern(skipdirs,d):
            if verbose:
                print("skipping  dir       {}".format(d))
            continue

        # ---- add the directory to the list

        dirlist.append(dd)

        # ---- search sub-directory

        GetListOfDirs(skipdirs,dd,dirlist,level,verbose)

    return True

# ===================================================================
# main - testing
#
# skipdirs - a list or a tuple of RegEx search patterns
# treeroot - root of a directory tree to search
# ===================================================================

if __name__ == '__main__':

    # ---- directory names (RegEx patterns) to leave out of the list

    skipdirs = [ "^wiki$", "^x$" ]

    # ---- root of the tree to search; the result list starts out
    # ---- holding the root itself

    treeroot = '/var/www/html'
    treedirs = [treeroot]

    print('---- directories -----------------------------------')

    ok = GetListOfDirs(skipdirs,treeroot,treedirs)

    if not ok:
        print("GetListOfDirs failed")
    elif not treedirs:
        print('No directories found in tree ({})'.format(treeroot))
    else:
        # ---- report the captured directories in sorted order,
        # ---- followed by a count
        treedirs.sort()
        for path in treedirs:
            print(path)
        print()
        print("{} directories found".format(len(treedirs)))
        print()