Create Your Own GREP Utility

Introduction

GREP is a Linux/Unix utility program. It searches one or more files for lines containing a match to a specified pattern (regular expression). It prints lines that contain a match.

Google "grep" for more information.

Project #0

What does GREP stand for?

Project #1

Create your own Python3 version of GREP. Below is my version written in Perl. You can use it as a template.

Project #2

Create a reverse version of GREP that prints the names of files it did not find a match in.

Why the reverse version of GREP? Assume you are modifying several files. You could use the "reverse" version of GREP to tell you which files have yet to be modified.

Project #3

Modify project #1 to use a list of search patterns. (Enter a CSV string on the command line? or read them from a file? or some other way?)

My GREP Utility Written in Perl

#!/bin/perl -w
#=======================================================================
# MY GREP is a very simplified version of the Linux/Unix utility
# program. However, MY GREP has enough functionality for my needs.
#=======================================================================

use strict;
use DirHandle;
use FileHandle;                   # module names are case sensitive

#-----------------------------------------------------------------------
# global variables
#
# CASESENSITIVE flag - case sensitive searches
# DEBUG         flag - display debug messages
# DIRCOUNT      count - directories processed
# FAILURE       return value - failure
# FILECOUNT     count - files processed
# FILEPATTERN   regular expression - names of the files to search
# MATCHCOUNT    count - number of matches found in processed files
# ROOTDIR       root directory
# STARTTIME     script start time
# STRINGPATTERN regular expression - text to search for in each line
# SUBDIR        flag - process subdirectories
# SUCCESS       return value - success
# TOTALLINES    count - lines in all of the files processed
# VERBOSE       flag - display verbose messages
#
# The flags and the two patterns may be overridden from the command
# line (see ProcessCommandLineArguments). ROOTDIR and SUBDIR have no
# command line option; edit the values below to change them.
#-----------------------------------------------------------------------

my $CASESENSITIVE = 0;
my $DEBUG         = 0;
my $DIRCOUNT      = 0;
my $FAILURE       = 0;
my $FILECOUNT     = 0;
my $FILEPATTERN   = '\.html$';    # regular expression (escape any "/")
my $STRINGPATTERN = 'error';      # regular expression (escape any "/")
my $MATCHCOUNT    = 0;
my $SUBDIR        = 0;
my $ROOTDIR       = './';         # must end in "/"
my $STARTTIME     = time;
my $TOTALLINES    = 0;
my $SUCCESS       = 1;
my $VERBOSE       = 0;

# process command line arguments (exit status 1 on a bad command line)

if (ProcessCommandLineArguments() != $SUCCESS) { exit 1; }

# display runtime environment

print "==================================================\n";
print 'Start time      = ' . localtime($STARTTIME) . "\n";
print "File pattern    = $FILEPATTERN\n";
print "Root dir        = $ROOTDIR\n";
print "Debug           = $DEBUG\n";
print "String pattern  = $STRINGPATTERN\n";
print "Subdir          = $SUBDIR\n";
print "Verbose         = $VERBOSE\n";
print "Case sensitive  = $CASESENSITIVE\n";
print "==================================================\n";

# search the directory tree (exit status 1 if a directory or file
# could not be opened; the error has already been printed)

if (ProcessDirectoryTree($ROOTDIR,$FILEPATTERN,
                         $STRINGPATTERN) != $SUCCESS) { exit 1; }

# display processing statistics

print "\n";
print "==================================================\n";
print "Dirs processed  = $DIRCOUNT\n";
print "Files processed = $FILECOUNT\n";
print "Matchs found    = $MATCHCOUNT\n";
print "Lines processed = $TOTALLINES\n";
print "==================================================\n";

exit 0;

#=======================================================================
# subroutines
#=======================================================================

#----------------------------------------------------------------------
# process a directory tree
#
# parameters
#    $_[0]   directory (path and name); a trailing "/" is added if
#            it is missing
#    $_[1]   file name match pattern (regular expression)
#    $_[2]   string match pattern (regular expression)
#
# returns $SUCCESS, or $FAILURE if a directory or a file could not be
# opened (processing stops at the first failure)
#
# The matching files of a directory are processed first; its
# subdirectories are processed afterwards, and only if $SUBDIR is set.
#----------------------------------------------------------------------

sub ProcessDirectoryTree
{
   my ($dir, $fpat, $spat) = @_;

   # file specifications are built as "$dir$name", so the directory
   # must end in "/" - the recursive call below passes a name without
   # one (an empty name is left alone so opendir reports the error)

   if ($dir ne '' && $dir !~ m{/\z}) { $dir .= '/'; }

   $DIRCOUNT++;                   # increment directory count

   if ($DEBUG) { print "\nProcessing Directory $dir\n"; }

   # get a list of all of the file names in the directory

   my $dh;                        # directory handle

   if (! opendir($dh,$dir))
   {
      print "\n";
      print "Error opening directory $dir\n";
      print "$!\n";
      print "\n";
      return $FAILURE;
   }

   my @fl = readdir $dh;          # array of file names

   closedir $dh;

   # process each file name, remembering subdirectories for later

   my @subdirs;                   # subdirectories (path + name)

   foreach my $f (@fl)
   {
      # ignore special cases

      if ($f eq '.' || $f eq '..') { next; }

      # create a file specification (path + name)

      my $fs = $dir . $f;

      # do not process subdirectories at this time

      if (-d $fs) { push @subdirs, $fs; next; }

      # it is not a directory - does the file name match the pattern?
      # if yes, process the file

      if ($f =~ /$fpat/)
      {
         if (ProcessFile($f,$fs,$spat) != $SUCCESS) { return $FAILURE; }
      }
   }

   if ($SUBDIR)                   # process subdirectories
   {
      foreach my $sd (@subdirs)
      {
         if (ProcessDirectoryTree($sd,$fpat,$spat) != $SUCCESS)
         { return $FAILURE; }
      }
   }

   return $SUCCESS;
}

#-----------------------------------------------------------------------
# process a file
#
# parameters
#    $_[0]   file name (used in the messages)
#    $_[1]   file specification (path + name) of the file to read
#    $_[2]   string pattern (regular expression)
#
# returns $SUCCESS, or $FAILURE if the file could not be opened
#
# Prints every line that matches the string pattern, prefixed with the
# file name and line number. The match ignores case unless
# $CASESENSITIVE is set. Updates $FILECOUNT, $MATCHCOUNT and
# $TOTALLINES.
#-----------------------------------------------------------------------

sub ProcessFile

{
   my ($f, $fs, $spat) = @_;

   if ($DEBUG || $VERBOSE) { print "   Processing file $f\n"; }

   $FILECOUNT++;               # increment file count

   # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   # open the input file
   #
   # three argument open: the file name is never parsed for a mode, so
   # names starting with ">" or "|" or containing leading/trailing
   # blanks are opened literally; the lexical handle is private to
   # this call
   # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

   my $in;                     # input file handle

   if (! open($in,'<',$fs))
   {
      print "\n";
      print "Error: can not open input file $fs\n";
      print "$!\n";
      print "\n";
      return $FAILURE;
   }

   # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   # read the file one line at a time
   # test if the string pattern matched the line
   # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

   # compile the pattern once, with or without the ignore case modifier

   my $re = $CASESENSITIVE ? qr/$spat/ : qr/$spat/i;

   my $c = 0;                  # file line number

   while (my $line = <$in>)
   {
      chomp $line;             # remove trailing \n from the line

      $c++;                    # increment line count

      if ($line =~ $re)
      {
         $MATCHCOUNT++;
         print "   $f" . "[$c]  $line\n";
      }
   }

   $TOTALLINES += $c;

   # end of file

   close($in);

   return $SUCCESS;
}

#-----------------------------------------------------------------------
# process command line arguments
#
# Keywords (any letter case) set flags:
#    DEBUG                              sets $DEBUG
#    VERBOSE                            sets $VERBOSE
#    CASESENSITIVE, CASE, SENSITIVE     set  $CASESENSITIVE
#
# The first argument that is not a keyword replaces $STRINGPATTERN and
# the second one replaces $FILEPATTERN. A third one is an error.
#
# returns $SUCCESS, or $FAILURE after printing an error message
#-----------------------------------------------------------------------

sub ProcessCommandLineArguments
{
   # keyword (uppercase) => flag variable it switches on

   my %flag_for = (
      DEBUG         => \$DEBUG,
      VERBOSE       => \$VERBOSE,
      CASESENSITIVE => \$CASESENSITIVE,
      CASE          => \$CASESENSITIVE,
      SENSITIVE     => \$CASESENSITIVE,
   );

   my $want_string = 1;       # string pattern not yet seen
   my $want_file   = 1;       # file pattern not yet seen

   foreach my $arg (@ARGV)
   {
      # uppercase copy of the argument for the keyword test

      (my $key = $arg) =~ tr/a-z/A-Z/;

      if (exists $flag_for{$key})
      {
         ${ $flag_for{$key} } = 1;
      }
      elsif ($want_string)
      {
         $STRINGPATTERN = $arg;
         $want_string   = 0;
      }
      elsif ($want_file)
      {
         $FILEPATTERN = $arg;
         $want_file   = 0;
      }
      else
      {
         print "\nError - Extra/Unknown parameter on command line\n";
         return $FAILURE;
      }
   }

   return $SUCCESS;
}

Windows Batch File

@echo off
rem =========================================================
rem Windows batch file to run my GREP
rem
rem Prompts for a search pattern and a file name pattern.
rem Pressing Enter at a prompt selects the default shown in
rem brackets. Expects grep.pl in the current directory.
rem =========================================================

rem keep s and f local to this batch file
setlocal

echo(
echo ---------- My Grep ----------

rem set /p leaves the variable unchanged when only Enter is
rem pressed, so clear any inherited value first
set "s="

echo(
set /p s="Enter regexp search pattern [error] : "

if not defined s set "s=error"

set "f="

echo(
set /p f="Enter regexp file name pattern [\.html$] : "

if not defined f set "f=\.html$"

perl -w grep.pl "%s%" "%f%"

pause

Search For Files Python Code

#!/usr/bin/python3
# ===================================================================
# search multiple directories for regular files that match
# one of multiple file name patterns (regular expression)
# ===================================================================

import re
import os

# ---- file name patterns (regular expressions)
pats = [
    r'\.html$',
    r'\.py$',
    r'\.txt$',
]

# ---- directories to search
dirs = [
    '.',
    './z',
]


# -------------------------------------------------------------------
# create a list of all file names that match a specified pattern
# in a directory
#
# dirs   list of directories to search
# pats   list of regexp to match file names
# lst    returned list of files names (path + name)
# -------------------------------------------------------------------

def list_of_files(dirs,pats):
    """Return the regular files in dirs whose name matches a pattern.

    dirs -- directories to search (subdirectories are not searched)
    pats -- regular expressions, tested against the file name only
            (not the path) with re.search

    Returns a list of absolute paths, in directory order and then in
    os.listdir order; a file is listed once per directory even when
    several patterns match it. os.listdir raises OSError (for example
    FileNotFoundError) if a directory can not be read.
    """

    lst = []

    # ---- for each directory

    for d in dirs:

        # ---- names of files in directory

        fils = os.listdir(d)

        # ---- directory (path) must end in '/'

        if not d.endswith('/'):
            d = d + '/'

        for f in fils:

            # ---- file path + name

            ff = d + f

            # ---- regular file?

            if not os.path.isfile(ff):
                continue

            # ---- file name matches a pattern?
            # ---- (any() stops at the first pattern that matches)

            if any(re.search(p,f) for p in pats):
                # ---- save absolute path + name
                lst.append(os.path.abspath(ff))

    return lst

# -------------------------------------------------------------------
# ---- main: list the matching files in sorted order
# -------------------------------------------------------------------

if __name__ == '__main__':

    # ---- collect the matching files from every directory
    matches = list_of_files(dirs,pats)

    # ---- order them by path, then show the whole list
    matches.sort()
    print(matches)