# word_statistics.py

# =========================================================
# Read a text file and report statistics about the
# words found in it
# =========================================================

import math
import sys
import os
import platform

# ---------------------------------------------------------
# global variables
#
# punctuation  punctuation characters
# wdict        dict  - holds words and word counts
# wline        count - lines in file
# wtotal       count - total number of words found in file
# wunique      count - unique words in file
# ---------------------------------------------------------

# characters that count as punctuation when they end a word
punctuation = '''.?!,;:-_()[]{}"'/\\'''

# maps each word to the number of times it has been seen
wdict = {}

# running counters: lines processed, total words, distinct words
wline = wtotal = wunique = 0

# ---------------------------------------------------------
# am I running Python 3?
# ---------------------------------------------------------

def RunningPython3():
    """Return True when the interpreter's major version is 3, else False."""
    major = sys.version_info[0]
    return major == 3

# ---------------------------------------------------------
# get user input (Python 2 or 3)
# ---------------------------------------------------------

def GetUserInput(prompt,py3):
    """Prompt the user and return the reply with surrounding whitespace removed.

    prompt -- text displayed before reading the reply
    py3    -- True to read with input(), False to read with raw_input()
    """
    # only the selected name is looked up, so raw_input is never
    # touched when py3 is True
    reader = input if py3 else raw_input
    reply = reader(prompt)
    return reply.strip()

# ---------------------------------------------------------
# pause program
# ---------------------------------------------------------

def Pause(py3):
    """Print a blank line, then wait until the user presses enter.

    py3 -- passed through to GetUserInput to pick the input function
    """
    prompt = 'Press enter to continue '
    print('')
    # whatever the user typed is discarded; only the enter key matters
    _ = GetUserInput(prompt,py3)

# ---------------------------------------------------------
# clear the screen
# ---------------------------------------------------------

def ClearScreen():
    """Clear the terminal using the command for the current platform.

    Windows uses 'cls'; every other platform (Linux, macOS, ...) uses
    'clear'.
    """
    if platform.system() == 'Windows':
        command = 'cls'
    else:
        command = 'clear'
    os.system(command)

# ---------------------------------------------------------
# test for a punctuation character at the end of a string
# ---------------------------------------------------------
# punctuation characters are: period, question mark,
# exclamation mark, comma, semicolon, colon, dash,
# hyphen, parentheses, brackets, braces, apostrophe,
# quote marks and ellipsis.
# ---------------------------------------------------------
# Note: depending on OS, editor settings, etc., dashes,
# hyphens, apostrophes, ellipses, and quote marks
# may appear differently in the text file
# ---------------------------------------------------------

def HasPunctuation(str,py3):
    """Return True when the last character of str is a punctuation character.

    str -- the string to test (name kept for compatibility with callers)
    py3 -- unused; kept so the signature matches the other helpers

    An empty string has no last character and yields False.
    """
    # guard first: indexing [-1] on an empty string raises IndexError
    if not str:
        return False
    return str[-1] in punctuation

# ---------------------------------------------------------
# process a text file
# ---------------------------------------------------------

def ProcessTextFile(file,py3):
    """Read a text file and feed every non-blank line to ProcessTextLine.

    file -- path of the text file to read
    py3  -- passed through to ProcessTextLine

    The global wline is incremented once per non-blank line. Errors from
    open() (for example a missing file) propagate to the caller.
    """
    global wline
    # the with-statement closes the file even if processing a line raises
    with open(file,'r') as inFile:
        for line in inFile:
            line = line.strip()
            if line:
                ProcessTextLine(line,py3)
                wline += 1

# ---------------------------------------------------------
# process a line (string) of text
#
# convert words to lowercase and remove any punctuation at
# the end of words - count the words
# ---------------------------------------------------------

def ProcessTextLine(line,py3):
    """Count the words in one line of text and return how many were found.

    line -- the text to split on whitespace
    py3  -- unused; kept so the signature matches the other helpers

    Each token is lowercased and all punctuation characters at its end
    are removed. Tokens that consist only of punctuation (for example a
    free-standing dash) are not words and are skipped. The globals wdict,
    wtotal and wunique are updated for every word counted.
    """
    global wdict, wtotal, wunique
    wc = 0
    for w in line.split():
        # rstrip removes every trailing punctuation character, so
        # 'end.)' becomes 'end' and '...' becomes ''
        w = w.lower().rstrip(punctuation)
        if not w:
            continue
        if w in wdict:
            wdict[w] += 1
        else:
            wdict[w] = 1
            wunique += 1
        wc += 1
        wtotal += 1
    return wc

# ---------------------------------------------------------
# main
# ---------------------------------------------------------

if __name__ == '__main__':

    py3 = RunningPython3()

    # -----------------------------------------------------
    # choose the text file: the first command-line argument
    # if one was given, otherwise the default file
    # -----------------------------------------------------

    ##file = 'gettysburg_address.txt'
    if len(sys.argv) > 1:
        file = sys.argv[1]
    else:
        file = 'declaration_of_independence.txt'

    ProcessTextFile(file,py3)

    # -----------------------------------------------------
    # display the words found in the text file
    # (debug listing - uncomment to use; works on Python 2 and 3)
    # -----------------------------------------------------
    ##
    ##print('')
    ##
    ###normal order
    ##for k,v in wdict.items():
    ##    print('{}: {}'.format(k,v))
    ##
    ###sort on key
    ##for k in sorted(wdict):
    ##    print('{}: {}'.format(k,wdict[k]))
    ##
    ###sort on value (count)
    ##i = 0
    ##for k,v in sorted(wdict.items(), reverse=True,
    ##                  key=lambda kv: (kv[1],kv[0])):
    ##    print('[{:02}] {:>4}: {}'.format(i,v,k))
    ##    i += 1
    ##Pause(py3)

    # -----------------------------------------------------
    # display text file statistics
    # -----------------------------------------------------

    print('')
    print('Text file: {}'.format(file))
    print('{} unique words in text file'.format(wunique))
    print('{} words in text file'.format(wtotal))
    Pause(py3)

    # -----------------------------------------------------
    # search the dictionary for a specific word
    # (an empty entry ends the search loop)
    # -----------------------------------------------------

    while True:

        ClearScreen()

        print('------ Search for Word ------')
        print('')
        w = GetUserInput('Enter search word: ',py3)

        if w == '':
            break

        # words are stored in lowercase, so search in lowercase
        w = w.lower()

        if w in wdict:
            print('')
            print('Found {}, word count is {}'.format(w,wdict[w]))
        else:
            print('')
            print('{} not found'.format(w))
        Pause(py3)

    print('')