# ParseCSV.py

#! /usr/bin/python3
# ===================================================================
# parse a string containing CSVs and return a list
# containing the values
#
# Note: This is based on my Perl CSV parser.
# ===================================================================
# Note:
#   1. Some data may have values that are not surrounded by
#      quotes but have imbedded quotes in them.  This does not
#      follow the rules for normal CSV strings and makes
#      the parser more complex.  None of the existing parsers
#      I found were able to handle this kind of data.
#   2. Imbedded quotes (" or ') in a value are returned as part of
#      the value.
#   3. The parser recognizes the following value formats:
#        "ab"c"  ,        returns (ab"c)      
#        "ab'c"  <EOS>    returns (ab'c)
#        'ab,c' ,         returns (ab,c)
#        'a bc'  <EOS>    returns (a bc)
#        ab c ,           returns (ab c)
#        ab c     <EOS>   returns (ab c)
#   4. The comma is a separator, not a terminator. The string "a,"
#      returns two values (strings), an "a" and an empty string.
#      (EOS terminates the last string.)
#   5. The \n must be removed from a string before parsing it.
#      (Lines read from a file include the trailing \n character.)
#   6. Values are returned with leading and trailing spaces
#      removed.
#   7. I coded this the hard (long) way. I did not use a
#      convoluted, complex regular expression. I think it is
#      easier to see what is going on this way.
# ===================================================================
# Something to test: instead of using a string slice, use/add group(2)
#                    in the regular expressions. Which is more
#                    efficient (faster)?
# ===================================================================

import re

# ---- compile regexp patterns (every capture is non-greedy)
# ---- each pattern is anchored at the start of the string; p1-p4 and
# ---- p6-p7 capture the value in group(1)

# ---- double-quoted value followed by a comma (p1) or by EOS (p2)
# ---- the capture may contain quotes: "ab"c" , captures ab"c

p1 = re.compile(r'^\s*"(.*?)"\s*,')
p2 = re.compile(r'^\s*"(.*?)"\s*$')

# ---- single-quoted value followed by a comma (p3) or by EOS (p4)

p3 = re.compile(r"^\s*'(.*?)'\s*,")
p4 = re.compile(r"^\s*'(.*?)'\s*$")

# ---- error pattern: the string starts with a quote character
# ---- (group(1) is the quote itself, not a value)
# ---- NOTE(review): this used to be spelled [^\1] in a non-raw
# ----   string. That is not a back-reference to the opening quote:
# ----   it is the octal escape for chr(1), so the class is "any
# ----   character except chr(1)". The explicit \x01 below keeps the
# ----   exact same matching behaviour without the invalid \s escape.
# ----   Confirm whether "no closing quote" was the real intent.

p5 = re.compile(r'^\s*(["\'])[^\x01]*$')

# ---- unquoted value followed by a comma (p6) or by EOS (p7)
# ---- the capture in p7 must be non-greedy too, otherwise trailing
# ---- spaces end up inside the value instead of in the final \s*

p6 = re.compile(r'^\s*(.*?)\s*,')
p7 = re.compile(r'^\s*(.*?)\s*$')


# -------------------------------------------------------------------
# ---- parse CSV string, return a list of values
# -------------------------------------------------------------------

def ParseCSV(csvstr):
    """Split a CSV string into a list of values.

    The module-level patterns p1..p7 are tried, in that order, against
    the not-yet-consumed part of the string. The first one that matches
    decides what happens next:

      p1, p3, p6  value followed by a comma: keep group(1) and go on
                  with the text after the match
      p2, p4, p7  value that runs to the end of the string: keep
                  group(1) and stop
      p5          string rejected: everything collected so far is
                  discarded

    Returns the list of captured values. Returns an empty list when
    csvstr is empty, when p5 matches, or when no pattern matches.
    """

    # ---- is there a string to parse?

    if len(csvstr) == 0:
        return []

    # ---- what to do after a pattern has matched

    MORE, LAST, REJECT = 'more', 'last', 'reject'

    # ---- patterns in priority order (the order matters)

    rules = (
        (p1, MORE),
        (p2, LAST),
        (p3, MORE),
        (p4, LAST),
        (p5, REJECT),
        (p6, MORE),
        (p7, LAST),
    )

    values = []
    rest = csvstr

    # ---- consume one value per pass until the string is used up

    while True:

        # ---- find the first pattern that matches the remaining text

        for pattern, outcome in rules:
            found = pattern.match(rest)
            if found is not None:
                break
        else:
            # ---- nothing matched at all
            return []

        if outcome == REJECT:
            return []

        values.append(found.group(1))

        if outcome == LAST:
            return values

        # ---- drop the matched value (and its comma) from the front

        rest = rest[found.end():]

# -------------------------------------------------------------------
# ---- main
# -------------------------------------------------------------------

if __name__ == '__main__':

    # ---- interactive test driver: repeatedly ask for a CSV string,
    # ---- parse it with ParseCSV and print the resulting values
    # ---- (the ui.* helpers come from the project module
    # ---- user_interface, which is not part of this file)

    import sys

    import user_interface as ui

    if not ui.running_python3():
        print('end program - not running Python3')
        # ---- sys.exit() rather than quit(): quit() is only injected
        # ---- by the site module for interactive use
        sys.exit()

    while True:                # loop

        ui.clear_screen()
        print()
        s = ui.get_user_input('Enter CSV string: ')
        if not s:              # empty string? end the program
            break

        if s == 'empty':       # keyword to test an empty CSV string
            s = ''

        lst = ParseCSV(s)

        print()
        print('---- end of CSV parse ----')
        print(f'CSV string is {s}')
        print(f'CSV list length is {len(lst)}')
        if not lst:
            print('CSV list is empty')
        else:
            print('CSV list:')
            # ---- separate loop name so the input string s is not
            # ---- overwritten while printing
            for value in lst:
                print(f'->  ({value})')
        ui.pause()