#! /usr/bin/python3
# ===================================================================
# parse a string containing CSVs and return a list
# containing the values
#
# Note: This is based on my Perl CSV parser.
# ===================================================================
# Note:
# 1. Some data may have values that are not surrounded by
#    quotes but have embedded quotes in them. This does not
#    follow the rules for normal CSV strings and makes
#    the parser more complex. None of the existing parsers
#    I found were able to handle this kind of data.
# 2. Embedded quotes (" or ') in a value are returned as part of
#    the value.
# 3. The parser recognizes the following value formats:
#       "ab"c" ,       returns (ab"c)
#       "ab'c" <EOS>   returns (ab'c)
#       'ab,c' ,       returns (ab,c)
#       'a bc' <EOS>   returns (a bc)
#       ab c ,         returns (ab c)
#       ab c <EOS>     returns (ab c)
# 4. The comma is a separator, not a terminator. The string "a,"
#    returns two values (strings), an "a" and an empty string.
#    (EOS terminates the last string.)
# 5. The \n must be removed from a string before parsing it.
#    (Lines read from a file include \n characters.)
# 6. Values are returned with leading and trailing spaces
#    removed.
# 7. I coded this the hard (long) way. I did not use a
#    convoluted, complex regular expression. I think it is
#    easier to see what is going on this way.
# ===================================================================
# Something to test: instead of using string slice, use/add group(2)
#                    in the regular expressions. Which is more
#                    efficient (faster)?
# ===================================================================

import re

# ---- compile regexp patterns (non-greedy matching)
# ---- The patterns are tried in numeric order; the order matters
# ---- (e.g. p1 must be tried before p2, otherwise '"a", "b"' would
# ---- be swallowed as one value).

p1 = re.compile(r'^\s*"(.*?)"\s*,')     # "value" followed by a comma
p2 = re.compile(r'^\s*"(.*?)"\s*$')     # "value" at end of string
p3 = re.compile(r"^\s*'(.*?)'\s*,")     # 'value' followed by a comma
p4 = re.compile(r"^\s*'(.*?)'\s*$")     # 'value' at end of string

# ---- opening quote that is never closed (malformed input).
# ---- A backreference cannot be used inside a character class
# ---- ([^\1] means "any character except chr(1)"), so a negative
# ---- lookahead is used to say "any character except the opening
# ---- quote". DOTALL lets the scan run across a stray newline.
p5 = re.compile(r'^\s*(["\'])(?:(?!\1).)*$', re.DOTALL)

p6 = re.compile(r'^\s*(.*?)\s*,')       # bare value followed by a comma
# ---- bare value at end of string; the group is non-greedy so that
# ---- trailing spaces are left to the \s* and not returned.
p7 = re.compile(r'^\s*(.*?)\s*$')

# ---- what to do when a pattern matches
_MORE = 'more'      # store the value, keep parsing after the comma
_LAST = 'last'      # store the value, it is the final one
_ERROR = 'error'    # malformed input, give up

_RULES = (
    (p1, _MORE),
    (p2, _LAST),
    (p3, _MORE),
    (p4, _LAST),
    (p5, _ERROR),
    (p6, _MORE),
    (p7, _LAST),
)


# -------------------------------------------------------------------
# ---- parse CSV string, return a list of values
# -------------------------------------------------------------------

def ParseCSV(csvstr):
    """Split a CSV string into a list of value strings.

    Values may be enclosed in double quotes, enclosed in single
    quotes, or bare. The enclosing quotes are removed; quotes
    embedded in a value are kept. Bare values are returned without
    leading or trailing whitespace. The comma is a separator, so
    'a,' yields ['a', ''].

    Args:
        csvstr: the string to parse (newline characters should
            already be removed).

    Returns:
        The list of values in order of appearance. An empty list is
        returned when csvstr is empty, when a value opens with a
        quote that is never closed, or when no pattern matches the
        remaining text.
    """

    # ---- is there a string to parse?

    if not csvstr:
        return []

    values = []
    rest = csvstr

    # ---- peel one value off the front of the string per iteration

    while True:

        # ---- first pattern (in order) that matches wins

        for pattern, action in _RULES:
            m = pattern.match(rest)
            if m:
                break
        else:
            # ---- nothing matched (e.g. an embedded newline)
            return []

        if action == _ERROR:
            return []

        values.append(m.group(1))

        if action == _LAST:
            return values

        # ---- drop the matched value and its comma, parse the rest

        rest = rest[m.end():]


# -------------------------------------------------------------------
# ---- main
# -------------------------------------------------------------------

if __name__ == '__main__':

    import user_interface as ui

    if not ui.running_python3():
        print('end program - not running Python3')
        quit()

    while True:                          # loop

        ui.clear_screen()

        print()
        s = ui.get_user_input('Enter CSV string: ')
        if not s:                        # empty string?
            break

        if s == 'empty':                 # test an empty string
            s = ''

        lst = ParseCSV(s)

        print()
        print('---- end of CSV parse ----')
        print(f'CSV string is {s}')
        print(f'CSV list length is {len(lst)}')

        if len(lst) < 1:
            print('CSV list is empty')
        else:
            print('CSV list:')
            for value in lst:
                print(f'-> ({value})')

        ui.pause()