#!/usr/bin/python3
# ==================================================================
# FROM: www.youtube.com/watch?v=u4rsA5ZiTls
# This INCREDIBLE trick will speed up your data processes.
# ==================================================================

import pandas as pd
import numpy as np

csv_file = 'dataset.csv'


def get_dataset(size: int) -> pd.DataFrame:
    """Create a fake dataset with ``size`` randomly generated rows.

    Columns, in order: ``size``, ``age``, ``team``, ``win``, ``date``
    and ``prob``.  No dtype tuning is applied here; see ``set_dtypes``.

    Args:
        size: Number of rows to generate.

    Returns:
        A new DataFrame holding the random data.
    """
    df = pd.DataFrame()
    df['size'] = np.random.choice(['big', 'medium', 'small'], size)
    df['age'] = np.random.randint(1, 50, size)
    df['team'] = np.random.choice(['red', 'blue', 'yellow', 'green'], size)
    df['win'] = np.random.choice(['yes', 'no'], size)
    dates = pd.date_range('2020-01-01', '2022-12-31')
    df['date'] = np.random.choice(dates, size)
    df['prob'] = np.random.uniform(0, 1, size)
    return df


def set_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """Convert the dataset columns to compact data types.

    The frame is modified in place and also returned for convenience.

    Args:
        df: A frame shaped like the output of ``get_dataset``, whose
            ``win`` column holds the strings ``'yes'`` / ``'no'``.

    Returns:
        The same DataFrame object, with converted columns.
    """
    df['size'] = df['size'].astype('category')
    df['age'] = df['age'].astype('int16')
    df['team'] = df['team'].astype('category')
    # Map the labels explicitly: casting the strings straight to bool
    # would yield True for every row, since any non-empty string is truthy.
    df['win'] = df['win'].map({'yes': True, 'no': False}).astype('bool')
    # float16 halves the storage again but keeps only ~3 significant digits.
    df['prob'] = df['prob'].astype('float16')
    return df


def _run_demo(title: str, apply_dtypes: bool) -> pd.DataFrame:
    """Build a dataset, report it, round-trip it through CSV, report again.

    Args:
        title: Fully formatted banner line for this demo section.
        apply_dtypes: When true, run ``set_dtypes`` on the fresh dataset
            before reporting and writing it.

    Returns:
        The DataFrame as read back from ``csv_file``.
    """
    print()
    print('=' * 50)
    print(title)
    print('=' * 50)
    print()

    df = get_dataset(10_000)
    if apply_dtypes:
        df = set_dtypes(df)

    print('create ds info ' + '-' * 35)
    df.info()
    print('create ds head ' + '-' * 36)
    print(df.head())

    # ---- CSV
    print()
    print('write csv ' + '-' * 40)
    print()
    # NOTE: the index is written too, so the frame read back below gains
    # an extra 'Unnamed: 0' column.  Use to_csv(..., index=False) or
    # read_csv(..., index_col=[0]) to avoid that.
    df.to_csv(csv_file)
    ## look at size of file
    ## command line: ls -GFlash dataset.csv

    print('read csv ' + '-' * 41)
    print()
    # Read the file back so the report below describes what CSV actually
    # preserved rather than the in-memory frame.
    df = pd.read_csv(csv_file)
    print('read csv info ' + '-' * 36)
    df.info()
    print('read csv head ' + '-' * 36)
    print(df.head())
    return df


def main() -> None:
    """Run the demo without, then with, explicit dtypes."""
    # ------------------------------------------------------------------
    # ---- create dataset (without set dtypes)
    # ------------------------------------------------------------------
    _run_demo('create dataset (without set dtypes) ' + '-' * 14,
              apply_dtypes=False)

    # ------------------------------------------------------------------
    # ---- create dataset (with set_dtypes)
    # ------------------------------------------------------------------
    _run_demo('create dataset (with set dtypes) ' + '-' * 17,
              apply_dtypes=True)


if __name__ == '__main__':
    main()