# dataset.py

#!/usr/bin/python3
# ==================================================================
# FROM: www.youtube.com/watch?v=u4rsA5ZiTls
# This INCREDIBLE trick will speed up your data processes.
# ==================================================================

import pandas as pd
import numpy as np

# Path of the CSV file this script writes with DataFrame.to_csv and
# reads back with pd.read_csv.
csv_file = 'dataset.csv'

def get_dataset(size):
    '''Build a random fake dataset with `size` rows.

    Columns, in order: size, age, team, win, date, prob.
    The random draws are made in that same column order, so seeding
    numpy's global RNG beforehand makes the result reproducible.
    '''
    # every calendar day from 2020-01-01 through 2022-12-31
    calendar = pd.date_range('2020-01-01', '2022-12-31')
    columns = {
        'size': np.random.choice(['big', 'medium', 'small'], size),
        'age':  np.random.randint(1, 50, size),     # upper bound is exclusive
        'team': np.random.choice(['red', 'blue', 'yellow', 'green'], size),
        'win':  np.random.choice(['yes', 'no'], size),
        'date': np.random.choice(calendar, size),
        'prob': np.random.uniform(0, 1, size),
    }
    return pd.DataFrame(columns)

def set_dtypes(df):
    '''Convert the dataset columns to compact dtypes.

    The frame is modified in place and also returned.

    size, team -> category
    age        -> int16
    win        -> bool ('yes' -> True, 'no' -> False)
    prob       -> float16

    Raises ValueError if the 'win' column holds anything other than
    'yes' / 'no' (a column that is already boolean is left as is).
    '''
    win = df['win']
    if not pd.api.types.is_bool_dtype(win):
        # astype('bool') on strings only tests truthiness, so both 'yes'
        # and 'no' would turn into True; map the labels explicitly.
        flags = win.map({'yes': True, 'no': False})
        if flags.isna().any():
            raise ValueError("column 'win' must contain only 'yes' or 'no'")
        win = flags.astype('bool')

    df['size'] = df['size'].astype('category')
    df['age']  = df['age'].astype('int16')
    df['team'] = df['team'].astype('category')
    df['win']  = win
    df['prob'] = df['prob'].astype('float16')
    return df

# ------------------------------------------------------------------
# ---- create dataset (without set dtypes)
# ------------------------------------------------------------------
# Section banner: blank line, rule, title, rule, blank line.
print('', '='*50, 'create dataset (without set dtypes) ' + '-'*14, '='*50, '', sep='\n')

df = get_dataset(10_000)

print('create ds info ' + '-'*35)
df.info()
print('create ds head ' + '-'*36)
print(df.head())

# ---- CSV

print('', 'write csv ' + '-'*40, '', sep='\n')

# The row index is written as the first (unnamed) CSV column.
# Pass index=False to to_csv to leave it out.
df.to_csv(csv_file)

## look at size of file
## command line: ls -GFlash dataset.csv

print('read csv ' + '-'*41, '', sep='\n')

# Read back without index_col, so the saved index comes back as an
# ordinary column; index_col=[0] would restore it as the index.
df = pd.read_csv(csv_file)

print('read csv info ' + '-'*36)
df.info()
print('read csv head ' + '-'*36)
print(df.head())

# ------------------------------------------------------------------
# ---- create dataset (with set_dtypes)
# ------------------------------------------------------------------
print()
print('='*50)
print('create dataset (with set dtypes) ' + '-'*17)
print('='*50)
print()

# Same fake data as before, but with the compact dtypes applied.
df = get_dataset(10_000)
df = set_dtypes(df)
print('create ds info ' + '-'*35)
df.info()
print('create ds head ' + '-'*36)
print(df.head())

# ---- CSV

print()
print('write csv ' + '-'*40)
print()

df.to_csv(csv_file)
##df.to_csv(csv_file,index=False)
##df.to_csv(csv_file,index=True)

## look at size of file
## command line: ls -GFlash dataset.csv

print('read csv ' + '-'*41)
print()

# Actually load the file back: the info/head below must describe the
# round-tripped frame, not the one still in memory.
df = pd.read_csv(csv_file)
##df = pd.read_csv(csv_file,index_col=[0])

print('read csv info ' + '-'*36)
df.info()
print('read csv head ' + '-'*36)
print(df.head())