Pandas DataFrame
import pandas as pd
import time
import pickle
import os
import seaborn as sns
import matplotlib.pyplot as plt
#in-house packages
from selfie_tools import vectorized as stv, single as sts
Part 1¶
#Reading tsv from 'read_csv()' function
# time.perf_counter() is a monotonic, high-resolution clock meant for interval
# timing; time.time() is wall-clock and can jump (e.g. NTP sync) mid-benchmark.
start = time.perf_counter()
df = pd.read_csv("canonical-smiles_Chembl29.tsv", sep='\t')
end = time.perf_counter()
tsv_readtime_1 = end - start
print(f"Time taken to read a CHEMBL29 tsv file by standard 'read_csv()' function: {tsv_readtime_1} seconds")
df.info()
df.head()
#TSV writing time (timed with time.perf_counter: monotonic, high resolution)
start = time.perf_counter()
df.to_csv('Copy_df.tsv', sep='\t', index=False)
end = time.perf_counter()
tsv_writetime_1 = end - start
print(f"Time taken to write a CHEMBL29 dataframe to a standard tsv file by the Pandas 'to_csv()' function: {tsv_writetime_1} seconds")
# On-disk size in bytes, used later in the storage comparison
file_size_tsv_1 = os.path.getsize('Copy_df.tsv')
print(file_size_tsv_1)
#Writing time to pickle (timed with time.perf_counter: monotonic, high resolution)
start = time.perf_counter()
df.to_pickle('Copy_df.pkl')
end = time.perf_counter()
pickle_writetime_1 = end - start
print(f"Time taken to write a CHEMBL29 dataframe as a pickled object file by the Pandas 'to_pickle()' function: {pickle_writetime_1} seconds")
# On-disk size in bytes, used later in the storage comparison
file_size_pickle_1 = os.path.getsize('Copy_df.pkl')
print(file_size_pickle_1)
#Writing time to parquet (timed with time.perf_counter: monotonic, high resolution)
# NOTE(review): the '.gzip' suffix is misleading — this is a Parquet file with
# gzip compression, conventionally named '.parquet'. Kept as-is so the read
# cells further down still find the file.
start = time.perf_counter()
df.to_parquet('Copy_df.gzip', compression='gzip')
end = time.perf_counter()
parquet_writetime_1 = end - start
print(f"Time taken to write a CHEMBL29 dataframe to the binary parquet format by the Pandas 'to_parquet()' function: {parquet_writetime_1} seconds")
file_size_parquet_1 = os.path.getsize('Copy_df.gzip')
print(file_size_parquet_1)
#Reading pickle df (timed with time.perf_counter: monotonic, high resolution)
start = time.perf_counter()
unpickled_df = pd.read_pickle("Copy_df.pkl")
end = time.perf_counter()
pickle_readtime_1 = end - start
print(f"Time taken to read a CHEMBL29 pickled file by 'read_pickle()' function: {pickle_readtime_1} seconds")
unpickled_df.head()
#Reading parquet df (timed with time.perf_counter: monotonic, high resolution)
# Fix: read 'Copy_df.gzip' from the working directory (where it was written
# above) instead of a hard-coded absolute path that only exists on the
# original author's machine.
start = time.perf_counter()
parquet_df = pd.read_parquet("Copy_df.gzip")
end = time.perf_counter()
parquet_readtime_1 = end - start
print(f"Time taken to read a CHEMBL29 parquet file by 'read_parquet()' function: {parquet_readtime_1} seconds")
parquet_df.head()
# Part 1 summary charts: one bar plot each for read time, write time and
# on-disk file size of the three storage formats.
part1_charts = [
    ([parquet_readtime_1, pickle_readtime_1, tsv_readtime_1],
     ['Parquet Read Time', 'Pickle Read Time', 'TSV Read Time'],
     "ChEMBL29 files reading time comparison: Part 1",
     "Different Techniques",
     "Total time taken in seconds (lower the better)"),
    ([parquet_writetime_1, pickle_writetime_1, tsv_writetime_1],
     ['Parquet Write Time', 'Pickle Write Time', 'TSV Write Time'],
     "ChEMBL29 files writing time comparison: Part 1",
     "Different Techniques",
     "Total time taken in seconds (lower the better)"),
    ([file_size_parquet_1, file_size_pickle_1, file_size_tsv_1],
     ['Parquet (gzip) File Size ', 'Pickle File Size', 'TSV File Size'],
     "ChEMBL29 files size comparison: Part 1",
     "Different File Systems",
     "File size in 'bytes' (lower the better)"),
]
for values, labels, title, xlab, ylab in part1_charts:
    plt.figure(figsize=(12, 8))
    sns.set_style("whitegrid")
    sns.barplot(x=labels, y=values)
    plt.title(title, size=18)
    plt.xlabel(xlab, size=15)
    plt.ylabel(ylab, size=15)
    plt.show()
Part 2¶
###Performing column operations using in-house packages
# Rename the SMILES column to the name the selfie_tools helpers expect, then
# derive the SELFIES columns ('rename' via reassignment rather than inplace).
df = df.rename(columns={'canonical_smiles': 'Smiles'})
df = stv.add_selfies(df)
df = stv.add_selfies_list(df)
df.head()
#Writing time tsv 2 (timed with time.perf_counter: monotonic, high resolution)
start = time.perf_counter()
df.to_csv('Copy_wider_df.tsv', sep='\t', index=False)
end = time.perf_counter()
tsv_writetime_2 = end - start
print(f"Time taken to write a wider CHEMBL29 dataframe to a standard tsv file by the Pandas 'to_csv()' function: {tsv_writetime_2} seconds")
file_size_tsv_2 = os.path.getsize('Copy_wider_df.tsv')
print(file_size_tsv_2)
#Reading time tsv 2 (timed with time.perf_counter: monotonic, high resolution)
start = time.perf_counter()
df_wider_tsv = pd.read_csv('Copy_wider_df.tsv', sep='\t')
end = time.perf_counter()
tsv_readtime_2 = end - start
print(f"Time taken to read a wider CHEMBL29 tsv file by standard 'read_csv()' function: {tsv_readtime_2} seconds")
df_wider_tsv.head()
#Notice that elements under the column 'Selfies_List' are somehow compromised
# The TSV round trip does not preserve the column's Python objects: after
# to_csv/read_csv the entry comes back as a plain string (presumably the str()
# representation of the original list — confirm via the type() checks below).
# NOTE(review): these bare expressions only display output in a notebook; in a
# plain script they evaluate silently.
df['Selfies_List'][0]
type(df['Selfies_List'][0])
df_wider_tsv['Selfies_List'][0]
type(df_wider_tsv['Selfies_List'][0])
#Writing time to pickle 2 (timed with time.perf_counter: monotonic, high resolution)
start = time.perf_counter()
df.to_pickle('Copy_wider_df.pkl')
end = time.perf_counter()
pickle_writetime_2 = end - start
print(f"Time taken to write a wider CHEMBL29 dataframe as a pickled object file by the Pandas 'to_pickle()' function: {pickle_writetime_2} seconds")
#Reading pickle df 2 (timed with time.perf_counter: monotonic, high resolution)
start = time.perf_counter()
unpickled_wider_df = pd.read_pickle("Copy_wider_df.pkl")
end = time.perf_counter()
pickle_readtime_2 = end - start
# Message fix: this cell reads the *wider* dataframe; the Part 1 wording had
# been reused verbatim.
print(f"Time taken to read a wider CHEMBL29 pickled file by 'read_pickle()' function: {pickle_readtime_2} seconds")
unpickled_wider_df.head()
#Details maintained
file_size_pickle_2 = os.path.getsize('Copy_wider_df.pkl')
print(file_size_pickle_2)
#Writing time to parquet 2 (timed with time.perf_counter: monotonic, high resolution)
start = time.perf_counter()
df.to_parquet('Copy_wider_df.gzip', compression='gzip')
end = time.perf_counter()
parquet_writetime_2 = end - start
print(f"Time taken to write a wider CHEMBL29 dataframe to the binary parquet format by the Pandas 'to_parquet()' function: {parquet_writetime_2} seconds")
#Reading parquet df 2 (timed with time.perf_counter: monotonic, high resolution)
start = time.perf_counter()
parquet_wider_df = pd.read_parquet("Copy_wider_df.gzip")
end = time.perf_counter()
parquet_readtime_2 = end - start
print(f"Time taken to read a wider CHEMBL29 parquet file by 'read_parquet()' function: {parquet_readtime_2} seconds")
parquet_wider_df.head()
#Details maintained
file_size_parquet_2 = os.path.getsize('Copy_wider_df.gzip')
print(file_size_parquet_2)
# Part 2 summary charts for the wider dataframe: read time, write time and
# on-disk file size per storage format.
part2_charts = [
    ([parquet_readtime_2, pickle_readtime_2, tsv_readtime_2],
     ['Parquet Read Time', 'Pickle Read Time', 'TSV Read Time'],
     "ChEMBL29 files reading time comparison: Part 2",
     "Different Techniques",
     "Total time taken in seconds (lower the better)"),
    ([parquet_writetime_2, pickle_writetime_2, tsv_writetime_2],
     ['Parquet Write Time', 'Pickle Write Time', 'TSV Write Time'],
     "ChEMBL29 files writing time comparison: Part 2",
     "Different Techniques",
     "Total time taken in seconds (lower the better)"),
    ([file_size_parquet_2, file_size_pickle_2, file_size_tsv_2],
     ['Parquet (gzip) File Size ', 'Pickle File Size', 'TSV File Size'],
     "ChEMBL29 files size comparison: Part 2",
     "Different File Systems",
     "File size in 'bytes' (lower the better)"),
]
for values, labels, title, xlab, ylab in part2_charts:
    plt.figure(figsize=(12, 8))
    sns.set_style("whitegrid")
    sns.barplot(x=labels, y=values)
    plt.title(title, size=18)
    plt.xlabel(xlab, size=15)
    plt.ylabel(ylab, size=15)
    plt.show()
What is 'Parquet'?¶
Apache Parquet (Copyright 2018 Apache Software Foundation) is a "columnar storage format available to any project in the Hadoop ecosystem, regardless of the choice of data processing framework, data model or programming language."
What does the Pandas user guide say about Parquet?¶
Apache Parquet provides a partitioned binary columnar serialization for data frames. It is designed to make reading and writing data frames efficient, and to make sharing data across data analysis languages easy. Parquet can use a variety of compression techniques to shrink the file size as much as possible while still maintaining good read performance.
Parquet is designed to faithfully serialize and de-serialize DataFrames, supporting all of the pandas dtypes, including extension dtypes such as datetime with tz.
Prerequisites?¶
Installation of Pandas and an engine such as 'pyarrow' (conda install pyarrow -c conda-forge) on the running environment.
Useful links:¶
- https://parquet.apache.org/
- https://en.wikipedia.org/wiki/Column-oriented_DBMS
- https://datascience.stackexchange.com/questions/8244/what-makes-columnar-databases-suitable-for-data-science/9242
- https://www.datacamp.com/community/tutorials/pickle-python-tutorial (Section of interest: When Not to Use pickle)