Decluttering my mind into the web ...
date posted: 2021-Mar-19, last edit date: 2021-Mar-19
Note: Most of the time, loading data using pandas will be just fine. Sometimes however (like when you are dealing with dirty data), more control is needed. The below code is intended as a reference for such situations.
import wget
url = 'https://generalspace.sgp1.digitaloceanspaces.com/personalWebsite_S3/media/blog/files/nbk.csv'
wget.download(url, 'nbk.csv') # download the file into current directory
'nbk.csv'
import os.path
import time
from datetime import datetime as dt
def sizeof_fmt(num, suffix='B'):
for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
if abs(num) < 1024.0:
return "%3.1f %s%s" % (num, unit, suffix)
num /= 1024.0
return "%.1f %s%s" % (num, 'Yi', suffix)
def fileLineCount(filePath):
try: fileHandle = open(filePath, encoding='utf-8')
except: print('File cannot be opened.')
lineCounter = 0
for line in fileHandle: lineCounter+=1
fileHandle.close()
return lineCounter
def fileHead(filePath, n=5):
try: fileHandle = open(filePath, encoding='utf-8')
except: print('File cannot be opened.')
lineCounter = 0
for line in fileHandle:
if lineCounter >= n: break
else:
print('Line {lineCounter:,}: \t {line}'.format(lineCounter=lineCounter, line=line), end='')
lineCounter+=1
fileHandle.close()
def fileTail(filePath, n=5):
try: fileHandle = open(filePath, encoding='utf-8')
except: print('File cannot be opened.')
totalLines = fileLineCount(filePath)
if n > totalLines:
print('Error: number of lines required cannot be greater than total lines in file.')
fileHandle.close()
return
lineCounter = 0
for line in fileHandle:
if lineCounter in range(totalLines-n, totalLines):
print('Line {lineCounter:,}: \t {line}'.format(lineCounter=lineCounter, line=line), end='')
lineCounter+=1
fileHandle.close()
myFile = r'nbk.csv' # change here
print('\nFile details')
print('------------')
print('File : \t', myFile)
print('Created : \t', dt.strptime(time.ctime(os.path.getctime(myFile)), "%a %b %d %H:%M:%S %Y"))
print('Modified : \t', dt.strptime(time.ctime(os.path.getmtime(myFile)), "%a %b %d %H:%M:%S %Y"))
print('Accessed : \t', dt.strptime(time.ctime(os.path.getatime(myFile)), "%a %b %d %H:%M:%S %Y"))
print('Size : \t', sizeof_fmt(os.path.getsize(myFile)))
print('Line count : \t {lineCount:,}'.format(lineCount=fileLineCount(myFile)))
print('\n\nFile preview')
print('------------')
fileHead(myFile)
print('...')
fileTail(myFile)
File details ------------ File : nbk.csv Created : 2021-03-19 16:38:27 Modified : 2021-03-19 16:38:27 Accessed : 2021-03-19 16:38:26 Size : 105.4 KiB Line count : 2,465 File preview ------------ Line 0: date,price,open,high,low,vol,change Line 1: Nov 14 2019,968,973,973,967,1719234,-0.51% Line 2: Nov 13 2019,973,970,974,970,5404608,0.10% Line 3: Nov 12 2019,972,967,974,967,4525303,0.31% Line 4: Nov 11 2019,969,963,971,962,3996610,0.73% ... Line 2,460: Jan 10 2010,683,683,683,670,147681,0.00% Line 2,461: Jan 07 2010,683,670,683,658,2272648,3.80% Line 2,462: Jan 06 2010,658,658,670,658,1062484,-1.79% Line 2,463: Jan 05 2010,670,658,670,646,1895241,1.82% Line 2,464: Jan 04 2010,658,658,658,646,1136324,-3.66%
filePath = myFile
tableName = 'test_table' # change here
try: fileHandle = open(filePath, encoding='utf-8')
except: print('File cannot be opened.')
for line in fileHandle:
break
fileHandle.close()
myTuple = line.split(',')
print('\nTable name to be used : {tableName}\n'.format(tableName=tableName))
for i in range(0, len(myTuple)):
if i != 0: print(',', end='')
print('"{attr}"'.format(attr=myTuple[i].strip().lower()), end='')
print('\n')
for i in range(0, len(myTuple)):
if i != 0: print(',', end='')
print('{{{attr}}}'.format(attr=myTuple[i].strip().lower()), end='')
print('\n')
for i in range(0, len(myTuple)):
if i != 0: print(',', end='')
print('{attr}=myTuple[{i}]'.format(attr=myTuple[i].strip().lower(), i=i))
print('\n')
Table name to be used : test_table "date","price","open","high","low","vol","change" {date},{price},{open},{high},{low},{vol},{change} date=myTuple[0] ,price=myTuple[1] ,open=myTuple[2] ,high=myTuple[3] ,low=myTuple[4] ,vol=myTuple[5] ,change=myTuple[6]
##############
# CREATE TABLE
##############
import psycopg2
con = psycopg2.connect("dbname='test_db' user='test_user' host='localhost' password='test_password'") # change here
cur = con.cursor()
# create table
# use the output from the previous step to adjust the below
# put the appropriate data types
queryString = """
drop table if exists {tableName};
create table {tableName} (
"date" date
,"price" float
,"open" float
,"high" float
,"low" float
,"vol" float
,"change" float
);
""".format(tableName=tableName)
cur.execute(queryString)
con.commit()
# close connection
con.close()
print('Table "{tableName}" created.'.format(tableName=tableName))
Table "test_table" created.
"""REFERENCE
-------------------
strptime / stftime:
-------------------
resource: https://www.programiz.com/python-programming/datetime/strptime
year -- %Y
month (name) -- %b -- %B
month -- %m -- %-m
day -- %d -- %-d
hour -- %H -- %-H
minute -- %M -- %-M
seconds -- %S -- %-S
--------
strings:
--------
string methods: dir(str)
string formatting: https://docs.python.org/3/library/string.html#formatspec
------
round:
------
round(5.76543, 2)
"""
con = psycopg2.connect("dbname='test_db' user='test_user' host='localhost' password='test_password'")
cur = con.cursor()
filePath = myFile
# load data
try: fileHandle = open(filePath, encoding='utf-8')
except: print('File cannot be opened.')
lineCounter = 0
for line in fileHandle:
print ("Working on line : {i:<,d}".format(i=lineCounter), end='\r', flush=True)
# skip header
if lineCounter != 0:
# specify delimeter
myTuple = line.split(',')
# fix nulls, if any
for i in range(0, len(myTuple)):
if len(myTuple[i]) == 0: myTuple[i] = 'null'
# update here, step 2 output can be used
queryString = 'insert into {tableName} ("date","price","open","high","low","vol","change") values (\'{date}\',{price},{open},{high},{low},{vol},{change});'.format(
date=dt.strptime(myTuple[0].strip(), "%b %d %Y").strftime("%Y-%m-%d %H:%M:%S")
,price=myTuple[1]
,open=myTuple[2]
,high=myTuple[3]
,low=myTuple[4]
,vol=myTuple[5]
,change=round(float(myTuple[6].replace('%', ''))/100, 5)
,tableName=tableName
)
cur.execute(queryString)
lineCounter += 1
con.commit()
fileHandle.close()
# read number of records in database to make sure it's a match
cur.execute("select count(*) from {tableName};".format(tableName=tableName))
print("\nNumber of records in table \"{tableName}\" : {recordCount:<,d}".format(tableName=tableName, recordCount=cur.fetchall()[0][0], ))
# close connection
con.close()
Working on line : 2,464 Number of records in table "test_table" : 2,464
import pandas as pd
con = psycopg2.connect("dbname='test_db' user='test_user' host='localhost' password='test_password'")
cur = con.cursor()
query = 'select * from test_table;'
df = pd.read_sql_query(query, con)
# close connection
con.close()
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2464 entries, 0 to 2463 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 2464 non-null object 1 price 2464 non-null float64 2 open 2464 non-null float64 3 high 2464 non-null float64 4 low 2464 non-null float64 5 vol 2464 non-null float64 6 change 2464 non-null float64 dtypes: float64(6), object(1) memory usage: 134.9+ KB
df.head()
date | price | open | high | low | vol | change | |
---|---|---|---|---|---|---|---|
0 | 2019-11-14 | 968.0 | 973.0 | 973.0 | 967.0 | 1719234.0 | -0.0051 |
1 | 2019-11-13 | 973.0 | 970.0 | 974.0 | 970.0 | 5404608.0 | 0.0010 |
2 | 2019-11-12 | 972.0 | 967.0 | 974.0 | 967.0 | 4525303.0 | 0.0031 |
3 | 2019-11-11 | 969.0 | 963.0 | 971.0 | 962.0 | 3996610.0 | 0.0073 |
4 | 2019-11-10 | 962.0 | 955.0 | 976.0 | 955.0 | 15996293.0 | 0.0278 |
df.tail()
date | price | open | high | low | vol | change | |
---|---|---|---|---|---|---|---|
2459 | 2010-01-10 | 683.0 | 683.0 | 683.0 | 670.0 | 147681.0 | 0.0000 |
2460 | 2010-01-07 | 683.0 | 670.0 | 683.0 | 658.0 | 2272648.0 | 0.0380 |
2461 | 2010-01-06 | 658.0 | 658.0 | 670.0 | 658.0 | 1062484.0 | -0.0179 |
2462 | 2010-01-05 | 670.0 | 658.0 | 670.0 | 646.0 | 1895241.0 | 0.0182 |
2463 | 2010-01-04 | 658.0 | 658.0 | 658.0 | 646.0 | 1136324.0 | -0.0366 |
# | Post Title | Date Posted | Last Edit Date |
---|---|---|---|
1 | New tools for data analysis (ipython-sql and Azure Data Studio) | 2021-10-16 | 2021-10-16 |
2 | How to setup a remote jupyter server | 2021-10-13 | 2021-10-13 |
3 | How to setup ZSH | 2021-10-12 | 2021-10-12 |
4 | Concepts in Data Architecture | 2021-05-01 | 2021-10-16 |
5 | current post -- Loading data into PostgreSQL reference | 2021-03-19 | 2021-03-19 |
6 | SQLite <--> Pandas reference | 2021-03-15 | 2021-03-15 |
7 | Creating an automated data mart (in SQL) | 2020-07-16 | 2020-07-16 |
8 | Personal finance management and open banking | 2020-07-12 | 2020-07-12 |
9 | Quick SQL reference | 2020-07-03 | 2020-11-29 |
10 | How to back up Digital Ocean Spaces (S3 compatible storage) | 2020-06-07 | 2020-07-03 |
11 | PostgreSQL initiation | 2020-05-25 | 2020-06-01 |