import wget

url = 'https://generalspace.sgp1.digitaloceanspaces.com/personalWebsite_S3/media/blog/files/nbk.csv'
wget.download(url, 'nbk.csv') # download the file into current directory

'nbk.csv'


import os.path
import time
from datetime import datetime as dt

def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

def fileLineCount(filePath):
    try: fileHandle = open(filePath, encoding='utf-8')
    except: print('File cannot be opened.')
    lineCounter = 0
    for line in fileHandle: lineCounter+=1
    fileHandle.close()
    return lineCounter

def fileHead(filePath, n=5):
    try: fileHandle = open(filePath, encoding='utf-8')
    except: print('File cannot be opened.')
    lineCounter = 0
    for line in fileHandle: 
        if lineCounter >= n: break
        else:
            print('Line {lineCounter:,}: \t {line}'.format(lineCounter=lineCounter, line=line), end='')
            lineCounter+=1
    fileHandle.close()
    
def fileTail(filePath, n=5):
    try: fileHandle = open(filePath, encoding='utf-8')
    except: print('File cannot be opened.')
    totalLines = fileLineCount(filePath)
    if n > totalLines:
        print('Error: number of lines required cannot be greater than total lines in file.')
        fileHandle.close()
        return
    lineCounter = 0
    for line in fileHandle: 
        if lineCounter in range(totalLines-n, totalLines):
            print('Line {lineCounter:,}: \t {line}'.format(lineCounter=lineCounter, line=line), end='')
        lineCounter+=1
    fileHandle.close()


myFile = r'nbk.csv' # change here 

print('\nFile details')
print('------------')
print('File       : \t', myFile)
print('Created    : \t', dt.strptime(time.ctime(os.path.getctime(myFile)), "%a %b %d %H:%M:%S %Y"))
print('Modified   : \t', dt.strptime(time.ctime(os.path.getmtime(myFile)), "%a %b %d %H:%M:%S %Y"))
print('Accessed   : \t', dt.strptime(time.ctime(os.path.getatime(myFile)), "%a %b %d %H:%M:%S %Y"))
print('Size       : \t', sizeof_fmt(os.path.getsize(myFile)))
print('Line count : \t {lineCount:,}'.format(lineCount=fileLineCount(myFile)))

print('\n\nFile preview')
print('------------')
fileHead(myFile)
print('...')
fileTail(myFile)

File details
------------
File       : 	 nbk.csv
Created    : 	 2021-03-19 16:38:27
Modified   : 	 2021-03-19 16:38:27
Accessed   : 	 2021-03-19 16:38:26
Size       : 	 105.4 KiB
Line count : 	 2,465


File preview
------------
Line 0: 	 date,price,open,high,low,vol,change
Line 1: 	 Nov 14 2019,968,973,973,967,1719234,-0.51%
Line 2: 	 Nov 13 2019,973,970,974,970,5404608,0.10%
Line 3: 	 Nov 12 2019,972,967,974,967,4525303,0.31%
Line 4: 	 Nov 11 2019,969,963,971,962,3996610,0.73%
...
Line 2,460: 	 Jan 10 2010,683,683,683,670,147681,0.00%
Line 2,461: 	 Jan 07 2010,683,670,683,658,2272648,3.80%
Line 2,462: 	 Jan 06 2010,658,658,670,658,1062484,-1.79%
Line 2,463: 	 Jan 05 2010,670,658,670,646,1895241,1.82%
Line 2,464: 	 Jan 04 2010,658,658,658,646,1136324,-3.66%


filePath = myFile
tableName = 'test_table' # change here

try: fileHandle = open(filePath, encoding='utf-8')
except: print('File cannot be opened.')
for line in fileHandle: 
    break
fileHandle.close()

myTuple = line.split(',')

print('\nTable name to be used : {tableName}\n'.format(tableName=tableName))

for i in range(0, len(myTuple)):
    if i != 0: print(',', end='')
    print('"{attr}"'.format(attr=myTuple[i].strip().lower()), end='')
    
print('\n')
    
for i in range(0, len(myTuple)):
    if i != 0: print(',', end='')
    print('{{{attr}}}'.format(attr=myTuple[i].strip().lower()), end='')
    
print('\n')
    
for i in range(0, len(myTuple)):
    if i != 0: print(',', end='')
    print('{attr}=myTuple[{i}]'.format(attr=myTuple[i].strip().lower(), i=i))

print('\n')

Table name to be used : test_table

"date","price","open","high","low","vol","change"

{date},{price},{open},{high},{low},{vol},{change}

date=myTuple[0]
,price=myTuple[1]
,open=myTuple[2]
,high=myTuple[3]
,low=myTuple[4]
,vol=myTuple[5]
,change=myTuple[6]


##############
# CREATE TABLE
##############

import psycopg2

con = psycopg2.connect("dbname='test_db' user='test_user' host='localhost' password='test_password'") # change here
cur = con.cursor()

# create table
# use the output from the previous step to adjust the below
# put the appropriate data types
queryString = """
drop table if exists {tableName};
create table {tableName} (
     "date" date
    ,"price" float
    ,"open" float
    ,"high" float
    ,"low" float
    ,"vol" float
    ,"change" float
);
""".format(tableName=tableName)
cur.execute(queryString)
con.commit()

# close connection
con.close()

print('Table "{tableName}" created.'.format(tableName=tableName))

Table "test_table" created.


"""REFERENCE



-------------------
strptime / stftime:
-------------------

resource: https://www.programiz.com/python-programming/datetime/strptime

year         -- %Y
month (name) -- %b -- %B
month        -- %m -- %-m
day          -- %d -- %-d

hour         -- %H -- %-H
minute       -- %M -- %-M
seconds      -- %S -- %-S



--------
strings:
--------

string methods: dir(str)

string formatting: https://docs.python.org/3/library/string.html#formatspec



------
round:
------

round(5.76543, 2)



"""

con = psycopg2.connect("dbname='test_db' user='test_user' host='localhost' password='test_password'")
cur = con.cursor()
filePath = myFile

# load data 
try: fileHandle = open(filePath, encoding='utf-8')
except: print('File cannot be opened.')
lineCounter = 0
for line in fileHandle: 
    
    print ("Working on line : {i:<,d}".format(i=lineCounter), end='\r', flush=True)
    
    # skip header
    if lineCounter != 0:
    
        # specify delimeter
        myTuple = line.split(',')
        
        # fix nulls, if any
        for i in range(0, len(myTuple)):
            if len(myTuple[i]) == 0: myTuple[i] = 'null'
        
        # update here, step 2 output can be used
        queryString = 'insert into {tableName} ("date","price","open","high","low","vol","change") values (\'{date}\',{price},{open},{high},{low},{vol},{change});'.format(
             date=dt.strptime(myTuple[0].strip(), "%b %d %Y").strftime("%Y-%m-%d %H:%M:%S")
            ,price=myTuple[1]
            ,open=myTuple[2]
            ,high=myTuple[3]
            ,low=myTuple[4]
            ,vol=myTuple[5]
            ,change=round(float(myTuple[6].replace('%', ''))/100, 5)
            ,tableName=tableName
        )
        cur.execute(queryString)
        
    lineCounter += 1

con.commit()
fileHandle.close()

# read number of records in database to make sure it's a match
cur.execute("select count(*) from {tableName};".format(tableName=tableName))
print("\nNumber of records in table \"{tableName}\" : {recordCount:<,d}".format(tableName=tableName, recordCount=cur.fetchall()[0][0], ))

# close connection
con.close()

Working on line : 2,464
Number of records in table "test_table" : 2,464


import pandas as pd

con = psycopg2.connect("dbname='test_db' user='test_user' host='localhost' password='test_password'")
cur = con.cursor()
query = 'select * from test_table;'
df = pd.read_sql_query(query, con)

# close connection
con.close()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2464 entries, 0 to 2463
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    2464 non-null   object 
 1   price   2464 non-null   float64
 2   open    2464 non-null   float64
 3   high    2464 non-null   float64
 4   low     2464 non-null   float64
 5   vol     2464 non-null   float64
 6   change  2464 non-null   float64
dtypes: float64(6), object(1)
memory usage: 134.9+ KB


df.head()


df.tail()

	date	price	open	high	low	vol	change
0	2019-11-14	968.0	973.0	973.0	967.0	1719234.0	-0.0051
1	2019-11-13	973.0	970.0	974.0	970.0	5404608.0	0.0010
2	2019-11-12	972.0	967.0	974.0	967.0	4525303.0	0.0031
3	2019-11-11	969.0	963.0	971.0	962.0	3996610.0	0.0073
4	2019-11-10	962.0	955.0	976.0	955.0	15996293.0	0.0278

	date	price	open	high	low	vol	change
2459	2010-01-10	683.0	683.0	683.0	670.0	147681.0	0.0000
2460	2010-01-07	683.0	670.0	683.0	658.0	2272648.0	0.0380
2461	2010-01-06	658.0	658.0	670.0	658.0	1062484.0	-0.0179
2462	2010-01-05	670.0	658.0	670.0	646.0	1895241.0	0.0182
2463	2010-01-04	658.0	658.0	658.0	646.0	1136324.0	-0.0366

#	Post Title	Date Posted	Last Edit Date
1	New tools for data analysis (ipython-sql and Azure Data Studio)	2021-10-16	2021-10-16
2	How to setup a remote jupyter server	2021-10-13	2021-10-13
3	How to setup ZSH	2021-10-12	2021-10-12
4	Concepts in Data Architecture	2021-05-01	2021-10-16
5	current post -- Loading data into PostgreSQL reference	2021-03-19	2021-03-19
6	SQLite <--> Pandas reference	2021-03-15	2021-03-15
7	Creating an automated data mart (in SQL)	2020-07-16	2020-07-16
8	Personal finance management and open banking	2020-07-12	2020-07-12
9	Quick SQL reference	2020-07-03	2020-11-29
10	How to back up Digital Ocean Spaces (S3 compatible storage)	2020-06-07	2020-07-03
11	PostgreSQL initiation	2020-05-25	2020-06-01

Loading data into PostgreSQL reference

Step 0 (optional): get file from the web

Step 1: Show file details and preview data

Step 2: Prepare to load data into DB

Step 3: Create table

Step 4: Load data and monitor progress

Step 5: Check loaded data