Abdulrahman AlQallaf

Decluttering my mind into the web ...







Loading data into PostgreSQL reference

date posted: 2021-Mar-19, last edit date: 2026-Feb-02


Note: Most of the time, loading data using pandas will be just fine. Sometimes however (like when you are dealing with dirty data), more control is needed. The below code is intended as a reference for such situations.

Step 0 (optional): get file from the web

# Step 0 helper: fetch the sample CSV into the working directory.
import wget

# NOTE(review): this looks like a site-relative path, not a full URL --
# wget.download() needs a scheme + host (e.g. https://<domain>/media/...);
# confirm before running.
url = '/media/blog/files/python-tutorials/nbk.csv'
wget.download(url, 'nbk.csv') # download the file into current directory
'nbk.csv'

Step 1: Show file details and preview data

import os.path
import time
from datetime import datetime as dt

def sizeof_fmt(num, suffix='B'):
    """Format a byte count as a human-readable string with binary prefixes.

    Repeatedly divides by 1024 until the value drops below one unit step,
    e.g. 107938 -> '105.4 KiB'; values beyond zebi fall through to 'Yi'.
    """
    value = num
    for prefix in ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi'):
        if abs(value) < 1024.0:
            return "%3.1f %s%s" % (value, prefix, suffix)
        value /= 1024.0
    return "%.1f %s%s" % (value, 'Yi', suffix)

def fileLineCount(filePath):
    """Return the number of lines in the UTF-8 text file at *filePath*.

    On failure to open, prints a message and returns 0. (The original used
    a bare ``except``, printed, and then fell through to a NameError on the
    unbound file handle; it also never closed the file on error paths.)
    """
    try:
        with open(filePath, encoding='utf-8') as fileHandle:
            # sum over the line iterator: counts lines without holding them
            return sum(1 for _ in fileHandle)
    except OSError:
        print('File cannot be opened.')
        return 0

def fileHead(filePath, n=5):
    """Print the first *n* lines of the file, each prefixed with its
    0-based line number.

    Lines keep their own trailing newline (``end=''``). On failure to
    open, prints a message and returns. (The original used a bare
    ``except`` and then iterated an unbound handle, raising NameError;
    it also left the handle unclosed on that path.)
    """
    try:
        fileHandle = open(filePath, encoding='utf-8')
    except OSError:
        print('File cannot be opened.')
        return
    with fileHandle:
        for lineCounter, line in enumerate(fileHandle):
            if lineCounter >= n:
                break
            print('Line {lineCounter:,}: \t {line}'.format(lineCounter=lineCounter, line=line), end='')
    
def fileTail(filePath, n=5):
    """Print the last *n* lines of the file, each prefixed with its
    0-based absolute line number.

    Single pass over the file using a bounded deque. (The original read
    the file TWICE -- once via fileLineCount, once to print -- used a bare
    ``except`` that left the handle unbound on failure, and never closed
    the handle on the error-return path.)
    """
    from collections import deque  # local import keeps this reference snippet self-contained
    try:
        fileHandle = open(filePath, encoding='utf-8')
    except OSError:
        print('File cannot be opened.')
        return
    with fileHandle:
        # deque(maxlen=n) keeps only the final n (index, line) pairs
        lastLines = deque(enumerate(fileHandle), maxlen=n)
    # the last kept index + 1 is the total line count of the file
    totalLines = lastLines[-1][0] + 1 if lastLines else 0
    if n > totalLines:
        print('Error: number of lines required cannot be greater than total lines in file.')
        return
    for lineCounter, line in lastLines:
        print('Line {lineCounter:,}: \t {line}'.format(lineCounter=lineCounter, line=line), end='')


myFile = r'nbk.csv' # change here

# the three filesystem timestamps, each paired with its display label
timestampGetters = [
    ('Created    : \t', os.path.getctime),
    ('Modified   : \t', os.path.getmtime),
    ('Accessed   : \t', os.path.getatime),
]

print('\nFile details')
print('------------')
print('File       : \t', myFile)
# epoch float -> ctime() text -> datetime, which drops the sub-second part
for label, getter in timestampGetters:
    print(label, dt.strptime(time.ctime(getter(myFile)), "%a %b %d %H:%M:%S %Y"))
print('Size       : \t', sizeof_fmt(os.path.getsize(myFile)))
print('Line count : \t {lineCount:,}'.format(lineCount=fileLineCount(myFile)))

print('\n\nFile preview')
print('------------')
fileHead(myFile)
print('...')
fileTail(myFile)
File details
------------
File       : 	 nbk.csv
Created    : 	 2021-03-19 16:38:27
Modified   : 	 2021-03-19 16:38:27
Accessed   : 	 2021-03-19 16:38:26
Size       : 	 105.4 KiB
Line count : 	 2,465


File preview
------------
Line 0: 	 date,price,open,high,low,vol,change
Line 1: 	 Nov 14 2019,968,973,973,967,1719234,-0.51%
Line 2: 	 Nov 13 2019,973,970,974,970,5404608,0.10%
Line 3: 	 Nov 12 2019,972,967,974,967,4525303,0.31%
Line 4: 	 Nov 11 2019,969,963,971,962,3996610,0.73%
...
Line 2,460: 	 Jan 10 2010,683,683,683,670,147681,0.00%
Line 2,461: 	 Jan 07 2010,683,670,683,658,2272648,3.80%
Line 2,462: 	 Jan 06 2010,658,658,670,658,1062484,-1.79%
Line 2,463: 	 Jan 05 2010,670,658,670,646,1895241,1.82%
Line 2,464: 	 Jan 04 2010,658,658,658,646,1136324,-3.66%

Step 2: Prepare to load data into DB

filePath = myFile
tableName = 'test_table' # change here

# Read only the header row; the context manager guarantees the handle is
# closed. (The original iterated an unmanaged handle, broke out on the
# first line, and used a bare except that left the handle unbound.)
try:
    with open(filePath, encoding='utf-8') as fileHandle:
        line = fileHandle.readline()
except OSError:
    print('File cannot be opened.')

myTuple = line.split(',')

print('\nTable name to be used : {tableName}\n'.format(tableName=tableName))

# quoted column list -- paste into the CREATE TABLE / INSERT statements
for i in range(0, len(myTuple)):
    if i != 0: print(',', end='')
    print('"{attr}"'.format(attr=myTuple[i].strip().lower()), end='')

print('\n')

# placeholder list -- paste into the str.format() template of the INSERT
for i in range(0, len(myTuple)):
    if i != 0: print(',', end='')
    print('{{{attr}}}'.format(attr=myTuple[i].strip().lower()), end='')

print('\n')

# keyword-argument list -- paste into the str.format() call
for i in range(0, len(myTuple)):
    if i != 0: print(',', end='')
    print('{attr}=myTuple[{i}]'.format(attr=myTuple[i].strip().lower(), i=i))

print('\n')
Table name to be used : test_table

"date","price","open","high","low","vol","change"

{date},{price},{open},{high},{low},{vol},{change}

date=myTuple[0]
,price=myTuple[1]
,open=myTuple[2]
,high=myTuple[3]
,low=myTuple[4]
,vol=myTuple[5]
,change=myTuple[6]


Step 3: Create table

##############
# CREATE TABLE
##############

import psycopg2

# change here: DB credentials / host
con = psycopg2.connect("dbname='test_db' user='test_user' host='localhost' password='test_password'")
try:
    cur = con.cursor()

    # create table
    # use the output from the previous step to adjust the below
    # put the appropriate data types
    # NOTE: tableName is a local constant, so interpolating it into the DDL
    # is safe here -- never do this with untrusted input
    queryString = """
drop table if exists {tableName};
create table {tableName} (
     "date" date
    ,"price" float
    ,"open" float
    ,"high" float
    ,"low" float
    ,"vol" float
    ,"change" float
);
""".format(tableName=tableName)
    cur.execute(queryString)
    con.commit()
finally:
    # close the connection even when the DDL fails
    # (the original leaked it on any execute error)
    con.close()

print('Table "{tableName}" created.'.format(tableName=tableName))
Table "test_table" created.

Step 4: Load data and monitor progress

"""REFERENCE



-------------------
strptime / strftime:
-------------------

resource: https://www.programiz.com/python-programming/datetime/strptime

year         -- %Y
month (name) -- %b -- %B
month        -- %m -- %-m
day          -- %d -- %-d

hour         -- %H -- %-H
minute       -- %M -- %-M
seconds      -- %S -- %-S



--------
strings:
--------

string methods: dir(str)

string formatting: https://docs.python.org/3/library/string.html#formatspec



------
round:
------

round(5.76543, 2)



"""

con = psycopg2.connect("dbname='test_db' user='test_user' host='localhost' password='test_password'")
cur = con.cursor()
filePath = myFile

# Parameterized INSERT: psycopg2 quotes/escapes every value, so raw CSV
# fields can neither break the statement nor inject SQL. (The original
# spliced the values straight into the query string.)
insertQuery = ('insert into {tableName} ("date","price","open","high","low","vol","change") '
               'values (%s,%s,%s,%s,%s,%s,%s);').format(tableName=tableName)

# load data
try:
    fileHandle = open(filePath, encoding='utf-8')
except OSError:
    print('File cannot be opened.')
with fileHandle:
    for lineCounter, line in enumerate(fileHandle):

        print ("Working on line : {i:<,d}".format(i=lineCounter), end='\r', flush=True)

        # skip header
        if lineCounter == 0:
            continue

        # specify delimiter; empty fields become None, which psycopg2 maps
        # to SQL NULL (the original injected the literal text 'null', which
        # would have broken inside the quoted date value)
        myTuple = [field if len(field) > 0 else None for field in line.split(',')]

        cur.execute(insertQuery, (
             dt.strptime(myTuple[0].strip(), "%b %d %Y").strftime("%Y-%m-%d %H:%M:%S")  # 'Nov 14 2019' -> ISO timestamp
            ,myTuple[1]
            ,myTuple[2]
            ,myTuple[3]
            ,myTuple[4]
            ,myTuple[5]
            ,round(float(myTuple[6].replace('%', '')) / 100, 5)  # '-0.51%' -> -0.0051
        ))

con.commit()

# read number of records in database to make sure it's a match
cur.execute("select count(*) from {tableName};".format(tableName=tableName))
print("\nNumber of records in table \"{tableName}\" : {recordCount:<,d}".format(tableName=tableName, recordCount=cur.fetchall()[0][0], ))

# close connection
con.close()
Working on line : 2,464
Number of records in table "test_table" : 2,464

Step 5: Check loaded data

import pandas as pd

con = psycopg2.connect("dbname='test_db' user='test_user' host='localhost' password='test_password'")
try:
    query = 'select * from test_table;'
    df = pd.read_sql_query(query, con)
finally:
    # close the connection even if the read fails (the original leaked it
    # on error, and created a cursor it never used)
    con.close()

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2464 entries, 0 to 2463
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    2464 non-null   object 
 1   price   2464 non-null   float64
 2   open    2464 non-null   float64
 3   high    2464 non-null   float64
 4   low     2464 non-null   float64
 5   vol     2464 non-null   float64
 6   change  2464 non-null   float64
dtypes: float64(6), object(1)
memory usage: 134.9+ KB
df.head()
date price open high low vol change
0 2019-11-14 968.0 973.0 973.0 967.0 1719234.0 -0.0051
1 2019-11-13 973.0 970.0 974.0 970.0 5404608.0 0.0010
2 2019-11-12 972.0 967.0 974.0 967.0 4525303.0 0.0031
3 2019-11-11 969.0 963.0 971.0 962.0 3996610.0 0.0073
4 2019-11-10 962.0 955.0 976.0 955.0 15996293.0 0.0278
df.tail()
date price open high low vol change
2459 2010-01-10 683.0 683.0 683.0 670.0 147681.0 0.0000
2460 2010-01-07 683.0 670.0 683.0 658.0 2272648.0 0.0380
2461 2010-01-06 658.0 658.0 670.0 658.0 1062484.0 -0.0179
2462 2010-01-05 670.0 658.0 670.0 646.0 1895241.0 0.0182
2463 2010-01-04 658.0 658.0 658.0 646.0 1136324.0 -0.0366





# Post Title Date Posted Last Edit Date
1 [Medium Article] 12 Years in Data & AI: Lesson #2 — How Organizational Structure Determines Data & AI Success 2026-03-18 2026-03-18
2 [Medium Article] 12 Years in Data & AI: Lessons Organizations Keep Learning the Hard Way 2026-03-07 2026-03-11
3 [Medium Article] Kuwait's Addition to the FATF Grey List — Why Implementation Matters More Than Regulation 2026-02-18 2026-03-11
4 New tools for data analysis (ipython-sql and Azure Data Studio) 2021-10-16 2021-10-16
5 How to setup a remote jupyter server 2021-10-13 2026-02-02
6 How to setup ZSH 2021-10-12 2021-10-12
7 Concepts in Data Architecture 2021-05-01 2026-02-02
8 current post -- Loading data into PostgreSQL reference 2021-03-19 2026-02-02
9 SQLite <--> Pandas reference 2021-03-15 2021-03-15
10 Creating an automated data mart (in SQL) 2020-07-16 2020-07-16
11 Personal finance management and open banking 2020-07-12 2020-07-12
12 Quick SQL reference 2020-07-03 2026-02-02
13 How to back up Digital Ocean Spaces (S3 compatible storage) 2020-06-07 2020-07-03
14 PostgreSQL initiation 2020-05-25 2020-06-01