Data Reading examples#

pyJedAI takes a pandas.DataFrame as input. This notebook provides some examples of reading data from different sources and transforming it into a DataFrame.

reading-process.jpg

import pandas as pd

CSV Reader#

Example Dataset: CORA

# Read the CORA files; both are pipe-delimited (sep='|').
# header=None makes pandas treat the first line of cora_gt.csv as data, so its
# columns get integer labels instead of names.
d1 = pd.read_csv("../data/der/cora/cora.csv", sep='|')
gt = pd.read_csv("../data/der/cora/cora_gt.csv", sep='|', header=None)
# Preview the first row.
d1.head(1)
Entity Id address author editor institution month note pages publisher title venue volume year Unnamed: 13
0 0 los alamitos, ca: p. auer, n. cesa-bianchi, y. freund, and r. e.... NaN NaN NaN NaN pp. 322-331. ieee computer society press, 'gambling in a rigged casino: the adversarial ... in proc. 36th annual symposium on foundations ... NaN 1995, NaN

JSON Reader#

# Read the CORA files again, this time from their JSON versions.
d1 = pd.read_json("../data/der/cora/cora.json")
gt = pd.read_json("../data/der/cora/cora_gt.json")
# Preview the first row.
d1.head(1)
Entity Id address author editor institution month note pages publisher title venue volume year Unnamed: 13
0 0 los alamitos, ca: p. auer, n. cesa-bianchi, y. freund, and r. e.... None None None None pp. 322-331. ieee computer society press, 'gambling in a rigged casino: the adversarial ... in proc. 36th annual symposium on foundations ... None 1995, NaN

Excel Reader#

# Read the CORA files again, this time from their Excel (.xlsx) versions.
# NOTE(review): the recorded preview shows an extra 'Unnamed: 0' column, which
# looks like an index saved into the sheet; index_col=0 would presumably
# absorb it -- confirm against the workbook.
d1 = pd.read_excel("../data/der/cora/cora.xlsx")
gt = pd.read_excel("../data/der/cora/cora_gt.xlsx")
# Preview the first row.
d1.head(1)
Unnamed: 0 Entity Id address author editor institution month note pages publisher title venue volume year Unnamed: 13
0 0 0 los alamitos, ca: p. auer, n. cesa-bianchi, y. freund, and r. e.... NaN NaN NaN NaN pp. 322-331. ieee computer society press, 'gambling in a rigged casino: the adversarial ... in proc. 36th annual symposium on foundations ... NaN 1995, NaN

RDF/OWL Reader#

import rdfpandas as rfd
import rdflib

# Parse each restaurant .nt file into its own rdflib graph.
rdfd1 = rdflib.Graph().parse('../data/rdf/restaurants/restaurant1.nt')
rdfd2 = rdflib.Graph().parse('../data/rdf/restaurants/restaurant2.nt')

def rdf_to_df(graph_parsed) -> pd.DataFrame:
    """Flatten an iterable of RDF triples into a three-column DataFrame.

    Args:
        graph_parsed: Iterable yielding ``(subject, predicate, object)``
            triples, e.g. a parsed ``rdflib.Graph``.

    Returns:
        A DataFrame with the columns ``subject``, ``predicate`` and
        ``object`` (in that order), holding one row per triple.
    """
    # Materialise once so a one-shot iterable can feed all three columns.
    triples = list(graph_parsed)

    # Declaring the columns up front fixes their order regardless of the
    # order in which they are filled in below.
    frame = pd.DataFrame(columns=['subject', 'predicate', 'object'])
    frame['predicate'] = [pred for _, pred, _ in triples]
    frame['subject'] = [subj for subj, _, _ in triples]
    frame['object'] = [obj for _, _, obj in triples]

    return frame
    
# Convert both parsed graphs into subject/predicate/object DataFrames.
d1 = rdf_to_df(rdfd1)
d2 = rdf_to_df(rdfd2)
# Preview the first two rows of each frame.
d1.head(2)
d2.head(2)

Relational DBs Reader#

from sqlite3 import connect
# Round-trip the frames through an in-memory SQLite database to demonstrate
# reading DataFrames from a relational source.
conn = connect(':memory:')

# index=False keeps the DataFrame index out of the tables. By default to_sql
# writes it as an extra "index" column, which would then come back from every
# SELECT * as if it were a real attribute of the data.
d1.to_sql('d1', conn, index=False)
d2.to_sql('d2', conn, index=False)
gt.to_sql('gt', conn, index=False)

# Read each table back into its own DataFrame.
sql_d1 = pd.read_sql('SELECT * FROM d1', conn)
sql_d2 = pd.read_sql('SELECT * FROM d2', conn)
sql_gt = pd.read_sql('SELECT * FROM gt', conn)

# Preview the first row.
sql_d1.head(1)

PostgreSQL#

from sqlalchemy import create_engine

from urllib.parse import quote_plus

POSTGRES_ADDRESS = 'db' ## INSERT YOUR DB ADDRESS
POSTGRES_PORT = '5439' ## CHANGE THIS TO YOUR POSTGRES PORT (PostgreSQL's default is 5432)
POSTGRES_USERNAME = 'username' ## CHANGE THIS TO YOUR POSTGRES USERNAME
POSTGRES_PASSWORD = 'root' ## CHANGE THIS TO YOUR POSTGRES PASSWORD
POSTGRES_DBNAME = 'database' ## CHANGE THIS TO YOUR DATABASE NAME

# URL-encode the credentials: characters such as '@', ':' or '/' in a username
# or password would otherwise be parsed as URL delimiters and corrupt the
# connection string.
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'.format(
    username=quote_plus(POSTGRES_USERNAME),
    password=quote_plus(POSTGRES_PASSWORD),
    ipaddress=POSTGRES_ADDRESS,
    port=POSTGRES_PORT,
    dbname=POSTGRES_DBNAME
))

# Create the engine; the database connection itself is only opened when
# read_sql runs the query.
cnx = create_engine(postgres_str)
pd.read_sql('SELECT * FROM d1', cnx)

SPARQL Reader#

from pandas import json_normalize
from SPARQLWrapper import SPARQLWrapper, JSON

# Point the wrapper at the public DBpedia SPARQL endpoint.
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

# NOTE(review): the query uses the rdfs: and dbo: prefixes without PREFIX
# declarations, so it presumably relies on the endpoint predefining them.
sparql.setQuery("""
        SELECT *
        WHERE
        {
          ?athlete  rdfs:label      "Cristiano Ronaldo"@en ;
                    dbo:birthPlace  ?place .
         ?place     a               dbo:City ;
                    rdfs:label      ?cityName .
        }
""")

# Ask for a JSON response so it can be converted into nested dicts.
sparql.setReturnFormat(JSON)
response = sparql.query()
results = response.convert()

# Flatten the result bindings into one DataFrame row per binding.
bindings = results["results"]["bindings"]
d1 = json_normalize(bindings)
d1