Hyper-Parameter Tuning#

Optimization and fine-tuning of a workflow's hyper-parameters using Optuna, a novel hyper-parameter optimization framework.

Install#

pyJedAI is an open-source library that can be installed from PyPI.

%pip install pyjedai -U
%pip show pyjedai

Imports

import plotly.express as px
import logging
import sys
import optuna
import plotly
import os
import sys
import pandas as pd
from optuna.visualization import *
import plotly.io as pio
import plotly.express as px
# Make "plotly_white" the default plotly template for the figures created in this notebook.
pio.templates.default = "plotly_white"

Data Reading#

from pyjedai.datamodel import Data

# Read the two entity collections ('|'-separated). na_filter=False turns off
# pandas' missing-value detection, and every column is then cast to str.
abt_df = pd.read_csv(
    "./../data/ccer/D2/abt.csv", sep='|', engine='python', na_filter=False
).astype(str)
buy_df = pd.read_csv(
    "./../data/ccer/D2/buy.csv", sep='|', engine='python', na_filter=False
).astype(str)

# The ground-truth file is read with pandas' default NaN handling and dtypes.
gt_df = pd.read_csv("./../data/ccer/D2/gt.csv", sep='|', engine='python')

# Both collections expose the same three attributes and use 'id' as identifier.
data = Data(
    dataset_1=abt_df,
    attributes_1=['id', 'name', 'description'],
    id_column_name_1='id',
    dataset_2=buy_df,
    attributes_2=['id', 'name', 'description'],
    id_column_name_2='id',
    ground_truth=gt_df,
)

WorkFlow#

from pyjedai.workflow import WorkFlow, compare_workflows
from pyjedai.block_building import StandardBlocking, QGramsBlocking, ExtendedQGramsBlocking, SuffixArraysBlocking, ExtendedSuffixArraysBlocking
from pyjedai.block_cleaning import BlockFiltering, BlockPurging
from pyjedai.comparison_cleaning import WeightedEdgePruning, WeightedNodePruning, CardinalityEdgePruning, CardinalityNodePruning, BLAST, ReciprocalCardinalityNodePruning, ReciprocalWeightedNodePruning, ComparisonPropagation
from pyjedai.matching import EntityMatching
from pyjedai.clustering import ConnectedComponentsClustering
# Settings for the Optuna study: its name and the SQLite URL it is stored under.
title = "Test"
db_name = "pyjedai"
study_name = title  # Unique identifier of the study.
storage_name = f"sqlite:///{db_name}.db"

Objective function#

In the cell below, we define which parameters we want to fine-tune and the range of values suggested for each of them. We also set the F1-score as the objective to be maximized.

def objective(trial):
    """OPTUNA objective function.

    Builds a pyJedAI ``WorkFlow`` whose tunable parameters are suggested by
    ``trial``, runs it on the module-level ``data`` and returns the score
    that the study optimizes.

    Args:
        trial: Optuna trial used to suggest
            ``qgrams`` (int in [3, 10]),
            ``ratio`` (float in [0.7, 0.95]) and
            ``similarity_threshold`` (float in [0.05, 0.9]).

    Returns:
        The first element of ``WorkFlow.get_final_scores()``, which this
        notebook treats as the F1-score.
    """
    # Suggest in the same order as the parameters appear in the workflow:
    # block building -> block cleaning -> entity matching.
    qgrams = trial.suggest_int("qgrams", 3, 10)
    ratio = trial.suggest_float("ratio", 0.7, 0.95)
    similarity_threshold = trial.suggest_float("similarity_threshold", 0.05, 0.9)

    w = WorkFlow(
        block_building=dict(
            method=QGramsBlocking,
            params=dict(qgrams=qgrams),
            attributes_1=['name'],
            attributes_2=['name']
        ),
        block_cleaning=[
            # Purging uses a fixed smoothing factor; only the filtering ratio is tuned.
            dict(
                method=BlockPurging,
                params=dict(smoothing_factor=1.025)
            ),
            dict(
                method=BlockFiltering,
                params=dict(ratio=ratio)
            )
        ],
        comparison_cleaning=dict(method=CardinalityEdgePruning),
        entity_matching=dict(
            method=EntityMatching,
            metric='sorensen_dice',
            similarity_threshold=similarity_threshold,
            attributes=['description', 'name']
        ),
        clustering=dict(method=ConnectedComponentsClustering),
        name="Worflow-Test"
    )
    # Progress bars and verbose output are disabled for each trial's run.
    w.run(data, workflow_step_tqdm_disable=True, verbose=False)

    # Scores come back as (f1, precision, recall); only F1 is optimized.
    f1, _, _ = w.get_final_scores()
    return f1
# --- Create the study and run the search ---------------------------------
study_name = title  # Unique identifier of the study.
num_of_trials = 30

# Single objective to maximize (the value returned by `objective`).
# load_if_exists=True is passed so that an already stored study with this
# name is loaded rather than raising an error.
study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    directions=["maximize"],
    load_if_exists=True,
)

print("Optuna trials starting")
study.optimize(objective, n_trials=num_of_trials, show_progress_bar=True)
print("Optuna trials finished")
[I 2022-09-26 17:11:56,515] A new study created in RDB with name: Test
Optuna trials starting
C:\Users\nikol\AppData\Local\Programs\Python\Python310\lib\site-packages\optuna\progress_bar.py:49: ExperimentalWarning: Progress bar is experimental (supported from v1.2.0). The interface can change in the future.
  self._init_valid()
[I 2022-09-26 17:12:08,614] Trial 0 finished with value: 0.30337436666113177 and parameters: {'qgrams': 8, 'ratio': 0.8380947452182991, 'similarity_threshold': 0.34701140984689427}. Best is trial 0 with value: 0.30337436666113177.
[I 2022-09-26 17:12:30,648] Trial 1 finished with value: 0.20307681243216183 and parameters: {'qgrams': 5, 'ratio': 0.7929630924927731, 'similarity_threshold': 0.3138589895822442}. Best is trial 0 with value: 0.30337436666113177.
[I 2022-09-26 17:12:53,415] Trial 2 finished with value: 0.19103604207409036 and parameters: {'qgrams': 4, 'ratio': 0.8038691888459086, 'similarity_threshold': 0.1331382386125572}. Best is trial 0 with value: 0.30337436666113177.
[I 2022-09-26 17:12:58,756] Trial 3 finished with value: 0.28333512688101153 and parameters: {'qgrams': 7, 'ratio': 0.7144467000567123, 'similarity_threshold': 0.38959392590704467}. Best is trial 0 with value: 0.30337436666113177.
[I 2022-09-26 17:13:03,935] Trial 4 finished with value: 0.4633111426794054 and parameters: {'qgrams': 10, 'ratio': 0.8517624194151302, 'similarity_threshold': 0.1658021229910926}. Best is trial 4 with value: 0.4633111426794054.
[I 2022-09-26 17:13:26,462] Trial 5 finished with value: 0.18531875170393172 and parameters: {'qgrams': 3, 'ratio': 0.9174470432736699, 'similarity_threshold': 0.8777133320453102}. Best is trial 4 with value: 0.4633111426794054.
[I 2022-09-26 17:13:38,772] Trial 6 finished with value: 0.1907552827778649 and parameters: {'qgrams': 4, 'ratio': 0.7808721328696897, 'similarity_threshold': 0.10683080190597335}. Best is trial 4 with value: 0.4633111426794054.
[I 2022-09-26 17:13:43,365] Trial 7 finished with value: 0.33330840997266736 and parameters: {'qgrams': 9, 'ratio': 0.800827464931477, 'similarity_threshold': 0.40948711496314116}. Best is trial 4 with value: 0.4633111426794054.
[I 2022-09-26 17:13:47,298] Trial 8 finished with value: 0.39784787794552795 and parameters: {'qgrams': 9, 'ratio': 0.7395142458665667, 'similarity_threshold': 0.830695162394687}. Best is trial 4 with value: 0.4633111426794054.
[I 2022-09-26 17:14:04,391] Trial 9 finished with value: 0.1862693152521443 and parameters: {'qgrams': 3, 'ratio': 0.8596838078185782, 'similarity_threshold': 0.056875572246384}. Best is trial 4 with value: 0.4633111426794054.
[I 2022-09-26 17:14:08,972] Trial 10 finished with value: 0.6482633708392243 and parameters: {'qgrams': 10, 'ratio': 0.8957263433574094, 'similarity_threshold': 0.630359849425508}. Best is trial 10 with value: 0.6482633708392243.
[I 2022-09-26 17:14:13,289] Trial 11 finished with value: 0.5274133516352982 and parameters: {'qgrams': 10, 'ratio': 0.9045625386867897, 'similarity_threshold': 0.6392297322807924}. Best is trial 10 with value: 0.6482633708392243.
[I 2022-09-26 17:14:18,068] Trial 12 finished with value: 0.42621654591235925 and parameters: {'qgrams': 10, 'ratio': 0.9483018131894073, 'similarity_threshold': 0.669716499745008}. Best is trial 10 with value: 0.6482633708392243.
[I 2022-09-26 17:14:23,897] Trial 13 finished with value: 0.22112929238626436 and parameters: {'qgrams': 7, 'ratio': 0.8961116303610029, 'similarity_threshold': 0.6036793964995847}. Best is trial 10 with value: 0.6482633708392243.
[I 2022-09-26 17:14:28,293] Trial 14 finished with value: 0.4070833316320489 and parameters: {'qgrams': 9, 'ratio': 0.8983635553727757, 'similarity_threshold': 0.5808143891387648}. Best is trial 10 with value: 0.6482633708392243.
[I 2022-09-26 17:14:33,109] Trial 15 finished with value: 0.42621654591235925 and parameters: {'qgrams': 10, 'ratio': 0.9488029595469605, 'similarity_threshold': 0.7454617001392468}. Best is trial 10 with value: 0.6482633708392243.
[I 2022-09-26 17:14:40,257] Trial 16 finished with value: 0.2123846817548284 and parameters: {'qgrams': 6, 'ratio': 0.8777239752940389, 'similarity_threshold': 0.5114877692227677}. Best is trial 10 with value: 0.6482633708392243.
[I 2022-09-26 17:14:45,396] Trial 17 finished with value: 0.3258435026582389 and parameters: {'qgrams': 8, 'ratio': 0.9204153410113451, 'similarity_threshold': 0.7435281257384602}. Best is trial 10 with value: 0.6482633708392243.
[I 2022-09-26 17:14:50,167] Trial 18 finished with value: 0.3480318083295157 and parameters: {'qgrams': 8, 'ratio': 0.875183707875409, 'similarity_threshold': 0.5035164496571632}. Best is trial 10 with value: 0.6482633708392243.
[I 2022-09-26 17:14:54,146] Trial 19 finished with value: 0.36391860524716446 and parameters: {'qgrams': 10, 'ratio': 0.8294161917860436, 'similarity_threshold': 0.6808222011656786}. Best is trial 10 with value: 0.6482633708392243.
[I 2022-09-26 17:15:01,379] Trial 20 finished with value: 0.20993148042255388 and parameters: {'qgrams': 6, 'ratio': 0.9196762030928591, 'similarity_threshold': 0.5795471650862246}. Best is trial 10 with value: 0.6482633708392243.
[I 2022-09-26 17:15:05,584] Trial 21 finished with value: 0.46330684097155167 and parameters: {'qgrams': 10, 'ratio': 0.8563838420348091, 'similarity_threshold': 0.23323189289645876}. Best is trial 10 with value: 0.6482633708392243.
[I 2022-09-26 17:15:09,902] Trial 22 finished with value: 0.4070833316320489 and parameters: {'qgrams': 9, 'ratio': 0.8948541483847852, 'similarity_threshold': 0.2644175908762265}. Best is trial 10 with value: 0.6482633708392243.
[I 2022-09-26 17:15:14,091] Trial 23 finished with value: 0.42745701777773953 and parameters: {'qgrams': 10, 'ratio': 0.8429055971815752, 'similarity_threshold': 0.4726717116974519}. Best is trial 10 with value: 0.6482633708392243.
[I 2022-09-26 17:15:18,489] Trial 24 finished with value: 0.40750648259180683 and parameters: {'qgrams': 9, 'ratio': 0.876132015812083, 'similarity_threshold': 0.6726766450815258}. Best is trial 10 with value: 0.6482633708392243.
[I 2022-09-26 17:15:23,589] Trial 25 finished with value: 0.3258414249026382 and parameters: {'qgrams': 8, 'ratio': 0.9275690010126776, 'similarity_threshold': 0.16506027753408292}. Best is trial 10 with value: 0.6482633708392243.
[I 2022-09-26 17:15:27,909] Trial 26 finished with value: 0.46330684097155167 and parameters: {'qgrams': 10, 'ratio': 0.8601034741266624, 'similarity_threshold': 0.7695051840189288}. Best is trial 10 with value: 0.6482633708392243.
[I 2022-09-26 17:15:31,804] Trial 27 finished with value: 0.4037641499510466 and parameters: {'qgrams': 9, 'ratio': 0.7662074140900922, 'similarity_threshold': 0.44798780615040656}. Best is trial 10 with value: 0.6482633708392243.
[I 2022-09-26 17:15:36,144] Trial 28 finished with value: 0.3639219361818585 and parameters: {'qgrams': 10, 'ratio': 0.8174855005418883, 'similarity_threshold': 0.6110684353074003}. Best is trial 10 with value: 0.6482633708392243.
[I 2022-09-26 17:15:43,271] Trial 29 finished with value: 0.24080227825457184 and parameters: {'qgrams': 7, 'ratio': 0.8389446905059951, 'similarity_threshold': 0.35743796412615136}. Best is trial 10 with value: 0.6482633708392243.
Optuna trials finished

Optuna Visualizations#

# Tabulate the trials, keeping only these attributes of each one.
trial_columns = ("number", "value", "params", "state")
study.trials_dataframe(attrs=trial_columns)
number value params_qgrams params_ratio params_similarity_threshold state
0 0 0.303374 8 0.838095 0.347011 COMPLETE
1 1 0.203077 5 0.792963 0.313859 COMPLETE
2 2 0.191036 4 0.803869 0.133138 COMPLETE
3 3 0.283335 7 0.714447 0.389594 COMPLETE
4 4 0.463311 10 0.851762 0.165802 COMPLETE
5 5 0.185319 3 0.917447 0.877713 COMPLETE
6 6 0.190755 4 0.780872 0.106831 COMPLETE
7 7 0.333308 9 0.800827 0.409487 COMPLETE
8 8 0.397848 9 0.739514 0.830695 COMPLETE
9 9 0.186269 3 0.859684 0.056876 COMPLETE
10 10 0.648263 10 0.895726 0.630360 COMPLETE
11 11 0.527413 10 0.904563 0.639230 COMPLETE
12 12 0.426217 10 0.948302 0.669716 COMPLETE
13 13 0.221129 7 0.896112 0.603679 COMPLETE
14 14 0.407083 9 0.898364 0.580814 COMPLETE
15 15 0.426217 10 0.948803 0.745462 COMPLETE
16 16 0.212385 6 0.877724 0.511488 COMPLETE
17 17 0.325844 8 0.920415 0.743528 COMPLETE
18 18 0.348032 8 0.875184 0.503516 COMPLETE
19 19 0.363919 10 0.829416 0.680822 COMPLETE
20 20 0.209931 6 0.919676 0.579547 COMPLETE
21 21 0.463307 10 0.856384 0.233232 COMPLETE
22 22 0.407083 9 0.894854 0.264418 COMPLETE
23 23 0.427457 10 0.842906 0.472672 COMPLETE
24 24 0.407506 9 0.876132 0.672677 COMPLETE
25 25 0.325841 8 0.927569 0.165060 COMPLETE
26 26 0.463307 10 0.860103 0.769505 COMPLETE
27 27 0.403764 9 0.766207 0.447988 COMPLETE
28 28 0.363922 10 0.817486 0.611068 COMPLETE
29 29 0.240802 7 0.838945 0.357438 COMPLETE
# Optimization history of the study.
fig = plot_optimization_history(study)
fig.show()

# Parallel-coordinate plot: first over all parameters, then restricted to `qgrams`.
fig = plot_parallel_coordinate(study)
fig.show()
fig = plot_parallel_coordinate(study, params=["qgrams"])
fig.show()

# Contour plot: first over all parameters, then restricted to `qgrams` and `ratio`.
fig = plot_contour(study)
fig.show()
fig = plot_contour(study, params=["qgrams", "ratio"])
fig.show()

# Slice plot: first over all parameters, then restricted to `qgrams` and `ratio`.
# (The original emitted the restricted plot twice; the first call now follows
# the same "all, then subset" pattern as the plots above.)
fig = plot_slice(study)
fig.show()
fig = plot_slice(study, params=["qgrams", "ratio"])
fig.show()

# Parameter importances with respect to the objective value.
fig = plot_param_importances(study)
fig.show()

# Empirical distribution function plot of the study.
fig = plot_edf(study)
fig.show()

# Parameter importances with respect to each trial's duration in seconds
# instead of the objective value.
fig = optuna.visualization.plot_param_importances(
    study, target=lambda t: t.duration.total_seconds(), target_name="duration"
)
fig.show()

K. Nikoletos, J. Maciejewski, G. Papadakis & M. Koubarakis