Duplicate test database
In [ ]:
Copied!
In [1]:
Copied!
# create a duplicate test database (built below from the first records of the merged database)
# create a duplicate test database (built below from the first records of the merged database)
In [2]:
Copied!
# NOTE(review): this cell's code appears twice below — an artifact of the
# notebook export ("Copied!" blocks); the kernel executed it once. The export
# also stripped the leading indentation of the if-bodies (os.chdir /
# sys.path.insert lines) — restore indentation when copying back into .ipynb.
%load_ext autoreload
%autoreload 2
import sys
import os
from pathlib import Path
# Add parent directory to path (works from any notebook in notebooks/)
# the repo_root should be the parent directory of the notebooks folder
current_dir = Path().resolve()
# Determine repo root
if current_dir.name == 'dod2k': repo_root = current_dir
elif current_dir.parent.name == 'dod2k': repo_root = current_dir.parent
else: raise Exception('Please review the repo root structure (see first cell).')
# Update cwd and path only if needed
if os.getcwd() != str(repo_root):
os.chdir(repo_root)
# Make repo-local packages (dod2k_utilities) importable
if str(repo_root) not in sys.path:
sys.path.insert(0, str(repo_root))
print(f"Repo root: {repo_root}")
if str(os.getcwd())==str(repo_root):
print(f"Working directory matches repo root. ")
%load_ext autoreload
%autoreload 2
import sys
import os
from pathlib import Path
# Add parent directory to path (works from any notebook in notebooks/)
# the repo_root should be the parent directory of the notebooks folder
current_dir = Path().resolve()
# Determine repo root
if current_dir.name == 'dod2k': repo_root = current_dir
elif current_dir.parent.name == 'dod2k': repo_root = current_dir.parent
else: raise Exception('Please review the repo root structure (see first cell).')
# Update cwd and path only if needed
if os.getcwd() != str(repo_root):
os.chdir(repo_root)
if str(repo_root) not in sys.path:
sys.path.insert(0, str(repo_root))
print(f"Repo root: {repo_root}")
if str(os.getcwd())==str(repo_root):
print(f"Working directory matches repo root. ")
Repo root: /home/jupyter-lluecke/dod2k_v2.0/dod2k Working directory matches repo root.
In [3]:
Copied!
import pandas as pd
import numpy as np
import datetime
from dod2k_utilities import ut_functions as utf # contains utility functions
from dod2k_utilities import ut_duplicate_search as dup # contains utility functions
import pandas as pd
import numpy as np
import datetime
from dod2k_utilities import ut_functions as utf # contains utility functions
from dod2k_utilities import ut_duplicate_search as dup # contains utility functions
Load dataset¶
Define the dataset which needs to be screened for duplicates. Input files for the duplicate detection mechanism need to be compact dataframes (pandas dataframes with standardised columns and entry formatting).
The function load_compact_dataframe_from_csv loads the dataframe from a csv file from data/DB/, with DB the name of the database. The database name (db_name) can be
pages2k, ch2k, iso2k, sisal, fe23
for the individual databases, or
all_merged
to load the merged database of all individual databases, or can be any user defined compact dataframe.
In [4]:
Copied!
# load dataframe
# db_name selects the compact database; 'all_merged' = all individual
# databases combined into one frame (5320 records per the output below).
db_name='all_merged'
df = utf.load_compact_dataframe_from_csv(db_name)
print(df.info())
# NOTE(review): .name is a plain Python attribute on the DataFrame, not a
# column — pandas silently drops it on most operations (slicing, concat,
# copy), so later cells should not assume it survives transformations.
df.name = db_name
# load dataframe
db_name='all_merged'
df = utf.load_compact_dataframe_from_csv(db_name)
print(df.info())
df.name = db_name
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5320 entries, 0 to 5319 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 archiveType 5320 non-null object 1 dataSetName 5320 non-null object 2 datasetId 5320 non-null object 3 geo_meanElev 5221 non-null float32 4 geo_meanLat 5320 non-null float32 5 geo_meanLon 5320 non-null float32 6 geo_siteName 5320 non-null object 7 interpretation_direction 5320 non-null object 8 interpretation_seasonality 5320 non-null object 9 interpretation_variable 5320 non-null object 10 interpretation_variableDetail 5320 non-null object 11 originalDataURL 5320 non-null object 12 originalDatabase 5320 non-null object 13 paleoData_notes 5320 non-null object 14 paleoData_proxy 5320 non-null object 15 paleoData_sensorSpecies 5320 non-null object 16 paleoData_units 5320 non-null object 17 paleoData_values 5320 non-null object 18 paleoData_variableName 5320 non-null object 19 year 5320 non-null object 20 yearUnits 5320 non-null object dtypes: float32(3), object(18) memory usage: 810.6+ KB None
In [5]:
Copied!
# Build a synthetic duplicate test database: tile the first m records n times.
m = 100  # number of records taken from the source database
n = 1    # copies per record (use n > 1 to create true duplicates)

# Create synthetic duplicates with a clean 0..m*n-1 RangeIndex
dupdf = pd.concat([df[:m]] * n).reset_index(drop=True)

# Give every row a unique, traceable id: record_<original row>_copy_<copy no.>
# (vectorized; replaces the original's dead list assignment — which was
# immediately overwritten — plus a row-wise .loc loop and a redundant final
# index reassignment)
dupdf['datasetId'] = [f"record_{i % m}_copy_{i // m}" for i in range(len(dupdf))]
# Build a synthetic duplicate test database: tile the first m records n times.
m = 100  # number of records taken from the source database
n = 1    # copies per record (use n > 1 to create true duplicates)

# Create synthetic duplicates with a clean 0..m*n-1 RangeIndex
dupdf = pd.concat([df[:m]] * n).reset_index(drop=True)

# Give every row a unique, traceable id: record_<original row>_copy_<copy no.>
# (vectorized; replaces the original's dead list assignment — which was
# immediately overwritten — plus a row-wise .loc loop and a redundant final
# index reassignment)
dupdf['datasetId'] = [f"record_{i % m}_copy_{i // m}" for i in range(len(dupdf))]
In [6]:
Copied!
# Confirm the index is a clean RangeIndex(0, 100) after reset_index
dupdf.index
dupdf.index
Out[6]:
RangeIndex(start=0, stop=100, step=1)
In [7]:
Copied!
# Spot-check the synthetic ids (pattern: record_<original row>_copy_<copy no.>)
dupdf.datasetId
dupdf.datasetId
Out[7]:
0 record_0_copy_0
1 record_1_copy_0
2 record_2_copy_0
3 record_3_copy_0
4 record_4_copy_0
...
95 record_95_copy_0
96 record_96_copy_0
97 record_97_copy_0
98 record_98_copy_0
99 record_99_copy_0
Name: datasetId, Length: 100, dtype: object
Save duplicate free dataframe¶
In [8]:
Copied!
# Pre-save diagnostics: capture the first record's paleoData_values array so
# the csv round-trip (later cells) can be compared against it.
vals_before = dupdf['paleoData_values'].iloc[0]
print("=== BEFORE SAVE ===")
print("Type of paleoData_values[0]:", type(vals_before))
print("Dtype of paleoData_values[0]:", vals_before.dtype)
print("Shape:", vals_before.shape)
print("First 5 values:", vals_before[:5])
print("String repr:", repr(str(vals_before[:5])))
# Pre-save diagnostics: capture the first record's paleoData_values array so
# the csv round-trip (later cells) can be compared against it.
vals_before = dupdf['paleoData_values'].iloc[0]
print("=== BEFORE SAVE ===")
print("Type of paleoData_values[0]:", type(vals_before))
print("Dtype of paleoData_values[0]:", vals_before.dtype)
print("Shape:", vals_before.shape)
print("First 5 values:", vals_before[:5])
print("String repr:", repr(str(vals_before[:5])))
=== BEFORE SAVE === Type of paleoData_values[0]: <class 'numpy.ndarray'> Dtype of paleoData_values[0]: float32 Shape: (1220,) First 5 values: [-33.32873 -35.6732 -33.1574 -34.2854 -34.4031 ] String repr: '[-33.32873 -35.6732 -33.1574 -34.2854 -34.4031 ]'
In [9]:
Copied!
# Order the columns alphabetically, register the test-database name, and make
# sure its output folder exists.
db_out = 'dup_test'
dupdf = dupdf.loc[:, sorted(dupdf.columns)]
dupdf.name = db_out
os.makedirs(f'data/{db_out}/', exist_ok=True)
# Order the columns alphabetically, register the test-database name, and make
# sure its output folder exists.
db_out = 'dup_test'
dupdf = dupdf.loc[:, sorted(dupdf.columns)]
dupdf.name = db_out
os.makedirs(f'data/{db_out}/', exist_ok=True)
In [10]:
Copied!
# Inspect the duplicate test frame's schema and its registered name
dupdf.info()
print(dupdf.name)
dupdf.info()
print(dupdf.name)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100 entries, 0 to 99 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 archiveType 100 non-null object 1 dataSetName 100 non-null object 2 datasetId 100 non-null object 3 geo_meanElev 100 non-null float32 4 geo_meanLat 100 non-null float32 5 geo_meanLon 100 non-null float32 6 geo_siteName 100 non-null object 7 interpretation_direction 100 non-null object 8 interpretation_seasonality 100 non-null object 9 interpretation_variable 100 non-null object 10 interpretation_variableDetail 100 non-null object 11 originalDataURL 100 non-null object 12 originalDatabase 100 non-null object 13 paleoData_notes 100 non-null object 14 paleoData_proxy 100 non-null object 15 paleoData_sensorSpecies 100 non-null object 16 paleoData_units 100 non-null object 17 paleoData_values 100 non-null object 18 paleoData_variableName 100 non-null object 19 year 100 non-null object 20 yearUnits 100 non-null object dtypes: float32(3), object(18) memory usage: 15.4+ KB dup_test
In [11]:
Copied!
# Re-check the schema after column sorting (same 21 columns, 100 rows)
dupdf.info()
dupdf.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100 entries, 0 to 99 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 archiveType 100 non-null object 1 dataSetName 100 non-null object 2 datasetId 100 non-null object 3 geo_meanElev 100 non-null float32 4 geo_meanLat 100 non-null float32 5 geo_meanLon 100 non-null float32 6 geo_siteName 100 non-null object 7 interpretation_direction 100 non-null object 8 interpretation_seasonality 100 non-null object 9 interpretation_variable 100 non-null object 10 interpretation_variableDetail 100 non-null object 11 originalDataURL 100 non-null object 12 originalDatabase 100 non-null object 13 paleoData_notes 100 non-null object 14 paleoData_proxy 100 non-null object 15 paleoData_sensorSpecies 100 non-null object 16 paleoData_units 100 non-null object 17 paleoData_values 100 non-null object 18 paleoData_variableName 100 non-null object 19 year 100 non-null object 20 yearUnits 100 non-null object dtypes: float32(3), object(18) memory usage: 15.4+ KB
save pickle¶
In [12]:
Copied!
# save concatenate dataframe as db_merged
# NOTE(review): the comment above is stale — this saves the duplicate test
# frame (dup_test), not db_merged.
dupdf.to_pickle(f'data/{dupdf.name}/{dupdf.name}_compact.pkl')
# save concatenate dataframe as db_merged
dupdf.to_pickle(f'data/{dupdf.name}/{dupdf.name}_compact.pkl')
save csv¶
In [13]:
Copied!
# save to a list of csv files (metadata, data, year)
# Writes data/dup_test/dup_test_compact_*.csv (see output below for the
# metadata columns written).
utf.write_compact_dataframe_to_csv(dupdf)
# save to a list of csv files (metadata, data, year)
utf.write_compact_dataframe_to_csv(dupdf)
METADATA: datasetId, archiveType, dataSetName, geo_meanElev, geo_meanLat, geo_meanLon, geo_siteName, interpretation_direction, interpretation_seasonality, interpretation_variable, interpretation_variableDetail, originalDataURL, originalDatabase, paleoData_notes, paleoData_proxy, paleoData_sensorSpecies, paleoData_units, paleoData_variableName, yearUnits Saved to /home/jupyter-lluecke/dod2k_v2.0/dod2k/data/dup_test/dup_test_compact_%s.csv
In [14]:
Copied!
# load dataframe
# Round-trip sanity check: reload the csv files just written and verify the
# schema matches the in-memory frame (100 rows, 21 columns).
print(utf.load_compact_dataframe_from_csv(dupdf.name).info())
# load dataframe
print(utf.load_compact_dataframe_from_csv(dupdf.name).info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100 entries, 0 to 99 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 archiveType 100 non-null object 1 dataSetName 100 non-null object 2 datasetId 100 non-null object 3 geo_meanElev 100 non-null float32 4 geo_meanLat 100 non-null float32 5 geo_meanLon 100 non-null float32 6 geo_siteName 100 non-null object 7 interpretation_direction 100 non-null object 8 interpretation_seasonality 100 non-null object 9 interpretation_variable 100 non-null object 10 interpretation_variableDetail 100 non-null object 11 originalDataURL 100 non-null object 12 originalDatabase 100 non-null object 13 paleoData_notes 100 non-null object 14 paleoData_proxy 100 non-null object 15 paleoData_sensorSpecies 100 non-null object 16 paleoData_units 100 non-null object 17 paleoData_values 100 non-null object 18 paleoData_variableName 100 non-null object 19 year 100 non-null object 20 yearUnits 100 non-null object dtypes: float32(3), object(18) memory usage: 15.4+ KB None
In [15]:
Copied!
# Display the 100-row test frame for visual inspection
dupdf
dupdf
Out[15]:
| archiveType | dataSetName | datasetId | geo_meanElev | geo_meanLat | geo_meanLon | geo_siteName | interpretation_direction | interpretation_seasonality | interpretation_variable | ... | originalDataURL | originalDatabase | paleoData_notes | paleoData_proxy | paleoData_sensorSpecies | paleoData_units | paleoData_values | paleoData_variableName | year | yearUnits | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | GlacierIce | Ant-WDC05A.Steig.2013 | record_0_copy_0 | 1806.000000 | -79.459999 | -112.089996 | WDC05A | positive | Annual | temperature | ... | https://www1.ncdc.noaa.gov/pub/data/paleo/page... | PAGES 2k v2.2.0 | ; climateInterpretation_seasonality changed - ... | d18O | nan | permil | [-33.32873, -35.6732, -33.1574, -34.2854, -34.... | d18O | [2005.0, 2004.0, 2003.0, 2002.0, 2001.0, 2000.... | CE |
| 1 | Wood | NAm-MtLemon.Briffa.2002 | record_1_copy_0 | 2700.000000 | 32.500000 | -110.800003 | Mt. Lemon | None | None | None | ... | https://www1.ncdc.noaa.gov/pub/data/paleo/page... | PAGES 2k v2.2.0 | nan | ring width | PSME | cm | [2.76, 2.91, 1.88, 2.51, 2.5, 1.79, 0.915, 0.6... | ring width | [1568.0, 1569.0, 1570.0, 1571.0, 1572.0, 1573.... | CE |
| 2 | Wood | NAm-MtLemon.Briffa.2002 | record_2_copy_0 | 2700.000000 | 32.500000 | -110.800003 | Mt. Lemon | None | None | None | ... | https://www1.ncdc.noaa.gov/pub/data/paleo/page... | PAGES 2k v2.2.0 | nan | ring width | PSME | nan | [1.141, 1.198, 0.881, 1.091, 1.097, 0.873, 0.6... | ring width | [1568.0, 1569.0, 1570.0, 1571.0, 1572.0, 1573.... | CE |
| 3 | Wood | NAm-MtLemon.Briffa.2002 | record_3_copy_0 | 2700.000000 | 32.500000 | -110.800003 | Mt. Lemon | None | None | None | ... | https://www1.ncdc.noaa.gov/pub/data/paleo/page... | PAGES 2k v2.2.0 | nan | residual chronology | PSME | nan | [1.116, 1.152, 0.768, 1.151, 1.075, 0.811, 0.7... | residual chronology | [1568.0, 1569.0, 1570.0, 1571.0, 1572.0, 1573.... | CE |
| 4 | Wood | NAm-MtLemon.Briffa.2002 | record_4_copy_0 | 2700.000000 | 32.500000 | -110.800003 | Mt. Lemon | None | None | None | ... | https://www1.ncdc.noaa.gov/pub/data/paleo/page... | PAGES 2k v2.2.0 | nan | ARSTAN | PSME | nan | [1.143, 1.223, 0.876, 1.1, 1.126, 0.874, 0.679... | ARSTAN | [1568.0, 1569.0, 1570.0, 1571.0, 1572.0, 1573.... | CE |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 95 | Coral | Ocn-Rarotonga_d18O2R.Linsley.2006 | record_95_copy_0 | -18.299999 | -21.237801 | -159.827805 | Rarotonga | None | None | None | ... | https://www1.ncdc.noaa.gov/pub/data/paleo/page... | PAGES 2k v2.2.0 | nan | d13C | lutea | permil | [-2.82, -2.91, -3.01, -3.27, -3.12, -2.84, -2.... | d13C | [1996.91, 1996.78, 1996.66, 1996.53, 1996.41, ... | CE |
| 96 | Wood | Asi-KYRG014.Solomina.2013 | record_96_copy_0 | 69.000000 | 42.419998 | 78.970001 | KYRG014 | positive | Summer | temperature | ... | https://www1.ncdc.noaa.gov/pub/data/paleo/page... | PAGES 2k v2.2.0 | nan | ring width | nan | nan | [0.885, 1.266, 0.865, 0.979, 1.262, 1.032, 1.2... | ring width | [1551.0, 1552.0, 1553.0, 1554.0, 1555.0, 1556.... | CE |
| 97 | Coral | Ocn-Lombok.Charles.2003 | record_97_copy_0 | -3.000000 | -8.247300 | 115.575699 | Lombok | negative | subannual | temperature | ... | https://www1.ncdc.noaa.gov/pub/data/paleo/page... | PAGES 2k v2.2.0 | ; climateInterpretation_seasonality changed - ... | d18O | NA | permil | [-5.284, -5.114, -5.333, -5.365, -5.237, -5.36... | d18O | [1990.0, 1989.92, 1989.83, 1989.75, 1989.67, 1... | CE |
| 98 | LakeSediment | Arc-HudsonLake.Clegg.2011 | record_98_copy_0 | 657.000000 | 61.900002 | -145.660004 | Hudson Lake | positive | Jul | temperature | ... | https://www1.ncdc.noaa.gov/pub/data/paleo/page... | PAGES 2k v2.2.0 | nan | chironomid | nan | degC | [12.4427, 11.8305, 11.9809, 12.1493, 12.684, 1... | temperature | [1996.8, 1982.85, 1963.95, 1952.0, 1934.4, 190... | CE |
| 99 | Coral | Ocn-SavusavuBayFiji.Bagnato.2005 | record_99_copy_0 | -2.000000 | -16.820000 | 179.229996 | Savusavu Bay, Fiji | negative | Annual | temperature | ... | https://www1.ncdc.noaa.gov/pub/data/paleo/page... | PAGES 2k v2.2.0 | ; climateInterpretation_seasonality changed - ... | d18O | heliopora | permil | [-0.046, -0.134, 0.03, 0.363, 0.224, -0.14, 0.... | d18O | [2001.0, 2000.0, 1999.0, 1998.0, 1997.0, 1996.... | CE |
100 rows × 21 columns
In [16]:
Copied!
# Round-trip verification: reload dup_test from csv and confirm the
# paleoData_values arrays survived the save/load cycle unchanged.
dupdf_reloaded = utf.load_compact_dataframe_from_csv('dup_test')

reloaded_vals = dupdf_reloaded['paleoData_values'].iloc[0]
print("\n=== AFTER RELOAD ===")
print("Type of paleoData_values[0]:", type(reloaded_vals))
print("Dtype of paleoData_values[0]:", reloaded_vals.dtype)
print("Shape:", reloaded_vals.shape)
print("First 5 values:", reloaded_vals[:5])
print("String repr:", repr(str(reloaded_vals[:5])))

# Direct comparison against the in-memory frame
original_vals = dupdf['paleoData_values'].iloc[0]
print("\n=== COMPARISON ===")
print("Arrays equal?:", np.array_equal(original_vals, reloaded_vals))
print("Arrays allclose?:", np.allclose(original_vals, reloaded_vals,
                                       equal_nan=True))
# Round-trip verification: reload dup_test from csv and confirm the
# paleoData_values arrays survived the save/load cycle unchanged.
dupdf_reloaded = utf.load_compact_dataframe_from_csv('dup_test')

reloaded_vals = dupdf_reloaded['paleoData_values'].iloc[0]
print("\n=== AFTER RELOAD ===")
print("Type of paleoData_values[0]:", type(reloaded_vals))
print("Dtype of paleoData_values[0]:", reloaded_vals.dtype)
print("Shape:", reloaded_vals.shape)
print("First 5 values:", reloaded_vals[:5])
print("String repr:", repr(str(reloaded_vals[:5])))

# Direct comparison against the in-memory frame
original_vals = dupdf['paleoData_values'].iloc[0]
print("\n=== COMPARISON ===")
print("Arrays equal?:", np.array_equal(original_vals, reloaded_vals))
print("Arrays allclose?:", np.allclose(original_vals, reloaded_vals,
                                       equal_nan=True))
=== AFTER RELOAD === Type of paleoData_values[0]: <class 'numpy.ndarray'> Dtype of paleoData_values[0]: float32 Shape: (1220,) First 5 values: [-33.32873 -35.6732 -33.1574 -34.2854 -34.4031 ] String repr: '[-33.32873 -35.6732 -33.1574 -34.2854 -34.4031 ]' === COMPARISON === Arrays equal?: True Arrays allclose?: True
In [17]:
Copied!
# Final check: working directory is still the repo root
os.getcwd()
os.getcwd()
Out[17]:
'/home/jupyter-lluecke/dod2k_v2.0/dod2k'
In [ ]:
Copied!