Duplicate test database
In [ ]:
Copied!
In [1]:
Copied!
# create a duplicate test database (built below from the first records of the merged database)
# create a duplicate test database (built below from the first records of the merged database)
In [2]:
Copied!
# NOTE(review): this cell's code appears twice below — an artifact of the
# notebook export ("Copied!" blocks); the kernel executed it once. The export
# also stripped the leading indentation of the if-bodies (os.chdir /
# sys.path.insert lines) — restore indentation when copying back into .ipynb.
%load_ext autoreload
%autoreload 2
import sys
import os
from pathlib import Path
# Add parent directory to path (works from any notebook in notebooks/)
# the repo_root should be the parent directory of the notebooks folder
current_dir = Path().resolve()
# Determine repo root
if current_dir.name == 'dod2k': repo_root = current_dir
elif current_dir.parent.name == 'dod2k': repo_root = current_dir.parent
else: raise Exception('Please review the repo root structure (see first cell).')
# Update cwd and path only if needed
if os.getcwd() != str(repo_root):
os.chdir(repo_root)
# Make repo-local packages (dod2k_utilities) importable
if str(repo_root) not in sys.path:
sys.path.insert(0, str(repo_root))
print(f"Repo root: {repo_root}")
if str(os.getcwd())==str(repo_root):
print(f"Working directory matches repo root. ")
%load_ext autoreload
%autoreload 2
import sys
import os
from pathlib import Path
# Add parent directory to path (works from any notebook in notebooks/)
# the repo_root should be the parent directory of the notebooks folder
current_dir = Path().resolve()
# Determine repo root
if current_dir.name == 'dod2k': repo_root = current_dir
elif current_dir.parent.name == 'dod2k': repo_root = current_dir.parent
else: raise Exception('Please review the repo root structure (see first cell).')
# Update cwd and path only if needed
if os.getcwd() != str(repo_root):
os.chdir(repo_root)
if str(repo_root) not in sys.path:
sys.path.insert(0, str(repo_root))
print(f"Repo root: {repo_root}")
if str(os.getcwd())==str(repo_root):
print(f"Working directory matches repo root. ")
Repo root: /home/jupyter-lluecke/dod2k_v2.0/dod2k Working directory matches repo root.
In [3]:
Copied!
import pandas as pd
import numpy as np
import datetime
from dod2k_utilities import ut_functions as utf # contains utility functions
from dod2k_utilities import ut_duplicate_search as dup # contains utility functions
import pandas as pd
import numpy as np
import datetime
from dod2k_utilities import ut_functions as utf # contains utility functions
from dod2k_utilities import ut_duplicate_search as dup # contains utility functions
Load dataset¶
Define the dataset which needs to be screened for duplicates. Input files for the duplicate detection mechanism need to be compact dataframes (pandas dataframes with standardised columns and entry formatting).
The function load_compact_dataframe_from_csv loads the dataframe from a csv file from data/DB/, with DB the name of the database. The database name (db_name) can be
pages2k, ch2k, iso2k, sisal, fe23
for the individual databases, or
all_merged
to load the merged database of all individual databases, or can be any user defined compact dataframe.
In [4]:
Copied!
# load dataframe
# db_name selects the compact database; 'all_merged' = all individual
# databases combined into one frame (5320 records per the output below).
db_name='all_merged'
df = utf.load_compact_dataframe_from_csv(db_name)
print(df.info())
# NOTE(review): .name is a plain Python attribute on the DataFrame, not a
# column — pandas silently drops it on most operations (slicing, concat,
# copy), so later cells should not assume it survives transformations.
df.name = db_name
# load dataframe
db_name='all_merged'
df = utf.load_compact_dataframe_from_csv(db_name)
print(df.info())
df.name = db_name
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5320 entries, 0 to 5319 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 archiveType 5320 non-null object 1 dataSetName 5320 non-null object 2 datasetId 5320 non-null object 3 geo_meanElev 5221 non-null float32 4 geo_meanLat 5320 non-null float32 5 geo_meanLon 5320 non-null float32 6 geo_siteName 5320 non-null object 7 interpretation_direction 5320 non-null object 8 interpretation_seasonality 5320 non-null object 9 interpretation_variable 5320 non-null object 10 interpretation_variableDetail 5320 non-null object 11 originalDataURL 5320 non-null object 12 originalDatabase 5320 non-null object 13 paleoData_notes 5320 non-null object 14 paleoData_proxy 5320 non-null object 15 paleoData_sensorSpecies 5320 non-null object 16 paleoData_units 5320 non-null object 17 paleoData_values 5320 non-null object 18 paleoData_variableName 5320 non-null object 19 year 5320 non-null object 20 yearUnits 5320 non-null object dtypes: float32(3), object(18) memory usage: 810.6+ KB None
In [5]:
Copied!
# Build a synthetic duplicate test database: tile the first m records n times.
m = 100  # number of records taken from the source database
n = 1    # copies per record (use n > 1 to create true duplicates)

# Create synthetic duplicates with a clean 0..m*n-1 RangeIndex
dupdf = pd.concat([df[:m]] * n).reset_index(drop=True)

# Give every row a unique, traceable id: record_<original row>_copy_<copy no.>
# (vectorized; replaces the original's dead list assignment — which was
# immediately overwritten — plus a row-wise .loc loop and a redundant final
# index reassignment)
dupdf['datasetId'] = [f"record_{i % m}_copy_{i // m}" for i in range(len(dupdf))]
# Build a synthetic duplicate test database: tile the first m records n times.
m = 100  # number of records taken from the source database
n = 1    # copies per record (use n > 1 to create true duplicates)

# Create synthetic duplicates with a clean 0..m*n-1 RangeIndex
dupdf = pd.concat([df[:m]] * n).reset_index(drop=True)

# Give every row a unique, traceable id: record_<original row>_copy_<copy no.>
# (vectorized; replaces the original's dead list assignment — which was
# immediately overwritten — plus a row-wise .loc loop and a redundant final
# index reassignment)
dupdf['datasetId'] = [f"record_{i % m}_copy_{i // m}" for i in range(len(dupdf))]
In [6]:
Copied!
# Confirm the index is a clean RangeIndex(0, 100) after reset_index
dupdf.index
dupdf.index
Out[6]:
RangeIndex(start=0, stop=100, step=1)
In [7]:
Copied!
# Spot-check the synthetic ids (pattern: record_<original row>_copy_<copy no.>)
dupdf.datasetId
dupdf.datasetId
Out[7]:
0 record_0_copy_0
1 record_1_copy_0
2 record_2_copy_0
3 record_3_copy_0
4 record_4_copy_0
...
95 record_95_copy_0
96 record_96_copy_0
97 record_97_copy_0
98 record_98_copy_0
99 record_99_copy_0
Name: datasetId, Length: 100, dtype: object
Save duplicate free dataframe¶
In [8]:
Copied!
# Pre-save diagnostics: capture the first record's paleoData_values array so
# the csv round-trip (later cells) can be compared against it.
vals_before = dupdf['paleoData_values'].iloc[0]
print("=== BEFORE SAVE ===")
print("Type of paleoData_values[0]:", type(vals_before))
print("Dtype of paleoData_values[0]:", vals_before.dtype)
print("Shape:", vals_before.shape)
print("First 5 values:", vals_before[:5])
print("String repr:", repr(str(vals_before[:5])))
# Pre-save diagnostics: capture the first record's paleoData_values array so
# the csv round-trip (later cells) can be compared against it.
vals_before = dupdf['paleoData_values'].iloc[0]
print("=== BEFORE SAVE ===")
print("Type of paleoData_values[0]:", type(vals_before))
print("Dtype of paleoData_values[0]:", vals_before.dtype)
print("Shape:", vals_before.shape)
print("First 5 values:", vals_before[:5])
print("String repr:", repr(str(vals_before[:5])))
=== BEFORE SAVE === Type of paleoData_values[0]: <class 'numpy.ndarray'> Dtype of paleoData_values[0]: float32 Shape: (1220,) First 5 values: [-33.32873 -35.6732 -33.1574 -34.2854 -34.4031 ] String repr: '[-33.32873 -35.6732 -33.1574 -34.2854 -34.4031 ]'
In [9]:
Copied!
# Order the columns alphabetically, register the test-database name, and make
# sure its output folder exists.
db_out = 'dup_test'
dupdf = dupdf.loc[:, sorted(dupdf.columns)]
dupdf.name = db_out
os.makedirs(f'data/{db_out}/', exist_ok=True)
# Order the columns alphabetically, register the test-database name, and make
# sure its output folder exists.
db_out = 'dup_test'
dupdf = dupdf.loc[:, sorted(dupdf.columns)]
dupdf.name = db_out
os.makedirs(f'data/{db_out}/', exist_ok=True)
In [10]:
Copied!
# Inspect the duplicate test frame's schema and its registered name
dupdf.info()
print(dupdf.name)
dupdf.info()
print(dupdf.name)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100 entries, 0 to 99 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 archiveType 100 non-null object 1 dataSetName 100 non-null object 2 datasetId 100 non-null object 3 geo_meanElev 100 non-null float32 4 geo_meanLat 100 non-null float32 5 geo_meanLon 100 non-null float32 6 geo_siteName 100 non-null object 7 interpretation_direction 100 non-null object 8 interpretation_seasonality 100 non-null object 9 interpretation_variable 100 non-null object 10 interpretation_variableDetail 100 non-null object 11 originalDataURL 100 non-null object 12 originalDatabase 100 non-null object 13 paleoData_notes 100 non-null object 14 paleoData_proxy 100 non-null object 15 paleoData_sensorSpecies 100 non-null object 16 paleoData_units 100 non-null object 17 paleoData_values 100 non-null object 18 paleoData_variableName 100 non-null object 19 year 100 non-null object 20 yearUnits 100 non-null object dtypes: float32(3), object(18) memory usage: 15.4+ KB dup_test
In [11]:
Copied!
# Re-check the schema after column sorting (same 21 columns, 100 rows)
dupdf.info()
dupdf.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100 entries, 0 to 99 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 archiveType 100 non-null object 1 dataSetName 100 non-null object 2 datasetId 100 non-null object 3 geo_meanElev 100 non-null float32 4 geo_meanLat 100 non-null float32 5 geo_meanLon 100 non-null float32 6 geo_siteName 100 non-null object 7 interpretation_direction 100 non-null object 8 interpretation_seasonality 100 non-null object 9 interpretation_variable 100 non-null object 10 interpretation_variableDetail 100 non-null object 11 originalDataURL 100 non-null object 12 originalDatabase 100 non-null object 13 paleoData_notes 100 non-null object 14 paleoData_proxy 100 non-null object 15 paleoData_sensorSpecies 100 non-null object 16 paleoData_units 100 non-null object 17 paleoData_values 100 non-null object 18 paleoData_variableName 100 non-null object 19 year 100 non-null object 20 yearUnits 100 non-null object dtypes: float32(3), object(18) memory usage: 15.4+ KB
save pickle¶
In [12]:
Copied!
# save concatenate dataframe as db_merged
# NOTE(review): the comment above is stale — this saves the duplicate test
# frame (dup_test), not db_merged.
dupdf.to_pickle(f'data/{dupdf.name}/{dupdf.name}_compact.pkl')
# save concatenate dataframe as db_merged
dupdf.to_pickle(f'data/{dupdf.name}/{dupdf.name}_compact.pkl')
save csv¶
In [13]:
Copied!
# save to a list of csv files (metadata, data, year)
# Writes data/dup_test/dup_test_compact_*.csv (see output below for the
# metadata columns written).
utf.write_compact_dataframe_to_csv(dupdf)
# save to a list of csv files (metadata, data, year)
utf.write_compact_dataframe_to_csv(dupdf)
METADATA: datasetId, archiveType, dataSetName, geo_meanElev, geo_meanLat, geo_meanLon, geo_siteName, interpretation_direction, interpretation_seasonality, interpretation_variable, interpretation_variableDetail, originalDataURL, originalDatabase, paleoData_notes, paleoData_proxy, paleoData_sensorSpecies, paleoData_units, paleoData_variableName, yearUnits Saved to /home/jupyter-lluecke/dod2k_v2.0/dod2k/data/dup_test/dup_test_compact_%s.csv
In [14]:
Copied!
# load dataframe
# Round-trip sanity check: reload the csv files just written and verify the
# schema matches the in-memory frame (100 rows, 21 columns).
print(utf.load_compact_dataframe_from_csv(dupdf.name).info())
# load dataframe
print(utf.load_compact_dataframe_from_csv(dupdf.name).info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100 entries, 0 to 99 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 archiveType 100 non-null object 1 dataSetName 100 non-null object 2 datasetId 100 non-null object 3 geo_meanElev 100 non-null float32 4 geo_meanLat 100 non-null float32 5 geo_meanLon 100 non-null float32 6 geo_siteName 100 non-null object 7 interpretation_direction 100 non-null object 8 interpretation_seasonality 100 non-null object 9 interpretation_variable 100 non-null object 10 interpretation_variableDetail 100 non-null object 11 originalDataURL 100 non-null object 12 originalDatabase 100 non-null object 13 paleoData_notes 100 non-null object 14 paleoData_proxy 100 non-null object 15 paleoData_sensorSpecies 100 non-null object 16 paleoData_units 100 non-null object 17 paleoData_values 100 non-null object 18 paleoData_variableName 100 non-null object 19 year 100 non-null object 20 yearUnits 100 non-null object dtypes: float32(3), object(18) memory usage: 15.4+ KB None
In [15]:
Copied!
# Display the 100-row test frame for visual inspection
dupdf
dupdf
Out[15]:
| archiveType | dataSetName | datasetId | geo_meanElev | geo_meanLat | geo_meanLon | geo_siteName | interpretation_direction | interpretation_seasonality | interpretation_variable | ... | originalDataURL | originalDatabase | paleoData_notes | paleoData_proxy | paleoData_sensorSpecies | paleoData_units | paleoData_values | paleoData_variableName | year | yearUnits | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | GlacierIce | Ant-WDC05A.Steig.2013 | record_0_copy_0 | 1806.000000 | -79.459999 | -112.089996 | WDC05A | positive | Annual | temperature | ... | https://www1.ncdc.noaa.gov/pub/data/paleo/page... | PAGES 2k v2.2.0 | ; climateInterpretation_seasonality changed - ... | d18O | nan | permil | [-33.32873, -35.6732, -33.1574, -34.2854, -34.... | d18O | [2005.0, 2004.0, 2003.0, 2002.0, 2001.0, 2000.... | CE |
| 1 | Wood | NAm-MtLemon.Briffa.2002 | record_1_copy_0 | 2700.000000 | 32.500000 | -110.800003 | Mt. Lemon | None | None | None | ... | https://www1.ncdc.noaa.gov/pub/data/paleo/page... | PAGES 2k v2.2.0 | nan | ring width | PSME | cm | [2.76, 2.91, 1.88, 2.51, 2.5, 1.79, 0.915, 0.6... | ring width | [1568.0, 1569.0, 1570.0, 1571.0, 1572.0, 1573.... | CE |
| 2 | Wood | NAm-MtLemon.Briffa.2002 | record_2_copy_0 | 2700.000000 | 32.500000 | -110.800003 | Mt. Lemon | None | None | None | ... | https://www1.ncdc.noaa.gov/pub/data/paleo/page... | PAGES 2k v2.2.0 | nan | ring width | PSME | nan | [1.141, 1.198, 0.881, 1.091, 1.097, 0.873, 0.6... | ring width | [1568.0, 1569.0, 1570.0, 1571.0, 1572.0, 1573.... | CE |
| 3 | Wood | NAm-MtLemon.Briffa.2002 | record_3_copy_0 | 2700.000000 | 32.500000 | -110.800003 | Mt. Lemon | None | None | None | ... | https://www1.ncdc.noaa.gov/pub/data/paleo/page... | PAGES 2k v2.2.0 | nan | residual chronology | PSME | nan | [1.116, 1.152, 0.768, 1.151, 1.075, 0.811, 0.7... | residual chronology | [1568.0, 1569.0, 1570.0, 1571.0, 1572.0, 1573.... | CE |
| 4 | Wood | NAm-MtLemon.Briffa.2002 | record_4_copy_0 | 2700.000000 | 32.500000 | -110.800003 | Mt. Lemon | None | None | None | ... | https://www1.ncdc.noaa.gov/pub/data/paleo/page... | PAGES 2k v2.2.0 | nan | ARSTAN | PSME | nan | [1.143, 1.223, 0.876, 1.1, 1.126, 0.874, 0.679... | ARSTAN | [1568.0, 1569.0, 1570.0, 1571.0, 1572.0, 1573.... | CE |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 95 | Coral | Ocn-Rarotonga_d18O2R.Linsley.2006 | record_95_copy_0 | -18.299999 | -21.237801 | -159.827805 | Rarotonga | None | None | None | ... | https://www1.ncdc.noaa.gov/pub/data/paleo/page... | PAGES 2k v2.2.0 | nan | d13C | lutea | permil | [-2.82, -2.91, -3.01, -3.27, -3.12, -2.84, -2.... | d13C | [1996.91, 1996.78, 1996.66, 1996.53, 1996.41, ... | CE |
| 96 | Wood | Asi-KYRG014.Solomina.2013 | record_96_copy_0 | 69.000000 | 42.419998 | 78.970001 | KYRG014 | positive | Summer | temperature | ... | https://www1.ncdc.noaa.gov/pub/data/paleo/page... | PAGES 2k v2.2.0 | nan | ring width | nan | nan | [0.885, 1.266, 0.865, 0.979, 1.262, 1.032, 1.2... | ring width | [1551.0, 1552.0, 1553.0, 1554.0, 1555.0, 1556.... | CE |
| 97 | Coral | Ocn-Lombok.Charles.2003 | record_97_copy_0 | -3.000000 | -8.247300 | 115.575699 | Lombok | negative | subannual | temperature | ... | https://www1.ncdc.noaa.gov/pub/data/paleo/page... | PAGES 2k v2.2.0 | ; climateInterpretation_seasonality changed - ... | d18O | NA | permil | [-5.284, -5.114, -5.333, -5.365, -5.237, -5.36... | d18O | [1990.0, 1989.92, 1989.83, 1989.75, 1989.67, 1... | CE |
| 98 | LakeSediment | Arc-HudsonLake.Clegg.2011 | record_98_copy_0 | 657.000000 | 61.900002 | -145.660004 | Hudson Lake | positive | Jul | temperature | ... | https://www1.ncdc.noaa.gov/pub/data/paleo/page... | PAGES 2k v2.2.0 | nan | chironomid | nan | degC | [12.4427, 11.8305, 11.9809, 12.1493, 12.684, 1... | temperature | [1996.8, 1982.85, 1963.95, 1952.0, 1934.4, 190... | CE |
| 99 | Coral | Ocn-SavusavuBayFiji.Bagnato.2005 | record_99_copy_0 | -2.000000 | -16.820000 | 179.229996 | Savusavu Bay, Fiji | negative | Annual | temperature | ... | https://www1.ncdc.noaa.gov/pub/data/paleo/page... | PAGES 2k v2.2.0 | ; climateInterpretation_seasonality changed - ... | d18O | heliopora | permil | [-0.046, -0.134, 0.03, 0.363, 0.224, -0.14, 0.... | d18O | [2001.0, 2000.0, 1999.0, 1998.0, 1997.0, 1996.... | CE |
100 rows × 21 columns
In [16]:
Copied!
# Round-trip verification: reload dup_test from csv and confirm the
# paleoData_values arrays survived the save/load cycle unchanged.
dupdf_reloaded = utf.load_compact_dataframe_from_csv('dup_test')

reloaded_vals = dupdf_reloaded['paleoData_values'].iloc[0]
print("\n=== AFTER RELOAD ===")
print("Type of paleoData_values[0]:", type(reloaded_vals))
print("Dtype of paleoData_values[0]:", reloaded_vals.dtype)
print("Shape:", reloaded_vals.shape)
print("First 5 values:", reloaded_vals[:5])
print("String repr:", repr(str(reloaded_vals[:5])))

# Direct comparison against the in-memory frame
original_vals = dupdf['paleoData_values'].iloc[0]
print("\n=== COMPARISON ===")
print("Arrays equal?:", np.array_equal(original_vals, reloaded_vals))
print("Arrays allclose?:", np.allclose(original_vals, reloaded_vals,
                                       equal_nan=True))
# Round-trip verification: reload dup_test from csv and confirm the
# paleoData_values arrays survived the save/load cycle unchanged.
dupdf_reloaded = utf.load_compact_dataframe_from_csv('dup_test')

reloaded_vals = dupdf_reloaded['paleoData_values'].iloc[0]
print("\n=== AFTER RELOAD ===")
print("Type of paleoData_values[0]:", type(reloaded_vals))
print("Dtype of paleoData_values[0]:", reloaded_vals.dtype)
print("Shape:", reloaded_vals.shape)
print("First 5 values:", reloaded_vals[:5])
print("String repr:", repr(str(reloaded_vals[:5])))

# Direct comparison against the in-memory frame
original_vals = dupdf['paleoData_values'].iloc[0]
print("\n=== COMPARISON ===")
print("Arrays equal?:", np.array_equal(original_vals, reloaded_vals))
print("Arrays allclose?:", np.allclose(original_vals, reloaded_vals,
                                       equal_nan=True))
=== AFTER RELOAD === Type of paleoData_values[0]: <class 'numpy.ndarray'> Dtype of paleoData_values[0]: float32 Shape: (1220,) First 5 values: [-33.32873 -35.6732 -33.1574 -34.2854 -34.4031 ] String repr: '[-33.32873 -35.6732 -33.1574 -34.2854 -34.4031 ]' === COMPARISON === Arrays equal?: True Arrays allclose?: True
In [17]:
Copied!
# Final check: working directory is still the repo root
os.getcwd()
os.getcwd()
Out[17]:
'/home/jupyter-lluecke/dod2k_v2.0/dod2k'
In [ ]:
Copied!