{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Daten deduplizieren"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Beispieldaten laden"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:12:55.220843Z",
"iopub.status.busy": "2026-05-22T14:12:55.220596Z",
"iopub.status.idle": "2026-05-22T14:12:55.440475Z",
"shell.execute_reply": "2026-05-22T14:12:55.440133Z",
"shell.execute_reply.started": "2026-05-22T14:12:55.220825Z"
}
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:12:55.441014Z",
"iopub.status.busy": "2026-05-22T14:12:55.440879Z",
"iopub.status.idle": "2026-05-22T14:12:55.665851Z",
"shell.execute_reply": "2026-05-22T14:12:55.665372Z",
"shell.execute_reply.started": "2026-05-22T14:12:55.441005Z"
}
},
"outputs": [],
"source": [
"customers = pd.read_csv(\n",
" \"https://raw.githubusercontent.com/kjam/data-cleaning-101/master/data/customer_data_duped.csv\",\n",
" encoding=\"utf-8\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Deduplizieren mit pandas"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.1 Überblick"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:12:55.666774Z",
"iopub.status.busy": "2026-05-22T14:12:55.666541Z",
"iopub.status.idle": "2026-05-22T14:12:55.680394Z",
"shell.execute_reply": "2026-05-22T14:12:55.680021Z",
"shell.execute_reply.started": "2026-05-22T14:12:55.666754Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" job | \n",
" company | \n",
" street_address | \n",
" city | \n",
" state | \n",
" email | \n",
" user_name | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Patricia Schaefer | \n",
" Programmer, systems | \n",
" Estrada-Best | \n",
" 398 Paul Drive | \n",
" Christianview | \n",
" Delaware | \n",
" lambdavid@gmail.com | \n",
" ndavidson | \n",
"
\n",
" \n",
" | 1 | \n",
" Olivie Dubois | \n",
" Ingénieur recherche et développement en agroal... | \n",
" Moreno | \n",
" rue Lucas Benard | \n",
" Saint Anastasie-les-Bains | \n",
" AR | \n",
" berthelotjacqueline@mahe.fr | \n",
" manonallain | \n",
"
\n",
" \n",
" | 2 | \n",
" Mary Davies-Kirk | \n",
" Public affairs consultant | \n",
" Baker Ltd | \n",
" Flat 3\\nPugh mews | \n",
" Stanleyfurt | \n",
" ZA | \n",
" middletonconor@hotmail.com | \n",
" colemanmichael | \n",
"
\n",
" \n",
" | 3 | \n",
" Miroslawa Eckbauer | \n",
" Dispensing optician | \n",
" Ladeck GmbH | \n",
" Mijo-Lübs-Straße 12 | \n",
" Neubrandenburg | \n",
" Berlin | \n",
" sophia01@yahoo.de | \n",
" romanjunitz | \n",
"
\n",
" \n",
" | 4 | \n",
" Richard Bauer | \n",
" Accountant, chartered certified | \n",
" Hoffman-Rocha | \n",
" 6541 Rodriguez Wall | \n",
" Carlosmouth | \n",
" Texas | \n",
" tross@jensen-ware.org | \n",
" adam78 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 2075 | \n",
" Maurice Stey | \n",
" Systems developer | \n",
" Linke Margraf GmbH & Co. OHG | \n",
" Laila-Scheibe-Allee 2/0 | \n",
" Luckenwalde | \n",
" Hamburg | \n",
" gutknechtevelyn@niemeier.com | \n",
" dkreusel | \n",
"
\n",
" \n",
" | 2076 | \n",
" Linda Alexander | \n",
" Commrcil horiculuri | \n",
" Webb, Ballald and Vasquel | \n",
" 5594 Persn Ciff | \n",
" Mooneybury | \n",
" Maryland | \n",
" ahleythoa@ail.co | \n",
" kennethrchn | \n",
"
\n",
" \n",
" | 2077 | \n",
" Diane Bailly | \n",
" Pharmacien | \n",
" Voisin | \n",
" 527, rue Dijoux | \n",
" Duval-les-Bains | \n",
" CH | \n",
" aruiz@reynaud.fr | \n",
" dorothee41 | \n",
"
\n",
" \n",
" | 2078 | \n",
" Jorge Riba Cerdán | \n",
" Hotel manager | \n",
" Amador-Diego | \n",
" Rambla de Adriana Barceló 854 Puerta 3 | \n",
" Huesca | \n",
" Asturias | \n",
" manuelamosquera@yahoo.com | \n",
" eugenia17 | \n",
"
\n",
" \n",
" | 2079 | \n",
" Ryan Thompson | \n",
" Brewing technologist | \n",
" Smith-Sullivan | \n",
" 136 Rodriguez Point | \n",
" Bradfordborough | \n",
" North Dakota | \n",
" lcruz@gmail.com | \n",
" cnewton | \n",
"
\n",
" \n",
"
\n",
"
2080 rows × 8 columns
\n",
"
"
],
"text/plain": [
" name job \\\n",
"0 Patricia Schaefer Programmer, systems \n",
"1 Olivie Dubois Ingénieur recherche et développement en agroal... \n",
"2 Mary Davies-Kirk Public affairs consultant \n",
"3 Miroslawa Eckbauer Dispensing optician \n",
"4 Richard Bauer Accountant, chartered certified \n",
"... ... ... \n",
"2075 Maurice Stey Systems developer \n",
"2076 Linda Alexander Commrcil horiculuri \n",
"2077 Diane Bailly Pharmacien \n",
"2078 Jorge Riba Cerdán Hotel manager \n",
"2079 Ryan Thompson Brewing technologist \n",
"\n",
" company street_address \\\n",
"0 Estrada-Best 398 Paul Drive \n",
"1 Moreno rue Lucas Benard \n",
"2 Baker Ltd Flat 3\\nPugh mews \n",
"3 Ladeck GmbH Mijo-Lübs-Straße 12 \n",
"4 Hoffman-Rocha 6541 Rodriguez Wall \n",
"... ... ... \n",
"2075 Linke Margraf GmbH & Co. OHG Laila-Scheibe-Allee 2/0 \n",
"2076 Webb, Ballald and Vasquel 5594 Persn Ciff \n",
"2077 Voisin 527, rue Dijoux \n",
"2078 Amador-Diego Rambla de Adriana Barceló 854 Puerta 3 \n",
"2079 Smith-Sullivan 136 Rodriguez Point \n",
"\n",
" city state email \\\n",
"0 Christianview Delaware lambdavid@gmail.com \n",
"1 Saint Anastasie-les-Bains AR berthelotjacqueline@mahe.fr \n",
"2 Stanleyfurt ZA middletonconor@hotmail.com \n",
"3 Neubrandenburg Berlin sophia01@yahoo.de \n",
"4 Carlosmouth Texas tross@jensen-ware.org \n",
"... ... ... ... \n",
"2075 Luckenwalde Hamburg gutknechtevelyn@niemeier.com \n",
"2076 Mooneybury Maryland ahleythoa@ail.co \n",
"2077 Duval-les-Bains CH aruiz@reynaud.fr \n",
"2078 Huesca Asturias manuelamosquera@yahoo.com \n",
"2079 Bradfordborough North Dakota lcruz@gmail.com \n",
"\n",
" user_name \n",
"0 ndavidson \n",
"1 manonallain \n",
"2 colemanmichael \n",
"3 romanjunitz \n",
"4 adam78 \n",
"... ... \n",
"2075 dkreusel \n",
"2076 kennethrchn \n",
"2077 dorothee41 \n",
"2078 eugenia17 \n",
"2079 cnewton \n",
"\n",
"[2080 rows x 8 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customers"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.2 Datentypen anzeigen\n",
"\n",
"Hierfür verwenden wir [pandas.DataFrame.dtypes](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dtypes.html):"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:12:55.680897Z",
"iopub.status.busy": "2026-05-22T14:12:55.680790Z",
"iopub.status.idle": "2026-05-22T14:12:55.683763Z",
"shell.execute_reply": "2026-05-22T14:12:55.683448Z",
"shell.execute_reply.started": "2026-05-22T14:12:55.680886Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"name object\n",
"job object\n",
"company object\n",
"street_address object\n",
"city object\n",
"state object\n",
"email object\n",
"user_name object\n",
"dtype: object"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customers.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.3 Fehlende Werte ermitteln\n",
"\n",
"[pandas.isnull](https://pandas.pydata.org/docs/reference/api/pandas.isnull.html) zeigt für ein array-ähnliches Objekt an, ob Werte fehlen:\n",
"\n",
"* `NaN` in numerischen Arrays\n",
"* `None` oder `NaN` in Objekt-Arrays\n",
"* `NaT` in [datetimelike](https://pandas.pydata.org/docs/reference/general_functions.html#top-level-dealing-with-datetimelike-data)\n",
"\n",
"> **Siehe auch:**\n",
"> \n",
"> * [notna](https://pandas.pydata.org/docs/reference/api/pandas.notna.html) für die boolesche Umkehrung von [pandas.isna](https://pandas.pydata.org/docs/reference/api/pandas.isna.html)\n",
"> * [Series.isna](https://pandas.pydata.org/docs/reference/api/pandas.Series.isna.html) für die fehlenden Werte in einer Serie\n",
"> * [DataFrame.isna](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isna.html) für die fehlenden Werte in einem DataFrame\n",
"> * [Index.isna](https://pandas.pydata.org/docs/reference/api/pandas.Index.isna.html) für die fehlenden Werte in einem Index"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:12:55.684324Z",
"iopub.status.busy": "2026-05-22T14:12:55.684231Z",
"iopub.status.idle": "2026-05-22T14:12:55.687984Z",
"shell.execute_reply": "2026-05-22T14:12:55.687752Z",
"shell.execute_reply.started": "2026-05-22T14:12:55.684313Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"name 0\n",
"job 0\n",
"company 0\n",
"street_address 0\n",
"city 0\n",
"state 0\n",
"email 0\n",
"user_name 0\n"
]
}
],
"source": [
"for col in customers.columns:\n",
" print(col, customers[col].isna().sum())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.4 Duplizierte Datensätze ermitteln"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:12:55.689726Z",
"iopub.status.busy": "2026-05-22T14:12:55.689619Z",
"iopub.status.idle": "2026-05-22T14:12:55.693508Z",
"shell.execute_reply": "2026-05-22T14:12:55.693212Z",
"shell.execute_reply.started": "2026-05-22T14:12:55.689718Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0 False\n",
"1 False\n",
"2 False\n",
"3 False\n",
"4 False\n",
" ... \n",
"2075 False\n",
"2076 False\n",
"2077 False\n",
"2078 False\n",
"2079 False\n",
"Length: 2080, dtype: bool"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customers.duplicated()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`customers.duplicated()` gibt uns noch nicht den gewünschten Hinweis, ob es doppelte Datensätze gibt. Im Folgenden lassen wir uns alle Datensätze ausgeben, für die `True` zurückgegeben wird:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:12:55.694064Z",
"iopub.status.busy": "2026-05-22T14:12:55.693985Z",
"iopub.status.idle": "2026-05-22T14:12:55.698497Z",
"shell.execute_reply": "2026-05-22T14:12:55.698205Z",
"shell.execute_reply.started": "2026-05-22T14:12:55.694057Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" job | \n",
" company | \n",
" street_address | \n",
" city | \n",
" state | \n",
" email | \n",
" user_name | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [name, job, company, street_address, city, state, email, user_name]\n",
"Index: []"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customers[customers.duplicated()]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Offenbar gibt es keine identischen Datensätze."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.5 Duplizierte Daten löschen\n",
"\n",
"Das Löschen doppelter Datensätze mit `drop_duplicates` sollte demnach nichts ändern und die Anzahl der Datensätze bei 2080 belassen:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:12:55.698882Z",
"iopub.status.busy": "2026-05-22T14:12:55.698807Z",
"iopub.status.idle": "2026-05-22T14:12:55.704620Z",
"shell.execute_reply": "2026-05-22T14:12:55.704347Z",
"shell.execute_reply.started": "2026-05-22T14:12:55.698865Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" job | \n",
" company | \n",
" street_address | \n",
" city | \n",
" state | \n",
" email | \n",
" user_name | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Patricia Schaefer | \n",
" Programmer, systems | \n",
" Estrada-Best | \n",
" 398 Paul Drive | \n",
" Christianview | \n",
" Delaware | \n",
" lambdavid@gmail.com | \n",
" ndavidson | \n",
"
\n",
" \n",
" | 1 | \n",
" Olivie Dubois | \n",
" Ingénieur recherche et développement en agroal... | \n",
" Moreno | \n",
" rue Lucas Benard | \n",
" Saint Anastasie-les-Bains | \n",
" AR | \n",
" berthelotjacqueline@mahe.fr | \n",
" manonallain | \n",
"
\n",
" \n",
" | 2 | \n",
" Mary Davies-Kirk | \n",
" Public affairs consultant | \n",
" Baker Ltd | \n",
" Flat 3\\nPugh mews | \n",
" Stanleyfurt | \n",
" ZA | \n",
" middletonconor@hotmail.com | \n",
" colemanmichael | \n",
"
\n",
" \n",
" | 3 | \n",
" Miroslawa Eckbauer | \n",
" Dispensing optician | \n",
" Ladeck GmbH | \n",
" Mijo-Lübs-Straße 12 | \n",
" Neubrandenburg | \n",
" Berlin | \n",
" sophia01@yahoo.de | \n",
" romanjunitz | \n",
"
\n",
" \n",
" | 4 | \n",
" Richard Bauer | \n",
" Accountant, chartered certified | \n",
" Hoffman-Rocha | \n",
" 6541 Rodriguez Wall | \n",
" Carlosmouth | \n",
" Texas | \n",
" tross@jensen-ware.org | \n",
" adam78 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 2075 | \n",
" Maurice Stey | \n",
" Systems developer | \n",
" Linke Margraf GmbH & Co. OHG | \n",
" Laila-Scheibe-Allee 2/0 | \n",
" Luckenwalde | \n",
" Hamburg | \n",
" gutknechtevelyn@niemeier.com | \n",
" dkreusel | \n",
"
\n",
" \n",
" | 2076 | \n",
" Linda Alexander | \n",
" Commrcil horiculuri | \n",
" Webb, Ballald and Vasquel | \n",
" 5594 Persn Ciff | \n",
" Mooneybury | \n",
" Maryland | \n",
" ahleythoa@ail.co | \n",
" kennethrchn | \n",
"
\n",
" \n",
" | 2077 | \n",
" Diane Bailly | \n",
" Pharmacien | \n",
" Voisin | \n",
" 527, rue Dijoux | \n",
" Duval-les-Bains | \n",
" CH | \n",
" aruiz@reynaud.fr | \n",
" dorothee41 | \n",
"
\n",
" \n",
" | 2078 | \n",
" Jorge Riba Cerdán | \n",
" Hotel manager | \n",
" Amador-Diego | \n",
" Rambla de Adriana Barceló 854 Puerta 3 | \n",
" Huesca | \n",
" Asturias | \n",
" manuelamosquera@yahoo.com | \n",
" eugenia17 | \n",
"
\n",
" \n",
" | 2079 | \n",
" Ryan Thompson | \n",
" Brewing technologist | \n",
" Smith-Sullivan | \n",
" 136 Rodriguez Point | \n",
" Bradfordborough | \n",
" North Dakota | \n",
" lcruz@gmail.com | \n",
" cnewton | \n",
"
\n",
" \n",
"
\n",
"
2080 rows × 8 columns
\n",
"
"
],
"text/plain": [
" name job \\\n",
"0 Patricia Schaefer Programmer, systems \n",
"1 Olivie Dubois Ingénieur recherche et développement en agroal... \n",
"2 Mary Davies-Kirk Public affairs consultant \n",
"3 Miroslawa Eckbauer Dispensing optician \n",
"4 Richard Bauer Accountant, chartered certified \n",
"... ... ... \n",
"2075 Maurice Stey Systems developer \n",
"2076 Linda Alexander Commrcil horiculuri \n",
"2077 Diane Bailly Pharmacien \n",
"2078 Jorge Riba Cerdán Hotel manager \n",
"2079 Ryan Thompson Brewing technologist \n",
"\n",
" company street_address \\\n",
"0 Estrada-Best 398 Paul Drive \n",
"1 Moreno rue Lucas Benard \n",
"2 Baker Ltd Flat 3\\nPugh mews \n",
"3 Ladeck GmbH Mijo-Lübs-Straße 12 \n",
"4 Hoffman-Rocha 6541 Rodriguez Wall \n",
"... ... ... \n",
"2075 Linke Margraf GmbH & Co. OHG Laila-Scheibe-Allee 2/0 \n",
"2076 Webb, Ballald and Vasquel 5594 Persn Ciff \n",
"2077 Voisin 527, rue Dijoux \n",
"2078 Amador-Diego Rambla de Adriana Barceló 854 Puerta 3 \n",
"2079 Smith-Sullivan 136 Rodriguez Point \n",
"\n",
" city state email \\\n",
"0 Christianview Delaware lambdavid@gmail.com \n",
"1 Saint Anastasie-les-Bains AR berthelotjacqueline@mahe.fr \n",
"2 Stanleyfurt ZA middletonconor@hotmail.com \n",
"3 Neubrandenburg Berlin sophia01@yahoo.de \n",
"4 Carlosmouth Texas tross@jensen-ware.org \n",
"... ... ... ... \n",
"2075 Luckenwalde Hamburg gutknechtevelyn@niemeier.com \n",
"2076 Mooneybury Maryland ahleythoa@ail.co \n",
"2077 Duval-les-Bains CH aruiz@reynaud.fr \n",
"2078 Huesca Asturias manuelamosquera@yahoo.com \n",
"2079 Bradfordborough North Dakota lcruz@gmail.com \n",
"\n",
" user_name \n",
"0 ndavidson \n",
"1 manonallain \n",
"2 colemanmichael \n",
"3 romanjunitz \n",
"4 adam78 \n",
"... ... \n",
"2075 dkreusel \n",
"2076 kennethrchn \n",
"2077 dorothee41 \n",
"2078 eugenia17 \n",
"2079 cnewton \n",
"\n",
"[2080 rows x 8 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customers.drop_duplicates()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Nun wollen wir uns diejenigen Datensätze anzeigen lassen, bei denen `user_name` identisch ist:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:12:55.704981Z",
"iopub.status.busy": "2026-05-22T14:12:55.704889Z",
"iopub.status.idle": "2026-05-22T14:12:55.711078Z",
"shell.execute_reply": "2026-05-22T14:12:55.710850Z",
"shell.execute_reply.started": "2026-05-22T14:12:55.704965Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" job | \n",
" company | \n",
" street_address | \n",
" city | \n",
" state | \n",
" email | \n",
" user_name | \n",
"
\n",
" \n",
" \n",
" \n",
" | 337 | \n",
" Aysel Binner | \n",
" Reccig officer | \n",
" Kuhl Kalleww Swifwunw & Co. KGaA | \n",
" Batix-Kanz-Staß 5/4 | \n",
" Fulda | \n",
" Berli | \n",
" frncoise@wgnerco | \n",
" christinefinke | \n",
"
\n",
" \n",
" | 377 | \n",
" Jolanta Rogge | \n",
" Accommodation managr | \n",
" Scholl e.V. | \n",
" Lrchplz 4/6 | \n",
" Mettmnn | \n",
" Thüringen | \n",
" inrharff@yah.d | \n",
" walentinabeier | \n",
"
\n",
" \n",
" | 506 | \n",
" Mrs. Frances Peters | \n",
" Fuiue desie | \n",
" Rsgers, Lawrence and Richards | \n",
" Studio \\nCarpntr kys | \n",
" Wes Simn | \n",
" BO | \n",
" halenewilliams@wilson-sandes.og | \n",
" amy17 | \n",
"
\n",
" \n",
" | 545 | \n",
" Gerhart Krebs MBA. | \n",
" Surgeon | \n",
" Roskoth | \n",
" Kühnertweg 863 | \n",
" Stade | \n",
" Bayern | \n",
" olav44@bolander.de | \n",
" bettyhahn | \n",
"
\n",
" \n",
" | 592 | \n",
" Folkert Gnatz | \n",
" Meteorologist | \n",
" Bolnbach | \n",
" Heinfried-Austermühle-Ring 05 | \n",
" Eilenburg | \n",
" Thüringen | \n",
" jaentschbirgitt@boerner.org | \n",
" francesco44 | \n",
"
\n",
" \n",
" | 633 | \n",
" Manon Jacquot | \n",
" Ingénieur en aéronautique | \n",
" Jacob | \n",
" 8, chemin Éléonore Evrard | \n",
" Marechal-les-Bains | \n",
" AR | \n",
" ilemaitre@voila.fr | \n",
" astrid58 | \n",
"
\n",
" \n",
" | 658 | \n",
" Austin Waller | \n",
" Insurance risk surveyor | \n",
" Sexton Group | \n",
" 11097 Hansen Field | \n",
" Davidmouth | \n",
" Texas | \n",
" christina74@doyle-baker.biz | \n",
" olynn | \n",
"
\n",
" \n",
" | 723 | \n",
" Wanda Moran | \n",
" Solicitor, Scotland | \n",
" Estes PLC | \n",
" 08011 Hernandez Streets Apt. 149 | \n",
" Natalieshire | \n",
" Oregon | \n",
" howardreginald@gmail.com | \n",
" dana91 | \n",
"
\n",
" \n",
" | 762 | \n",
" Charles Russell | \n",
" Scientist, research (physical sciences) | \n",
" Preston-Wilson | \n",
" 6709 Ashley Circle Apt. 309 | \n",
" Danielberg | \n",
" South Dakota | \n",
" nancyescobar@brown.net | \n",
" ruben71 | \n",
"
\n",
" \n",
" | 772 | \n",
" Waltrud Wohlgemut | \n",
" Designer, fashion/clothing | \n",
" Nerger AG | \n",
" Elmar-Ullmann-Allee 6 | \n",
" Schlüchtern | \n",
" Rheinland-Pfalz | \n",
" auch-schlauchindietlind@gmx.de | \n",
" zitakuhl | \n",
"
\n",
" \n",
" | 783 | \n",
" Caroline Mata | \n",
" Engineer, elecrical | \n",
" Grimes Grrur | \n",
" 80157 Whte Alley Sute 79 | \n",
" Soh Mark | \n",
" Iw | \n",
" jared52@aoo.com | \n",
" thomasthompson | \n",
"
\n",
" \n",
" | 889 | \n",
" Ricardo Ripoll Lucena | \n",
" Teevisi camera peratr | \n",
" Luzq Estraqa anq Galinqq | \n",
" Caejón Rosario Viapana 16 | \n",
" Palencia | \n",
" Lgo | \n",
" ev0@oo.com | \n",
" colomerenrique | \n",
"
\n",
" \n",
" | 928 | \n",
" Sophie Letellier du Carpentier | \n",
" Cnucteu e ét | \n",
" Valle7 SARL | \n",
" 3, boulvard Jan Augr | \n",
" Saint Daviddan | \n",
" BS | \n",
" rdorm@dbmi.com | \n",
" anne28 | \n",
"
\n",
" \n",
" | 979 | \n",
" Irene Roda Dávila | \n",
" Eitor, maazine featres | \n",
" Daza Inc | \n",
" Roda Carla Miró 5 | \n",
" Viy | \n",
" La Rioa | \n",
" sldrpére@ps.cm | \n",
" ipeñalver | \n",
"
\n",
" \n",
" | 995 | \n",
" Abigail Hernandez | \n",
" Mechanical engineer | \n",
" Smith Ltd | \n",
" 766 Adrian Ranch | \n",
" Ellismouth | \n",
" Colorado | \n",
" jordan60@gmail.com | \n",
" mendozajody | \n",
"
\n",
" \n",
" | 1015 | \n",
" Mr. Paul Newton | \n",
" Government soa researh offer | \n",
" LemnardmWatsmn | \n",
" Studi 86\\nKaty ill | \n",
" West Jue | \n",
" VE | \n",
" em@mil.cm | \n",
" bbennett | \n",
"
\n",
" \n",
" | 1043 | \n",
" Anna Adams | \n",
" Programmer alcatons | \n",
" Jones Gjoup | \n",
" 22 Kateen ova | \n",
" Noth Joa | \n",
" KZ | \n",
" asleig65@aisay.co | \n",
" lloydann | \n",
"
\n",
" \n",
" | 1052 | \n",
" Aurélie Vidal | \n",
" Magistrat | \n",
" Martins | \n",
" 88, rue Stéphanie Letellier | \n",
" Rouxnec | \n",
" SE | \n",
" boutineric@blin.fr | \n",
" iwagner | \n",
"
\n",
" \n",
" | 1062 | \n",
" Regina Schacht-Kusch | \n",
" Herbalist | \n",
" Hartung GmbH & Co. KGaA | \n",
" Wenke-Hörle-Ring 36 | \n",
" Eggenfelden | \n",
" Sachsen-Anhalt | \n",
" oluebs@troest.de | \n",
" xklotz | \n",
"
\n",
" \n",
" | 1120 | \n",
" Jeffrey Benjamin | \n",
" Publ house manager | \n",
" Chcn Inc | \n",
" 27 Rodgrs Rdgs Apt. 269 | \n",
" Suth Jeffererg | \n",
" Iinois | \n",
" stepanie90@rogers.co | \n",
" lori67 | \n",
"
\n",
" \n",
" | 1170 | \n",
" Julio Agustín Amaya | \n",
" Tax aviser | \n",
" Piñolk Belmonke and Codina | \n",
" Calleón de Gregorio Bustamante 28 Piso 7 | \n",
" La Pala | \n",
" Salamanca | \n",
" usolana@jáuregui-pedraza.om | \n",
" gloriaolmo | \n",
"
\n",
" \n",
" | 1339 | \n",
" Ing. Andrew Schleich B.A. | \n",
" Ln | \n",
" Holt Putz GnR | \n",
" Hugasse 8/8 | \n",
" Hainichn | \n",
" Neersachsen | \n",
" jun@putz.com | \n",
" jesselmaja | \n",
"
\n",
" \n",
" | 1360 | \n",
" Frédérique Lejeune-Daniel | \n",
" Tecce cse | \n",
" Sctmitt | \n",
" chemin Denise Ferrand | \n",
" Saint ChalotteVille | \n",
" IE | \n",
" jchretien@costacom | \n",
" joseph60 | \n",
"
\n",
" \n",
" | 1384 | \n",
" Kenneth Moore | \n",
" Magazine journalist | \n",
" Cross, Bfll anf Diaz | \n",
" 753 Lindsey Pine | \n",
" Thompsonshe | \n",
" Colorao | \n",
" ashey28@rice.co | \n",
" todd72 | \n",
"
\n",
" \n",
" | 1423 | \n",
" Thomas Coulon | \n",
" Collecteur de fonds | \n",
" Levy | \n",
" 91, rue Laetitia Collet | \n",
" Dias-sur-Normand | \n",
" SC | \n",
" deschampsgabriel@guyot.fr | \n",
" michelepetit | \n",
"
\n",
" \n",
" | 1433 | \n",
" Jerry Barnes | \n",
" Tour mner | \n",
" Col-Wllllams | \n",
" 30 Mpy Ovepass | \n",
" Jeiferview | \n",
" Utah | \n",
" insnashl@gas-hais.cm | \n",
" christopher62 | \n",
"
\n",
" \n",
" | 1452 | \n",
" Karen Weeks | \n",
" Psychotherapist, child | \n",
" Rodriguez, Brady and Jackson | \n",
" 233 Kevin Street | \n",
" Larryside | \n",
" Indiana | \n",
" gregg39@hernandez-gomez.com | \n",
" knapprobert | \n",
"
\n",
" \n",
" | 1489 | \n",
" Herr Johann Eigenwillig | \n",
" Immigration officer | \n",
" Süßebier Hänel GmbH | \n",
" Langernplatz 0 | \n",
" Stadtsteinach | \n",
" Thüringen | \n",
" haasemarieluise@noack.com | \n",
" istoll | \n",
"
\n",
" \n",
" | 1544 | \n",
" Pasquale Schwital | \n",
" Trade mark attorney | \n",
" Finke | \n",
" Detlef-Binner-Platz 0/1 | \n",
" Burg | \n",
" Niedersachsen | \n",
" hanne-lore98@gmx.de | \n",
" thomas14 | \n",
"
\n",
" \n",
" | 1557 | \n",
" Stephanie Young | \n",
" Herpetologist | \n",
" Bryant and Sons | \n",
" 5163 Rebecca Creek Suite 421 | \n",
" North Theresaberg | \n",
" Alaska | \n",
" stephenwilliams@summers.com | \n",
" ahawkins | \n",
"
\n",
" \n",
" | 1567 | \n",
" Carolina Reguera Sanz | \n",
" Fam manae | \n",
" Cami77, C7aparr7 a7d N7gu7ra | \n",
" Vil e Imel Oorio 25 | \n",
" Madd | \n",
" Vicaya | \n",
" mordóñ@cámara.info | \n",
" eva16 | \n",
"
\n",
" \n",
" | 1616 | \n",
" Sonia Amores | \n",
" Senir tax prfessina/tax inspectr | \n",
" J5an-Núñez | \n",
" Avnida d Grgorio Manón 344 Prta 8 | \n",
" Ponevedr | \n",
" Lugo | \n",
" icent4@montenero-brroso.info | \n",
" sanmartínguillermo | \n",
"
\n",
" \n",
" | 1647 | \n",
" Juan Carlos Iker Boix Ros | \n",
" Pre phtgrapher | \n",
" Pont, P44om4r4s 4nd Arjon4 | \n",
" Pasadzo de Josep Bentez Pso | \n",
" Las Palmas | \n",
" Mia | \n",
" srgio24@gail.co | \n",
" luis-miguel23 | \n",
"
\n",
" \n",
" | 1652 | \n",
" Jörg Henschel | \n",
" Chaity office | \n",
" Schicke AG | \n",
" HennyLorchRng 484 | \n",
" Hohensein-Ensh | \n",
" BadenWürtteberg | \n",
" huerhes@hmal.de | \n",
" anne-katrin51 | \n",
"
\n",
" \n",
" | 1703 | \n",
" Marc Tate | \n",
" Ship broker | \n",
" Wagner, Mitchell and Grimes | \n",
" 721 Christopher View Suite 840 | \n",
" Watsonmouth | \n",
" Connecticut | \n",
" chenjessica@hotmail.com | \n",
" patricia34 | \n",
"
\n",
" \n",
" | 1707 | \n",
" Joseph Hines | \n",
" Pyhiatri nre | \n",
" Cr4ig, G4rci4 4nd Rich4rds | \n",
" 85663 Savage Gles | \n",
" Mcgeeon | \n",
" Als | \n",
" bcaldern@htmail.cm | \n",
" emilytorres | \n",
"
\n",
" \n",
" | 1722 | \n",
" Julie Baldwin | \n",
" Set deigner | \n",
" W5ll55mson-G5rz5 | \n",
" 58513 Paricia Res Suie 45 | \n",
" So Me | \n",
" Alaska | \n",
" diuez@uess. | \n",
" cmoss | \n",
"
\n",
" \n",
" | 1759 | \n",
" Sarah Hoffman | \n",
" Exhibitin designe | \n",
" Hensont Wiley and Ryan | \n",
" 9490 Curts Spur Sute 82 | \n",
" Jseptwn | \n",
" Arizona | \n",
" ncole@yahoo.com | \n",
" csmith | \n",
"
\n",
" \n",
" | 1796 | \n",
" Valentine Devaux-Roger | \n",
" Direceur d'ôial | \n",
" Leiris | \n",
" 57, enue de Gros | \n",
" BenadBou | \n",
" AL | \n",
" rogrlro@munoz.om | \n",
" xherve | \n",
"
\n",
" \n",
" | 1809 | \n",
" Slavica Seidel | \n",
" Psychotherapist, child | \n",
" Wulff Hande KG | \n",
" Preißgasse 0/4 | \n",
" Soest | \n",
" Rheinland-Pfalz | \n",
" tloos@krause.net | \n",
" abien | \n",
"
\n",
" \n",
" | 1820 | \n",
" Wenke Schweitzer | \n",
" Enginr, automoti | \n",
" Wesa4k KG | \n",
" Eies. 7 | \n",
" Ba Lnwra | \n",
" Thürige | \n",
" rsthveriue@mies.rg | \n",
" kwernecke | \n",
"
\n",
" \n",
" | 1829 | \n",
" Dr. Thomas Hein | \n",
" Copy | \n",
" Geisel | \n",
" Ladeckgasse 11 | \n",
" Rockenhausen | \n",
" Nordrhein-Westfalen | \n",
" grein-grotharnim@kallert.de | \n",
" siegmar08 | \n",
"
\n",
" \n",
" | 1837 | \n",
" Andrew Hart | \n",
" Engineer, civil (contracting) | \n",
" Barnett LLC | \n",
" 258 Day Hollow Suite 410 | \n",
" Kimberlyhaven | \n",
" Colorado | \n",
" brandy00@yahoo.com | \n",
" amy30 | \n",
"
\n",
" \n",
" | 1914 | \n",
" Shelby Fowler | \n",
" Air traffic controller | \n",
" Fields-Sanchez | \n",
" 533 Fitzpatrick Bypass | \n",
" Francesberg | \n",
" Michigan | \n",
" terrystephen@anderson.org | \n",
" gcain | \n",
"
\n",
" \n",
" | 1938 | \n",
" Susan Aubry | \n",
" Directeur d'agence bancaire | \n",
" Payet Georges S.A.S. | \n",
" 67, rue Inès Valentin | \n",
" Nicolas | \n",
" FI | \n",
" milletedith@sfr.fr | \n",
" tthierry | \n",
"
\n",
" \n",
" | 1948 | \n",
" Richard Karge-Kobelt | \n",
" Junalist maaine | \n",
" Abberb Keubeb AG | \n",
" Mitschkeee 8 | \n",
" Mß | \n",
" SachsnAnhalt | \n",
" nrejwgner@gmx.e | \n",
" muehlehenni | \n",
"
\n",
" \n",
" | 1960 | \n",
" Anna de Lobato | \n",
" Medcl techcl ocer | \n",
" Maciag PLC | \n",
" Calleón de Dolore Parea 21 At 7 | \n",
" Palncia | \n",
" Cantaria | \n",
" vázqzlornzo@al.om | \n",
" daniel70 | \n",
"
\n",
" \n",
" | 1968 | \n",
" Zoltan Wähner B.A. | \n",
" Professor Emerits | \n",
" Th8e8 | \n",
" Stotr. 1 | \n",
" Saulgau | \n",
" Shlsg-Holst | \n",
" arlenpruschke@salz.or | \n",
" kklemm | \n",
"
\n",
" \n",
" | 1995 | \n",
" Kenneth Dunn | \n",
" Programmer, systems | \n",
" Leonard Inc | \n",
" 5361 Patterson Mission Suite 504 | \n",
" Villaburgh | \n",
" Rhode Island | \n",
" kristen54@gmail.com | \n",
" jkent | \n",
"
\n",
" \n",
" | 2010 | \n",
" Gertraude Schomber | \n",
" Insurance risk surveyor | \n",
" Bruder | \n",
" Christa-Ullrich-Allee 0/1 | \n",
" Schwäbisch Hall | \n",
" Hessen | \n",
" gumprichalice@schmidt.de | \n",
" fruppert | \n",
"
\n",
" \n",
" | 2075 | \n",
" Maurice Stey | \n",
" Systems developer | \n",
" Linke Margraf GmbH & Co. OHG | \n",
" Laila-Scheibe-Allee 2/0 | \n",
" Luckenwalde | \n",
" Hamburg | \n",
" gutknechtevelyn@niemeier.com | \n",
" dkreusel | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name job \\\n",
"337 Aysel Binner Reccig officer \n",
"377 Jolanta Rogge Accommodation managr \n",
"506 Mrs. Frances Peters Fuiue desie \n",
"545 Gerhart Krebs MBA. Surgeon \n",
"592 Folkert Gnatz Meteorologist \n",
"633 Manon Jacquot Ingénieur en aéronautique \n",
"658 Austin Waller Insurance risk surveyor \n",
"723 Wanda Moran Solicitor, Scotland \n",
"762 Charles Russell Scientist, research (physical sciences) \n",
"772 Waltrud Wohlgemut Designer, fashion/clothing \n",
"783 Caroline Mata Engineer, elecrical \n",
"889 Ricardo Ripoll Lucena Teevisi camera peratr \n",
"928 Sophie Letellier du Carpentier Cnucteu e ét \n",
"979 Irene Roda Dávila Eitor, maazine featres \n",
"995 Abigail Hernandez Mechanical engineer \n",
"1015 Mr. Paul Newton Government soa researh offer \n",
"1043 Anna Adams Programmer alcatons \n",
"1052 Aurélie Vidal Magistrat \n",
"1062 Regina Schacht-Kusch Herbalist \n",
"1120 Jeffrey Benjamin Publ house manager \n",
"1170 Julio Agustín Amaya Tax aviser \n",
"1339 Ing. Andrew Schleich B.A. Ln \n",
"1360 Frédérique Lejeune-Daniel Tecce cse \n",
"1384 Kenneth Moore Magazine journalist \n",
"1423 Thomas Coulon Collecteur de fonds \n",
"1433 Jerry Barnes Tour mner \n",
"1452 Karen Weeks Psychotherapist, child \n",
"1489 Herr Johann Eigenwillig Immigration officer \n",
"1544 Pasquale Schwital Trade mark attorney \n",
"1557 Stephanie Young Herpetologist \n",
"1567 Carolina Reguera Sanz Fam manae \n",
"1616 Sonia Amores Senir tax prfessina/tax inspectr \n",
"1647 Juan Carlos Iker Boix Ros Pre phtgrapher \n",
"1652 Jörg Henschel Chaity office \n",
"1703 Marc Tate Ship broker \n",
"1707 Joseph Hines Pyhiatri nre \n",
"1722 Julie Baldwin Set deigner \n",
"1759 Sarah Hoffman Exhibitin designe \n",
"1796 Valentine Devaux-Roger Direceur d'ôial \n",
"1809 Slavica Seidel Psychotherapist, child \n",
"1820 Wenke Schweitzer Enginr, automoti \n",
"1829 Dr. Thomas Hein Copy \n",
"1837 Andrew Hart Engineer, civil (contracting) \n",
"1914 Shelby Fowler Air traffic controller \n",
"1938 Susan Aubry Directeur d'agence bancaire \n",
"1948 Richard Karge-Kobelt Junalist maaine \n",
"1960 Anna de Lobato Medcl techcl ocer \n",
"1968 Zoltan Wähner B.A. Professor Emerits \n",
"1995 Kenneth Dunn Programmer, systems \n",
"2010 Gertraude Schomber Insurance risk surveyor \n",
"2075 Maurice Stey Systems developer \n",
"\n",
" company \\\n",
"337 Kuhl Kalleww Swifwunw & Co. KGaA \n",
"377 Scholl e.V. \n",
"506 Rsgers, Lawrence and Richards \n",
"545 Roskoth \n",
"592 Bolnbach \n",
"633 Jacob \n",
"658 Sexton Group \n",
"723 Estes PLC \n",
"762 Preston-Wilson \n",
"772 Nerger AG \n",
"783 Grimes Grrur \n",
"889 Luzq Estraqa anq Galinqq \n",
"928 Valle7 SARL \n",
"979 Daza Inc \n",
"995 Smith Ltd \n",
"1015 LemnardmWatsmn \n",
"1043 Jones Gjoup \n",
"1052 Martins \n",
"1062 Hartung GmbH & Co. KGaA \n",
"1120 Chcn Inc \n",
"1170 Piñolk Belmonke and Codina \n",
"1339 Holt Putz GnR \n",
"1360 Sctmitt \n",
"1384 Cross, Bfll anf Diaz \n",
"1423 Levy \n",
"1433 Col-Wllllams \n",
"1452 Rodriguez, Brady and Jackson \n",
"1489 Süßebier Hänel GmbH \n",
"1544 Finke \n",
"1557 Bryant and Sons \n",
"1567 Cami77, C7aparr7 a7d N7gu7ra \n",
"1616 J5an-Núñez \n",
"1647 Pont, P44om4r4s 4nd Arjon4 \n",
"1652 Schicke AG \n",
"1703 Wagner, Mitchell and Grimes \n",
"1707 Cr4ig, G4rci4 4nd Rich4rds \n",
"1722 W5ll55mson-G5rz5 \n",
"1759 Hensont Wiley and Ryan \n",
"1796 Leiris \n",
"1809 Wulff Hande KG \n",
"1820 Wesa4k KG \n",
"1829 Geisel \n",
"1837 Barnett LLC \n",
"1914 Fields-Sanchez \n",
"1938 Payet Georges S.A.S. \n",
"1948 Abberb Keubeb AG \n",
"1960 Maciag PLC \n",
"1968 Th8e8 \n",
"1995 Leonard Inc \n",
"2010 Bruder \n",
"2075 Linke Margraf GmbH & Co. OHG \n",
"\n",
" street_address city \\\n",
"337 Batix-Kanz-Staß 5/4 Fulda \n",
"377 Lrchplz 4/6 Mettmnn \n",
"506 Studio \\nCarpntr kys Wes Simn \n",
"545 Kühnertweg 863 Stade \n",
"592 Heinfried-Austermühle-Ring 05 Eilenburg \n",
"633 8, chemin Éléonore Evrard Marechal-les-Bains \n",
"658 11097 Hansen Field Davidmouth \n",
"723 08011 Hernandez Streets Apt. 149 Natalieshire \n",
"762 6709 Ashley Circle Apt. 309 Danielberg \n",
"772 Elmar-Ullmann-Allee 6 Schlüchtern \n",
"783 80157 Whte Alley Sute 79 Soh Mark \n",
"889 Caejón Rosario Viapana 16 Palencia \n",
"928 3, boulvard Jan Augr Saint Daviddan \n",
"979 Roda Carla Miró 5 Viy \n",
"995 766 Adrian Ranch Ellismouth \n",
"1015 Studi 86\\nKaty ill West Jue \n",
"1043 22 Kateen ova Noth Joa \n",
"1052 88, rue Stéphanie Letellier Rouxnec \n",
"1062 Wenke-Hörle-Ring 36 Eggenfelden \n",
"1120 27 Rodgrs Rdgs Apt. 269 Suth Jeffererg \n",
"1170 Calleón de Gregorio Bustamante 28 Piso 7 La Pala \n",
"1339 Hugasse 8/8 Hainichn \n",
"1360 chemin Denise Ferrand Saint ChalotteVille \n",
"1384 753 Lindsey Pine Thompsonshe \n",
"1423 91, rue Laetitia Collet Dias-sur-Normand \n",
"1433 30 Mpy Ovepass Jeiferview \n",
"1452 233 Kevin Street Larryside \n",
"1489 Langernplatz 0 Stadtsteinach \n",
"1544 Detlef-Binner-Platz 0/1 Burg \n",
"1557 5163 Rebecca Creek Suite 421 North Theresaberg \n",
"1567 Vil e Imel Oorio 25 Madd \n",
"1616 Avnida d Grgorio Manón 344 Prta 8 Ponevedr \n",
"1647 Pasadzo de Josep Bentez Pso Las Palmas \n",
"1652 HennyLorchRng 484 Hohensein-Ensh \n",
"1703 721 Christopher View Suite 840 Watsonmouth \n",
"1707 85663 Savage Gles Mcgeeon \n",
"1722 58513 Paricia Res Suie 45 So Me \n",
"1759 9490 Curts Spur Sute 82 Jseptwn \n",
"1796 57, enue de Gros BenadBou \n",
"1809 Preißgasse 0/4 Soest \n",
"1820 Eies. 7 Ba Lnwra \n",
"1829 Ladeckgasse 11 Rockenhausen \n",
"1837 258 Day Hollow Suite 410 Kimberlyhaven \n",
"1914 533 Fitzpatrick Bypass Francesberg \n",
"1938 67, rue Inès Valentin Nicolas \n",
"1948 Mitschkeee 8 Mß \n",
"1960 Calleón de Dolore Parea 21 At 7 Palncia \n",
"1968 Stotr. 1 Saulgau \n",
"1995 5361 Patterson Mission Suite 504 Villaburgh \n",
"2010 Christa-Ullrich-Allee 0/1 Schwäbisch Hall \n",
"2075 Laila-Scheibe-Allee 2/0 Luckenwalde \n",
"\n",
" state email user_name \n",
"337 Berli frncoise@wgnerco christinefinke \n",
"377 Thüringen inrharff@yah.d walentinabeier \n",
"506 BO halenewilliams@wilson-sandes.og amy17 \n",
"545 Bayern olav44@bolander.de bettyhahn \n",
"592 Thüringen jaentschbirgitt@boerner.org francesco44 \n",
"633 AR ilemaitre@voila.fr astrid58 \n",
"658 Texas christina74@doyle-baker.biz olynn \n",
"723 Oregon howardreginald@gmail.com dana91 \n",
"762 South Dakota nancyescobar@brown.net ruben71 \n",
"772 Rheinland-Pfalz auch-schlauchindietlind@gmx.de zitakuhl \n",
"783 Iw jared52@aoo.com thomasthompson \n",
"889 Lgo ev0@oo.com colomerenrique \n",
"928 BS rdorm@dbmi.com anne28 \n",
"979 La Rioa sldrpére@ps.cm ipeñalver \n",
"995 Colorado jordan60@gmail.com mendozajody \n",
"1015 VE em@mil.cm bbennett \n",
"1043 KZ asleig65@aisay.co lloydann \n",
"1052 SE boutineric@blin.fr iwagner \n",
"1062 Sachsen-Anhalt oluebs@troest.de xklotz \n",
"1120 Iinois stepanie90@rogers.co lori67 \n",
"1170 Salamanca usolana@jáuregui-pedraza.om gloriaolmo \n",
"1339 Neersachsen jun@putz.com jesselmaja \n",
"1360 IE jchretien@costacom joseph60 \n",
"1384 Colorao ashey28@rice.co todd72 \n",
"1423 SC deschampsgabriel@guyot.fr michelepetit \n",
"1433 Utah insnashl@gas-hais.cm christopher62 \n",
"1452 Indiana gregg39@hernandez-gomez.com knapprobert \n",
"1489 Thüringen haasemarieluise@noack.com istoll \n",
"1544 Niedersachsen hanne-lore98@gmx.de thomas14 \n",
"1557 Alaska stephenwilliams@summers.com ahawkins \n",
"1567 Vicaya mordóñ@cámara.info eva16 \n",
"1616 Lugo icent4@montenero-brroso.info sanmartínguillermo \n",
"1647 Mia srgio24@gail.co luis-miguel23 \n",
"1652 BadenWürtteberg huerhes@hmal.de anne-katrin51 \n",
"1703 Connecticut chenjessica@hotmail.com patricia34 \n",
"1707 Als bcaldern@htmail.cm emilytorres \n",
"1722 Alaska diuez@uess. cmoss \n",
"1759 Arizona ncole@yahoo.com csmith \n",
"1796 AL rogrlro@munoz.om xherve \n",
"1809 Rheinland-Pfalz tloos@krause.net abien \n",
"1820 Thürige rsthveriue@mies.rg kwernecke \n",
"1829 Nordrhein-Westfalen grein-grotharnim@kallert.de siegmar08 \n",
"1837 Colorado brandy00@yahoo.com amy30 \n",
"1914 Michigan terrystephen@anderson.org gcain \n",
"1938 FI milletedith@sfr.fr tthierry \n",
"1948 SachsnAnhalt nrejwgner@gmx.e muehlehenni \n",
"1960 Cantaria vázqzlornzo@al.om daniel70 \n",
"1968 Shlsg-Holst arlenpruschke@salz.or kklemm \n",
"1995 Rhode Island kristen54@gmail.com jkent \n",
"2010 Hessen gumprichalice@schmidt.de fruppert \n",
"2075 Hamburg gutknechtevelyn@niemeier.com dkreusel "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customers[customers.duplicated([\"user_name\"])]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Nun können wir uns die zugehörigen Datensätze anzeigen lassen, z.B. mit:"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:12:55.711459Z",
"iopub.status.busy": "2026-05-22T14:12:55.711373Z",
"iopub.status.idle": "2026-05-22T14:12:55.714872Z",
"shell.execute_reply": "2026-05-22T14:12:55.714655Z",
"shell.execute_reply.started": "2026-05-22T14:12:55.711452Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" job | \n",
" company | \n",
" street_address | \n",
" city | \n",
" state | \n",
" email | \n",
" user_name | \n",
"
\n",
" \n",
" \n",
" \n",
" | 236 | \n",
" Aysel Binner | \n",
" Recycling officer | \n",
" Kuhl Kallert Stiftung & Co. KGaA | \n",
" Beatrix-Kranz-Straße 5/4 | \n",
" Fulda | \n",
" Berlin | \n",
" francoise22@wagner.com | \n",
" christinefinke | \n",
"
\n",
" \n",
" | 337 | \n",
" Aysel Binner | \n",
" Reccig officer | \n",
" Kuhl Kalleww Swifwunw & Co. KGaA | \n",
" Batix-Kanz-Staß 5/4 | \n",
" Fulda | \n",
" Berli | \n",
" frncoise@wgnerco | \n",
" christinefinke | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name job company \\\n",
"236 Aysel Binner Recycling officer Kuhl Kallert Stiftung & Co. KGaA \n",
"337 Aysel Binner Reccig officer Kuhl Kalleww Swifwunw & Co. KGaA \n",
"\n",
" street_address city state email \\\n",
"236 Beatrix-Kranz-Straße 5/4 Fulda Berlin francoise22@wagner.com \n",
"337 Batix-Kanz-Staß 5/4 Fulda Berli frncoise@wgnerco \n",
"\n",
" user_name \n",
"236 christinefinke \n",
"337 christinefinke "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Boolean mask on user_name selects every record that shares this login\n",
"customers.loc[customers[\"user_name\"].eq(\"christinefinke\")]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Schließlich können wir diejenigen Datensätze löschen, deren `user_name` identisch ist:"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:12:55.715213Z",
"iopub.status.busy": "2026-05-22T14:12:55.715122Z",
"iopub.status.idle": "2026-05-22T14:12:55.719757Z",
"shell.execute_reply": "2026-05-22T14:12:55.719466Z",
"shell.execute_reply.started": "2026-05-22T14:12:55.715204Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" job | \n",
" company | \n",
" street_address | \n",
" city | \n",
" state | \n",
" email | \n",
" user_name | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Patricia Schaefer | \n",
" Programmer, systems | \n",
" Estrada-Best | \n",
" 398 Paul Drive | \n",
" Christianview | \n",
" Delaware | \n",
" lambdavid@gmail.com | \n",
" ndavidson | \n",
"
\n",
" \n",
" | 1 | \n",
" Olivie Dubois | \n",
" Ingénieur recherche et développement en agroal... | \n",
" Moreno | \n",
" rue Lucas Benard | \n",
" Saint Anastasie-les-Bains | \n",
" AR | \n",
" berthelotjacqueline@mahe.fr | \n",
" manonallain | \n",
"
\n",
" \n",
" | 2 | \n",
" Mary Davies-Kirk | \n",
" Public affairs consultant | \n",
" Baker Ltd | \n",
" Flat 3\\nPugh mews | \n",
" Stanleyfurt | \n",
" ZA | \n",
" middletonconor@hotmail.com | \n",
" colemanmichael | \n",
"
\n",
" \n",
" | 3 | \n",
" Miroslawa Eckbauer | \n",
" Dispensing optician | \n",
" Ladeck GmbH | \n",
" Mijo-Lübs-Straße 12 | \n",
" Neubrandenburg | \n",
" Berlin | \n",
" sophia01@yahoo.de | \n",
" romanjunitz | \n",
"
\n",
" \n",
" | 4 | \n",
" Richard Bauer | \n",
" Accountant, chartered certified | \n",
" Hoffman-Rocha | \n",
" 6541 Rodriguez Wall | \n",
" Carlosmouth | \n",
" Texas | \n",
" tross@jensen-ware.org | \n",
" adam78 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 2074 | \n",
" Rhonda James | \n",
" Recruitment consultant | \n",
" Turner, Bradley and Scott | \n",
" 28382 Stokes Expressway | \n",
" Port Gabrielaport | \n",
" New Hampshire | \n",
" zroberts@hotmail.com | \n",
" heathscott | \n",
"
\n",
" \n",
" | 2076 | \n",
" Linda Alexander | \n",
" Commrcil horiculuri | \n",
" Webb, Ballald and Vasquel | \n",
" 5594 Persn Ciff | \n",
" Mooneybury | \n",
" Maryland | \n",
" ahleythoa@ail.co | \n",
" kennethrchn | \n",
"
\n",
" \n",
" | 2077 | \n",
" Diane Bailly | \n",
" Pharmacien | \n",
" Voisin | \n",
" 527, rue Dijoux | \n",
" Duval-les-Bains | \n",
" CH | \n",
" aruiz@reynaud.fr | \n",
" dorothee41 | \n",
"
\n",
" \n",
" | 2078 | \n",
" Jorge Riba Cerdán | \n",
" Hotel manager | \n",
" Amador-Diego | \n",
" Rambla de Adriana Barceló 854 Puerta 3 | \n",
" Huesca | \n",
" Asturias | \n",
" manuelamosquera@yahoo.com | \n",
" eugenia17 | \n",
"
\n",
" \n",
" | 2079 | \n",
" Ryan Thompson | \n",
" Brewing technologist | \n",
" Smith-Sullivan | \n",
" 136 Rodriguez Point | \n",
" Bradfordborough | \n",
" North Dakota | \n",
" lcruz@gmail.com | \n",
" cnewton | \n",
"
\n",
" \n",
"
\n",
"
2029 rows × 8 columns
\n",
"
"
],
"text/plain": [
" name job \\\n",
"0 Patricia Schaefer Programmer, systems \n",
"1 Olivie Dubois Ingénieur recherche et développement en agroal... \n",
"2 Mary Davies-Kirk Public affairs consultant \n",
"3 Miroslawa Eckbauer Dispensing optician \n",
"4 Richard Bauer Accountant, chartered certified \n",
"... ... ... \n",
"2074 Rhonda James Recruitment consultant \n",
"2076 Linda Alexander Commrcil horiculuri \n",
"2077 Diane Bailly Pharmacien \n",
"2078 Jorge Riba Cerdán Hotel manager \n",
"2079 Ryan Thompson Brewing technologist \n",
"\n",
" company street_address \\\n",
"0 Estrada-Best 398 Paul Drive \n",
"1 Moreno rue Lucas Benard \n",
"2 Baker Ltd Flat 3\\nPugh mews \n",
"3 Ladeck GmbH Mijo-Lübs-Straße 12 \n",
"4 Hoffman-Rocha 6541 Rodriguez Wall \n",
"... ... ... \n",
"2074 Turner, Bradley and Scott 28382 Stokes Expressway \n",
"2076 Webb, Ballald and Vasquel 5594 Persn Ciff \n",
"2077 Voisin 527, rue Dijoux \n",
"2078 Amador-Diego Rambla de Adriana Barceló 854 Puerta 3 \n",
"2079 Smith-Sullivan 136 Rodriguez Point \n",
"\n",
" city state email \\\n",
"0 Christianview Delaware lambdavid@gmail.com \n",
"1 Saint Anastasie-les-Bains AR berthelotjacqueline@mahe.fr \n",
"2 Stanleyfurt ZA middletonconor@hotmail.com \n",
"3 Neubrandenburg Berlin sophia01@yahoo.de \n",
"4 Carlosmouth Texas tross@jensen-ware.org \n",
"... ... ... ... \n",
"2074 Port Gabrielaport New Hampshire zroberts@hotmail.com \n",
"2076 Mooneybury Maryland ahleythoa@ail.co \n",
"2077 Duval-les-Bains CH aruiz@reynaud.fr \n",
"2078 Huesca Asturias manuelamosquera@yahoo.com \n",
"2079 Bradfordborough North Dakota lcruz@gmail.com \n",
"\n",
" user_name \n",
"0 ndavidson \n",
"1 manonallain \n",
"2 colemanmichael \n",
"3 romanjunitz \n",
"4 adam78 \n",
"... ... \n",
"2074 heathscott \n",
"2076 kennethrchn \n",
"2077 dorothee41 \n",
"2078 eugenia17 \n",
"2079 cnewton \n",
"\n",
"[2029 rows x 8 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keeps the first record per user_name and returns a new frame (no inplace modification)\n",
"customers.drop_duplicates(subset=[\"user_name\"], keep=\"first\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Dies löschte 51 Datensätze."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Dedupe \n",
"\n",
"Alternativ können wir die duplizierten Daten mit der [Dedupe](https://docs.dedupe.io/en/latest/)-Bibliothek erkennen, die ein flaches neuronales Netzwerk verwendet, um aus wenigen Trainingsdaten zu lernen.\n",
"\n",
"\n",
"\n",
"**Siehe auch:**\n",
"\n",
"[csvdedupe](https://github.com/dedupeio/csvdedupe) bietet ein Kommandozeilenwerkzeug für Dedupe.\n",
"
\n",
"\n",
"Zudem haben dieselben Entwickler\\*innen [parserator](https://github.com/datamade/parserator) erstellt, mit dem ihr Textfunktionen extrahieren und eure eigene Textextraktion trainieren könnt."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3.1 Dedupe konfigurieren\n",
"\n",
"Nun definieren wir die Felder, auf die bei der Deduplizierung geachtet werden soll und erstellen ein neues `deduper`-Objekt:"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:12:55.720332Z",
"iopub.status.busy": "2026-05-22T14:12:55.720176Z",
"iopub.status.idle": "2026-05-22T14:12:56.685408Z",
"shell.execute_reply": "2026-05-22T14:12:56.684767Z",
"shell.execute_reply.started": "2026-05-22T14:12:55.720320Z"
}
},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"import dedupe\n",
"\n",
"\n",
"customers = pd.read_csv(\n",
" \"https://raw.githubusercontent.com/kjam/data-cleaning-101/master/data/customer_data_duped.csv\",\n",
" encoding=\"utf-8\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:12:56.685908Z",
"iopub.status.busy": "2026-05-22T14:12:56.685829Z",
"iopub.status.idle": "2026-05-22T14:12:56.688615Z",
"shell.execute_reply": "2026-05-22T14:12:56.688272Z",
"shell.execute_reply.started": "2026-05-22T14:12:56.685900Z"
}
},
"outputs": [],
"source": [
"# One string comparator per customer column, in the column order of the CSV\n",
"variables = [\n",
"    dedupe.variables.String(field)\n",
"    for field in (\n",
"        \"name\",\n",
"        \"job\",\n",
"        \"company\",\n",
"        \"street_address\",\n",
"        \"city\",\n",
"        \"state\",\n",
"        \"email\",\n",
"        \"user_name\",\n",
"    )\n",
"]\n",
"\n",
"deduper = dedupe.Dedupe(variables)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Wenn der Wert eines Feldes fehlt, sollte dieser fehlende Wert als `None`-Objekt dargestellt werden. Durch `has_missing=True` in der Variablendefinition wird jedoch ein neues, zusätzliches Feld erstellt, das angibt, ob die Daten vorhanden waren oder nicht, und die fehlenden Daten werden mit Null versehen.\n",
"\n",
"\n",
"\n",
"**Siehe auch**\n",
"\n",
"* [Missing Data](https://docs.dedupe.io/en/latest/Variable-definition.html#missing-data)\n",
"
"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:12:56.689086Z",
"iopub.status.busy": "2026-05-22T14:12:56.689007Z",
"iopub.status.idle": "2026-05-22T14:12:56.691399Z",
"shell.execute_reply": "2026-05-22T14:12:56.691127Z",
"shell.execute_reply.started": "2026-05-22T14:12:56.689079Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"deduper"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:12:56.691964Z",
"iopub.status.busy": "2026-05-22T14:12:56.691890Z",
"iopub.status.idle": "2026-05-22T14:12:56.694540Z",
"shell.execute_reply": "2026-05-22T14:12:56.694192Z",
"shell.execute_reply.started": "2026-05-22T14:12:56.691956Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(2080, 8)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customers.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Trainingsdaten erstellen"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:12:56.695051Z",
"iopub.status.busy": "2026-05-22T14:12:56.694949Z",
"iopub.status.idle": "2026-05-22T14:13:16.317069Z",
"shell.execute_reply": "2026-05-22T14:13:16.316763Z",
"shell.execute_reply.started": "2026-05-22T14:12:56.695044Z"
}
},
"outputs": [],
"source": [
"deduper.prepare_training(customers.T.to_dict())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[prepare_training](https://docs.dedupe.io/en/latest/API-documentation.html#dedupe.Dedupe.prepare_training) initialisiert das aktive Lernen mit unseren Daten und, optional, mit vorhandenen Trainingsdaten.\n",
"\n",
"`T` spiegelt den DataFrame über seine Diagonale, indem Zeilen als Spalten geschrieben werden und umgekehrt. Hierfür wird [pandas.DataFrame.transpose](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.transpose.html) verwendet."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Aktives Lernen\n",
"\n",
"Mit [dedupe.console_label](https://docs.dedupe.io/en/latest/API-documentation.html#dedupe.console_label) könnt ihr eure Dedupe-Instanz trainieren. Wenn Dedupe ein Datensatzpaar findet, werdet ihr gebeten, es als Duplikat zu kennzeichnen. Ihr könnt hierfür die Tasten `y`, `n` und `u` verwenden, um Duplikate zu kennzeichnen. Drückt `f`, wenn ihr fertig seid."
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:13:16.317629Z",
"iopub.status.busy": "2026-05-22T14:13:16.317532Z",
"iopub.status.idle": "2026-05-22T14:14:10.649205Z",
"shell.execute_reply": "2026-05-22T14:14:10.648763Z",
"shell.execute_reply.started": "2026-05-22T14:13:16.317621Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"name : Patricia Clavero Alemany\n",
"job : Scientific laboratory technician\n",
"company : Galindo, Barros and Echevarría\n",
"street_address : Vial de Nicolás Morata 18\n",
"city : Castellón\n",
"state : Baleares\n",
"email : yblanes@gmail.com\n",
"user_name : xpalau\n",
"\n",
"name : Patricia Clavero Alemany\n",
"job : Scientiic laorator technician\n",
"company : Gglindo, Bgrros gnd Eghevgrríg\n",
"street_address : Vial de Nicolás Moraa 18\n",
"city : Castllón\n",
"state : Baeares\n",
"email : yblane@gmail.om\n",
"user_name : xalau\n",
"\n",
"0/10 positive, 0/10 negative\n",
"Do these records refer to the same thing?\n",
"(y)es / (n)o / (u)nsure / (f)inished\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
" y\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"name : Emily Ferguson\n",
"job : Network engineer\n",
"company : Weiss-Chase\n",
"street_address : 8572 Sara Greens\n",
"city : Rileyfurt\n",
"state : New Mexico\n",
"email : stephanieblackwell@hotmail.com\n",
"user_name : marcmiller\n",
"\n",
"name : Emily Ferguson\n",
"job : Netwr egieer\n",
"company : WeisswCwwse\n",
"street_address : 872 Sara Grees\n",
"city : Rileyurt\n",
"state : New Mexico\n",
"email : stephanelackwell@hotmalcom\n",
"user_name : macmille\n",
"\n",
"1/10 positive, 0/10 negative\n",
"Do these records refer to the same thing?\n",
"(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
" y\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"name : Marissa Hale\n",
"job : Finncil conroller\n",
"company : GqayqWatts\n",
"street_address : 4600 Huse Neck Apt. 06\n",
"city : North Robrt\n",
"state : Idah\n",
"email : gilbetgego@otmilcom\n",
"user_name : ulieirin\n",
"\n",
"name : Marissa Hale\n",
"job : Financial controller\n",
"company : Gray-Watts\n",
"street_address : 24600 House Neck Apt. 096\n",
"city : North Robert\n",
"state : Idaho\n",
"email : gilbertgregory@hotmail.com\n",
"user_name : julieirwin\n",
"\n",
"2/10 positive, 0/10 negative\n",
"Do these records refer to the same thing?\n",
"(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
" y\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"name : Elena Ferrer\n",
"job : Air cbi crew\n",
"company : Cjjjrej Group\n",
"street_address : Raba Susana Beengue 58 Piso 8 \n",
"city : Cárs\n",
"state : Cádiz\n",
"email : oscarlosfortny@hotmail.com\n",
"user_name : eli8\n",
"\n",
"name : Elena Ferrer\n",
"job : Air cabin crew\n",
"company : Casares Group\n",
"street_address : Rambla Susana Berenguer 58 Piso 8 \n",
"city : Cáceres\n",
"state : Cádiz\n",
"email : jose-carlosfortuny@hotmail.com\n",
"user_name : felix28\n",
"\n",
"3/10 positive, 0/10 negative\n",
"Do these records refer to the same thing?\n",
"(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
" y\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"name : Don Wilson\n",
"job : Librarian acadmic\n",
"company : Ferlaldez, Thomal ald Rodriguez\n",
"street_address : 84984 Tiffy Pt\n",
"city : Wes Jamesmuh\n",
"state : Vrgna\n",
"email : heenerez@maicm\n",
"user_name : tindvis\n",
"\n",
"name : Don Wilson\n",
"job : Librarian, academic\n",
"company : Fernandez, Thomas and Rodriguez\n",
"street_address : 84984 Tiffany Path\n",
"city : West Jamesmouth\n",
"state : Virginia\n",
"email : helenperez@gmail.com\n",
"user_name : tinadavis\n",
"\n",
"4/10 positive, 0/10 negative\n",
"Do these records refer to the same thing?\n",
"(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
" y\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"name : Prof. Tülay Aumann B.Eng.\n",
"job : Engineer, maintenance\n",
"company : Schmidtke Holzapfel KG\n",
"street_address : Ebertplatz 6\n",
"city : Merseburg\n",
"state : Niedersachsen\n",
"email : patrickade@yahoo.de\n",
"user_name : delia00\n",
"\n",
"name : Herr Piotr Budig B.Sc.\n",
"job : Control and instrumentation engineer\n",
"company : Käster\n",
"street_address : Kraushaarstraße 5/8\n",
"city : Burg\n",
"state : Niedersachsen\n",
"email : rosaliaschulz@yahoo.de\n",
"user_name : zkruschwitz\n",
"\n",
"5/10 positive, 0/10 negative\n",
"Do these records refer to the same thing?\n",
"(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
" n\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"name : Hunter Jimenez\n",
"job : Sub\n",
"company : Patton, Little and Dorsey\n",
"street_address : 382 Mark Locks\n",
"city : Andrewbury\n",
"state : Minnesota\n",
"email : campbellkimberly@tapia.com\n",
"user_name : christophergonzales\n",
"\n",
"name : Shawn Dillon\n",
"job : Scientist, research (physical sciences)\n",
"company : Smith, Carney and Gamble\n",
"street_address : 1971 Jessica Locks\n",
"city : East Anthonyborough\n",
"state : Kansas\n",
"email : zgray@gmail.com\n",
"user_name : gsmith\n",
"\n",
"5/10 positive, 1/10 negative\n",
"Do these records refer to the same thing?\n",
"(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
" n\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"name : Pasquale Schwital\n",
"job : Trae mar arney\n",
"company : Fink5\n",
"street_address : Dl-Bnnr-Plz 0/1\n",
"city : Bur\n",
"state : Neersachsen\n",
"email : hann-lr@gm.d\n",
"user_name : thomas14\n",
"\n",
"name : Pasquale Schwital\n",
"job : Trade mark attorney\n",
"company : Finke\n",
"street_address : Detlef-Binner-Platz 0/1\n",
"city : Burg\n",
"state : Niedersachsen\n",
"email : hanne-lore98@gmx.de\n",
"user_name : thomas14\n",
"\n",
"5/10 positive, 2/10 negative\n",
"Do these records refer to the same thing?\n",
"(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
" f\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Finished labeling\n"
]
}
],
"source": [
"dedupe.console_label(deduper)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Bereits das erste verglichene Datensatzpaar macht deutlich, dass wir dieses Duplikat mit unserem obigen `drop_duplicates`-Beispiel nicht gelöscht haben – `xpalau` und `xalau` wurden als unterschiedlich erkannt."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Mit [Dedupe.train](https://docs.dedupe.io/en/latest/API-documentation.html#dedupe.Dedupe.train) werden aus den von euch markierten Datensatzpaaren der endgültige paarweise Klassifikator und die Fingerprinting-Regeln (Prädikate) gelernt.\n",
"\n",
"Mit `index_predicates=True` berücksichtigt die Deduplizierung auch Prädikate, die auf der Indizierung der Daten beruhen.\n",
"\n",
"Wenn ihr fertig seid, speichert die gelernten Einstellungen (Datenmodell und Prädikate) mit [Dedupe.write_settings](https://docs.dedupe.io/en/latest/API-documentation.html#dedupe.Dedupe.write_settings)."
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:14:10.650172Z",
"iopub.status.busy": "2026-05-22T14:14:10.650005Z",
"iopub.status.idle": "2026-05-22T14:14:10.655826Z",
"shell.execute_reply": "2026-05-22T14:14:10.655097Z",
"shell.execute_reply.started": "2026-05-22T14:14:10.650156Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"reading from csv_example_learned_settings\n"
]
}
],
"source": [
"settings_file = \"csv_example_learned_settings\"\n",
"# Use a real Path object: calling the unbound Path.open() with a str as `self`\n",
"# only works by accident on some Python versions.\n",
"settings_path = Path(settings_file)\n",
"\n",
"if settings_path.exists():\n",
"    # Learned settings already exist: load them instead of retraining.\n",
"    print(\"reading from\", settings_file)\n",
"    with settings_path.open(\"rb\") as f:\n",
"        deduper = dedupe.StaticDedupe(f)\n",
"else:\n",
"    # No saved settings yet: learn from the labelled pairs, then persist the result.\n",
"    deduper.train(index_predicates=True)\n",
"    with settings_path.open(\"wb\") as sf:\n",
"        deduper.write_settings(sf)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Mit [dedupe.Dedupe.partition](https://docs.dedupe.io/en/latest/API-documentation.html#dedupe.Dedupe.partition) werden Datensätze identifiziert, die sich alle auf dieselbe Entität beziehen, und als Tupel zurückgegeben, die eine Folge von Datensatz-IDs und Konfidenzwerten sind. Weitere Einzelheiten zum Konfidenzwert findet ihr unter [dedupe.Dedupe.cluster](https://docs.dedupe.io/en/latest/API-documentation.html#dedupe.Dedupe.cluster)."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:14:10.656719Z",
"iopub.status.busy": "2026-05-22T14:14:10.656556Z",
"iopub.status.idle": "2026-05-22T14:14:13.020071Z",
"shell.execute_reply": "2026-05-22T14:14:13.019720Z",
"shell.execute_reply.started": "2026-05-22T14:14:10.656704Z"
}
},
"outputs": [],
"source": [
"dupes = deduper.partition(customers.T.to_dict())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Wir können uns auch nur einzelne Einträge ausgeben lassen:"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:14:13.020757Z",
"iopub.status.busy": "2026-05-22T14:14:13.020636Z",
"iopub.status.idle": "2026-05-22T14:14:13.023829Z",
"shell.execute_reply": "2026-05-22T14:14:13.023598Z",
"shell.execute_reply.started": "2026-05-22T14:14:13.020745Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"((np.int64(0), np.int64(963)), (np.float32(0.9495104), np.float32(0.9495104)))"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dupes[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Diese können wir uns dann mit [pandas.DataFrame.iloc](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html) anzeigen lassen:"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"execution": {
"iopub.execute_input": "2026-05-22T14:14:13.026838Z",
"iopub.status.busy": "2026-05-22T14:14:13.026746Z",
"iopub.status.idle": "2026-05-22T14:14:13.032320Z",
"shell.execute_reply": "2026-05-22T14:14:13.032080Z",
"shell.execute_reply.started": "2026-05-22T14:14:13.026830Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" job | \n",
" company | \n",
" street_address | \n",
" city | \n",
" state | \n",
" email | \n",
" user_name | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Patricia Schaefer | \n",
" Programmer, systems | \n",
" Estrada-Best | \n",
" 398 Paul Drive | \n",
" Christianview | \n",
" Delaware | \n",
" lambdavid@gmail.com | \n",
" ndavidson | \n",
"
\n",
" \n",
" | 963 | \n",
" Patricia Schaefer | \n",
" Prorammer, ytem | \n",
" Es:rada-Bes: | \n",
" 39 Pul Drve | \n",
" Chistianview | \n",
" Delwre | \n",
" mbdvid@gmim | \n",
" ndvdson | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name job company street_address \\\n",
"0 Patricia Schaefer Programmer, systems Estrada-Best 398 Paul Drive \n",
"963 Patricia Schaefer Prorammer, ytem Es:rada-Bes: 39 Pul Drve \n",
"\n",
" city state email user_name \n",
"0 Christianview Delaware lambdavid@gmail.com ndavidson \n",
"963 Chistianview Delwre mbdvid@gmim ndvdson "
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Take the record IDs of the first cluster from the partition result instead of\n",
"# hard-coding them, so this cell stays correct if the clustering changes.\n",
"# NOTE: the IDs are index labels; they equal positions here only because\n",
"# `customers` was read with the default RangeIndex.\n",
"customers.iloc[list(dupes[0][0])]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.13 Kernel",
"language": "python",
"name": "python313"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
},
"latex_envs": {
"LaTeX_envs_menu_present": true,
"autoclose": false,
"autocomplete": true,
"bibliofile": "biblio.bib",
"cite_by": "apalike",
"current_citInitial": 1,
"eqLabelWithNumbers": true,
"eqNumInitial": 1,
"hotkeys": {
"equation": "Ctrl-E",
"itemize": "Ctrl-I"
},
"labels_anchors": false,
"latex_user_defs": false,
"report_style_numbering": false,
"user_envs_cfg": false
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {},
"version_major": 2,
"version_minor": 0
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}