Skip to content

Commit

Permalink
Add end-to-end tests at the end of Datalab Quickstart tutorial (#1118)
Browse files Browse the repository at this point in the history
Co-authored-by: Elías Snorrason <eliassno@gmail.com>
  • Loading branch information
allincowell and elisno committed May 14, 2024
1 parent ab05f86 commit 2870e19
Showing 1 changed file with 46 additions and 0 deletions.
46 changes: 46 additions & 0 deletions docs/source/tutorials/datalab/datalab_quickstart.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -746,6 +746,52 @@
"\n",
"To learn more, check out this [example notebook](https://github.com/cleanlab/examples/blob/master/datalab_image_classification/datalab.ipynb) (demonstrates Datalab applied to a real dataset) and the [advanced Datalab tutorial](datalab_advanced.html) (demonstrates configuration and customization options to exert greater control)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"nbsphinx": "hidden"
},
"outputs": [],
"source": [
"# Note: This cell is only for docs.cleanlab.ai. If you are running on local Jupyter or Colab, please ignore it.\n",
"# It asserts that the issues Datalab reported above overlap with the issues this tutorial expects to find.\n",
"from sklearn.metrics import roc_auc_score\n",
"\n",
"\n",
"def jaccard_similarity(l1, l2):\n",
"    \"\"\"Return the Jaccard similarity of two collections treated as sets.\n",
"\n",
"    This is the size of the intersection divided by the size of the union.\n",
"    Returns 0.0 when both collections are empty, which avoids dividing by zero.\n",
"    \"\"\"\n",
"    s1, s2 = set(l1), set(l2)\n",
"    union_set = s1 | s2\n",
"    if not union_set:\n",
"        return 0.0\n",
"    return len(s1 & s2) / len(union_set)\n",
"\n",
"\n",
"def flagged_indices(issues, flag_column):\n",
"    \"\"\"Return the index labels of the rows of `issues` where `flag_column` is True.\n",
"\n",
"    Assumes `flag_column` holds booleans, so it can be used directly as a row mask.\n",
"    \"\"\"\n",
"    return issues[issues[flag_column]].index.tolist()\n",
"\n",
"\n",
"issue_results = lab.get_issues(\"label\")\n",
"outlier_results = lab.get_issues(\"outlier\")\n",
"duplicate_results = lab.get_issues(\"near_duplicate\")\n",
"\n",
"# Label issues: expected at the positions where noisy_labels_idx differs from y_train_idx.\n",
"identified_label_issues_indices = flagged_indices(issue_results, \"is_label_issue\")\n",
"label_issue_indices = np.where(y_train_idx != noisy_labels_idx)[0]\n",
"\n",
"label_quality_scores = issue_results[\"label_score\"].tolist()\n",
"# Z[i] is 1.0 where noisy_labels_idx agrees with y_train_idx and 0.0 otherwise; it is the target for the AUC check below.\n",
"Z = (y_train_idx == noisy_labels_idx).astype(float).tolist()\n",
"\n",
"# Outliers: expected at indices 125..130 (presumably fixed by the data-generation cell earlier in this notebook; re-check these if that cell changes).\n",
"identified_outlier_issues_indices = flagged_indices(outlier_results, \"is_outlier_issue\")\n",
"outlier_issue_indices = list(range(125, 130 + 1))\n",
"\n",
"# First row of X_train that equals X_duplicate element-wise; fail with a clear message if there is none.\n",
"exact_duplicate_idx = next(\n",
"    (index for index, elem in enumerate(X_train) if (elem == X_duplicate).all()),\n",
"    None,\n",
")\n",
"assert exact_duplicate_idx is not None, \"X_duplicate was not found in X_train\"\n",
"if exact_duplicate_idx >= 125:  # if the random index selected to create a duplicate >= 125, then the last point is also an outlier\n",
"    outlier_issue_indices.append(131)\n",
"\n",
"# Near duplicates: the duplicated row plus indices 129, 130 and 131.\n",
"identified_duplicate_issues_indices = flagged_indices(duplicate_results, \"is_near_duplicate_issue\")\n",
"duplicate_issue_indices = [exact_duplicate_idx, 129, 130, 131]\n",
"\n",
"# Compute each metric once so that a failing assertion can report the measured value.\n",
"label_jaccard = jaccard_similarity(identified_label_issues_indices, label_issue_indices)\n",
"label_auc = roc_auc_score(Z, label_quality_scores)\n",
"outlier_jaccard = jaccard_similarity(identified_outlier_issues_indices, outlier_issue_indices)\n",
"duplicate_jaccard = jaccard_similarity(identified_duplicate_issues_indices, duplicate_issue_indices)\n",
"\n",
"assert label_jaccard > 0.4, f\"Jaccard similarity for label issues is {label_jaccard:.3f}, expected > 0.4\"\n",
"assert label_auc > 0.9, f\"ROC AUC of label quality scores is {label_auc:.3f}, expected > 0.9\"\n",
"assert outlier_jaccard > 0.9, f\"Jaccard similarity for outlier issues is {outlier_jaccard:.3f}, expected > 0.9\"\n",
"assert duplicate_jaccard > 0.9, f\"Jaccard similarity for near-duplicate issues is {duplicate_jaccard:.3f}, expected > 0.9\""
]
}
],
"metadata": {
Expand Down

0 comments on commit 2870e19

Please sign in to comment.