class ClusterLabelsEvalStats(EvalStats[TMetric], ABC):
    """
    Base class for evaluation statistics that are derived from the labels of a clustering result.

    On construction, the labels are split into noise and non-noise labels and the size of every
    non-noise cluster is determined.
    """
    # String identifiers for cluster size statistics (where they are consumed is not visible in this section)
    NUM_CLUSTERS = "numClusters"
    AV_SIZE = "averageClusterSize"
    MEDIAN_SIZE = "medianClusterSize"
    STDDEV_SIZE = "clusterSizeStd"
    MIN_SIZE = "minClusterSize"
    MAX_SIZE = "maxClusterSize"
    NOISE_SIZE = "noiseClusterSize"

    def __init__(self, labels: Sequence[int], noise_label: int, default_metrics: List[TMetric],
                 additional_metrics: List[TMetric] = None):
        """
        :param labels: sequence of cluster labels; stored as a numpy array
        :param noise_label: the label value that marks a point as noise
        :param default_metrics: metrics passed on to the super-class initializer
        :param additional_metrics: further metrics passed on to the super-class initializer
        """
        label_array = np.array(labels)
        # True wherever a label belongs to a regular cluster, i.e. is not the noise label
        in_cluster_mask = label_array != noise_label

        self.labels = label_array
        self.noiseLabel = noise_label
        self.clusterLabelsMask: np.ndarray = in_cluster_mask
        self.noiseLabelsMask: np.ndarray = np.logical_not(in_cluster_mask)
        self.clustersLabels = label_array[in_cluster_mask]

        # distinct non-noise labels together with the number of points carrying each of them
        identifiers, sizes = np.unique(self.clustersLabels, return_counts=True)
        if len(sizes) == 0:
            # max, min and similar reductions raise an exception for empty arrays;
            # a single zero entry is used as a stand-in to avoid that
            sizes = np.zeros(1)
        self.clusterIdentifiers = identifiers
        self.clusterSizeDistribution = sizes
        self.noiseClusterSize = self.noiseLabelsMask.sum()

        super().__init__(default_metrics, additional_metrics=additional_metrics)
def compute_value_for_eval_stats(self, eval_stats: "ClusteringUnsupervisedEvalStats") -> float:
    """
    Computes the metric value for the non-noise part of a clustering result.

    :param eval_stats: the evaluation stats object providing ``clustersLabels`` and ``clustersDatapoints``
    :return: the result of ``compute_value`` applied to the clustered datapoints and their labels;
        if there are no cluster labels at all (everything is noise), the metric's ``worstValue``
        is returned instead (0 if the metric does not define one)
    """
    cluster_labels = eval_stats.clustersLabels
    if len(cluster_labels) == 0:
        # All is noise, so there is nothing to score. A hard-coded 0 would be the best attainable
        # result for a score that is bounded below by 0 and where lower is better, so the metric's
        # declared worst value is reported; 0 remains the fallback when none is declared.
        return getattr(self, "worstValue", 0)
    return self.compute_value(eval_stats.clustersDatapoints, cluster_labels)
class DaviesBouldinScore(RemovedNoiseUnsupervisedMetric):
    """
    Metric registered under the name "DaviesBouldinScore" with a declared worst value of 1.
    """
    name = "DaviesBouldinScore"

    # TODO: in some edge cases this score might exceed one, in which case the worst value
    #   declared below would be too low; this should be looked into
    worstValue = 1
class ClusteringUnsupervisedEvalStats(ClusterLabelsEvalStats[ClusteringUnsupervisedMetric]):
    """
    Evaluation statistics of a clustering result which are computed from the clustered datapoints
    and the labels assigned to them
    """
    def __init__(self, datapoints: np.ndarray, labels: Sequence[int], noise_label=-1,
                 metrics: Sequence[ClusteringUnsupervisedMetric] = None,
                 additional_metrics: Sequence[ClusteringUnsupervisedMetric] = None):
        """
        :param datapoints: datapoints that were clustered
        :param labels: sequence of labels (one per datapoint), usually the output of some clustering algorithm
        :param noise_label: the label that marks a datapoint as noise; such datapoints are kept
            separately from the clustered datapoints
        :param metrics: the metrics to compute. If None, will compute default metrics
        :param additional_metrics: the metrics to additionally compute
        :raises ValueError: if the number of labels differs from the number of datapoints
        """
        if len(labels) != len(datapoints):
            raise ValueError("Length of labels does not match length of datapoints array")

        # Silhouette score is not included by default because it takes long to compute
        metrics = [CalinskiHarabaszScore(), DaviesBouldinScore()] if metrics is None else metrics

        # the super-class initializer provides the masks used for the split below
        super().__init__(labels, noise_label, metrics, additional_metrics=additional_metrics)

        self.datapoints = datapoints
        self.clustersDatapoints = datapoints[self.clusterLabelsMask]
        self.noiseDatapoints = datapoints[self.noiseLabelsMask]
class ClusteringSupervisedEvalStats(ClusterLabelsEvalStats[ClusteringSupervisedMetric]):
    """
    Evaluation statistics of a clustering result which are computed by comparing the assigned labels
    with ground truth cluster labels
    """
    def __init__(self, labels: Sequence[int], true_labels: Sequence[int], noise_label=-1,
                 metrics: Sequence[ClusteringSupervisedMetric] = None,
                 additional_metrics: Sequence[ClusteringSupervisedMetric] = None):
        """
        :param labels: sequence of labels, usually the output of some clustering algorithm
        :param true_labels: sequence of labels that represent the ground truth clusters
        :param noise_label: the label that marks a point as noise (in both label sequences)
        :param metrics: the metrics to compute. If None, will compute default metrics
        :param additional_metrics: the metrics to additionally compute
        :raises ValueError: if the two label sequences differ in length
        """
        if len(true_labels) != len(labels):
            raise ValueError("true labels must be of same shape as labels")

        self.trueLabels = np.array(true_labels)
        # lazily computed by labels_with_removed_common_noise
        self._labels_with_removed_common_noise = None

        if metrics is None:
            metrics = [
                VMeasureScore(),
                FowlkesMallowsScore(),
                AdjustedRandScore(),
                AdjustedMutualInfoScore(),
            ]
        super().__init__(labels, noise_label, metrics, additional_metrics=additional_metrics)

    def labels_with_removed_common_noise(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        The result is computed on the first call and cached for subsequent calls.

        :return: tuple (labels, true_labels) where points classified as noise in true and predicted data were removed;
            if the noise label is None, both label arrays are returned as they are
        """
        cached = self._labels_with_removed_common_noise
        if cached is not None:
            return cached

        if self.noiseLabel is None:
            result = self.labels, self.trueLabels
        else:
            true_noise_mask = self.trueLabels == self.noiseLabel
            # points that are noise according to both the prediction and the ground truth
            noise_in_both_mask = np.logical_and(self.noiseLabelsMask, true_noise_mask)
            retained_mask = np.logical_not(noise_in_both_mask)
            result = self.labels[retained_mask], self.trueLabels[retained_mask]

        self._labels_with_removed_common_noise = result
        return result