Source code for sensai.geoanalytics.geopandas.coordinate_clustering

import logging
from typing import Callable, Union, Iterable

import geopandas as gp
import numpy as np
import pandas as pd
from shapely.geometry import MultiPoint

from .coordinates import validate_coordinates, extract_coordinates_array, TCoordinates, GeoDataFrameWrapper
from ...clustering import SkLearnEuclideanClusterer
from ...clustering.clustering_base import EuclideanClusterer
from ...clustering.sklearn_clustering import SkLearnClustererProtocol
from ...util.cache import LoadSaveInterface
from ...util.profiling import timed

log = logging.getLogger(__name__)


class CoordinateEuclideanClusterer(EuclideanClusterer, GeoDataFrameWrapper):
    """
    Wrapper around a clustering model. This class adds additional, geospatial-specific features
    (coordinate validation, export to GeoDataFrames, plotting) to the provided clusterer.

    :param clusterer: the EuclideanClusterer to wrap; its noise label and cluster size limits are reused
    """

    def __init__(self, clusterer: EuclideanClusterer):
        self.clusterer = clusterer
        super().__init__(noise_label=clusterer.noiseLabel, max_cluster_size=clusterer.maxClusterSize,
            min_cluster_size=clusterer.minClusterSize)

    class Cluster(EuclideanClusterer.Cluster, GeoDataFrameWrapper, LoadSaveInterface):
        """
        Wrapper around a coordinates array

        :param coordinates: array of coordinates; checked with validate_coordinates before being stored
        :param identifier: identifier of the cluster
        """

        def __init__(self, coordinates: np.ndarray, identifier: Union[str, int]):
            validate_coordinates(coordinates)
            super().__init__(coordinates, identifier)

        def to_geodf(self, crs='epsg:3857'):
            """
            Export the cluster as a GeoDataFrame of length 1 with the cluster as an instance of
            MultiPoint and the identifier as index.

            :param crs: projection. By default pseudo-mercator
            :return: GeoDataFrame
            """
            gdf = gp.GeoDataFrame({"geometry": [self.as_multipoint()]}, index=[self.identifier], crs=crs)
            gdf.index.name = "identifier"
            return gdf

        def as_multipoint(self):
            """
            :return: The cluster's coordinates as a MultiPoint object
            """
            return MultiPoint(self.datapoints)

        @classmethod
        def load(cls, path):
            """
            Instantiate from a geopandas readable file containing a single row with an identifier
            and an instance of MultiPoint

            :param path: location of the file to read
            :return: instance of CoordinateCluster
            :raises ValueError: if the file does not contain exactly one row
            """
            log.info(f"Loading instance of {cls.__name__} from {path}")
            gdf = gp.read_file(path)
            if len(gdf) != 1:
                raise ValueError(f"Expected {path} to contain a single row, instead got {len(gdf)}")
            identifier, multipoint = gdf.identifier.values[0], gdf.geometry.values[0]
            # Multi-part geometries are no longer directly iterable in Shapely 2.0;
            # `.geoms` is available in both Shapely 1.x and 2.x
            return cls(np.array([[p.x, p.y] for p in multipoint.geoms]), identifier)

        def save(self, path, crs="EPSG:3857"):
            """
            Saves the cluster's coordinates as shapefile

            :param path: location of the file to write
            :param crs: projection assigned to the exported GeoDataFrame
            :return:
            """
            log.info(f"Saving instance of {self.__class__.__name__} as shapefile to {path}")
            self.to_geodf(crs).to_file(path, index=True)

    def _compute_labels(self, x: np.ndarray) -> np.ndarray:
        # validate first, then delegate the actual labelling to the wrapped clusterer
        validate_coordinates(x)
        return self.clusterer._compute_labels(x)

    def fit(self, coordinates: TCoordinates):
        """
        Fitting to coordinates from a numpy array, a MultiPoint object or a GeoDataFrame with one
        Point per row

        :param coordinates: the coordinates to cluster
        :return:
        """
        coordinates = extract_coordinates_array(coordinates)
        super().fit(coordinates)

    @timed
    def to_geodf(self, condition: Callable[[Cluster], bool] = None, crs='epsg:3857',
            include_noise=False) -> gp.GeoDataFrame:
        """
        GeoDataFrame containing all clusters found by the model.
        It is a concatenation of GeoDataFrames of individual clusters

        :param condition: if provided, only clusters fulfilling the condition will be included
        :param crs: projection. By default pseudo-mercator
        :param include_noise: whether to append the noise cluster as an additional row
        :return: GeoDataFrame with all clusters indexed by their identifier
        """
        # Collect the per-cluster frames and concatenate once instead of re-copying the
        # accumulated frame on every iteration.
        frames = [cluster.to_geodf(crs=crs) for cluster in self.clusters(condition)]
        if include_noise:
            frames.append(self.noise_cluster().to_geodf(crs=crs))
        if not frames:
            # An explicit (empty) geometry column is required for a CRS to be attached
            return gp.GeoDataFrame({"geometry": []}, crs=crs)
        return pd.concat(frames)

    def plot(self, include_noise=False, condition=None, **kwargs):
        """
        Plots the resulting clusters with random coloring

        :param include_noise: Whether to include the noise cluster
        :param condition: If provided, only clusters fulfilling this condition will be included
        :param kwargs: passed to GeoDataFrame.plot
        :return:
        """
        geodf = self.to_geodf(condition=condition, include_noise=include_noise)
        geodf["color"] = np.random.random(len(geodf))
        if include_noise:
            # give the noise cluster a fixed color value instead of a random one
            geodf.loc[self.noiseLabel, "color"] = 0
        geodf.plot(column="color", **kwargs)

    # the overriding of the following methods is only necessary for getting the type annotations right
    # if mypy ever permits annotating nested classes correctly, these methods can be removed

    def get_cluster(self, cluster_id: int) -> Cluster:
        return super().get_cluster(cluster_id)

    def noise_cluster(self) -> Cluster:
        return super().noise_cluster()

    def clusters(self, condition: Callable[[Cluster], bool] = None) -> Iterable[Cluster]:
        return super().clusters(condition=condition)
class SkLearnCoordinateClustering(CoordinateEuclideanClusterer):
    """
    Wrapper around a sklearn clusterer. This class adds additional features like relabelling and
    convenient methods for handling geospatial data

    :param clusterer: a clusterer object compatible with the sklearn API
    :param noise_label: label that is associated with the noise cluster or None
    :param min_cluster_size: if not None, clusters below this size will be labeled as noise
    :param max_cluster_size: if not None, clusters above this size will be labeled as noise
    """

    def __init__(self, clusterer: SkLearnClustererProtocol, noise_label=-1,
            min_cluster_size: int = None, max_cluster_size: int = None):
        # adapt the sklearn object to the EuclideanClusterer interface, then hand the
        # adapter to the geospatial wrapper
        euclidean_clusterer = SkLearnEuclideanClusterer(
            clusterer,
            noise_label=noise_label,
            min_cluster_size=min_cluster_size,
            max_cluster_size=max_cluster_size,
        )
        super().__init__(euclidean_clusterer)