gdf_apply

Apply or parallel apply a function to any col or row of a GeoDataFrame.

Parameters:

Name	Type	Description	Default
`gdf`	`gpd.GeoDataFrame`	Input GeoDataFrame.	required
`func`	`Callable`	A callable function.	required
`axis`	`int`	The gdf axis to apply the function on. axis=1 means row-wise, axis=0 means column-wise.	`1`
`parallel`	`bool`	Flag, whether to parallelize the operation with `pandarallel`.	`True`
`num_processes`	`int`	The number of processes to use when parallel=True. If -1, this will use all available cores.	`-1`
`pbar`	`bool`	Show progress bar when executing in parallel mode. Ignored if `parallel=False`.	`False`
`columns`	`Optional[Tuple[str, ...]]`	A tuple of column names to apply the function on. If None, this will apply the function to all columns.	`None`
`**kwargs`	`Dict[str, Any]`	Arbitrary keyword args for the `func` callable.	`{}`

Returns:

Type	Description
`GeoSeries`	gpd.GeoSeries: A GeoSeries object containing the computed values for each row or col in the input gdf.

Examples:

Get the compactness of the polygons in a gdf

>>> from histolytics.data import hgsc_cancer_nuclei
>>> from histolytics.utils.gdf import gdf_apply
>>> from histolytics.spatial_geom.morphometrics import compactness
>>> gdf = hgsc_cancer_nuclei()
>>> gdf["compactness"] = gdf_apply(
...     gdf, compactness, columns=["geometry"], parallel=True, num_processes=3
... )
                                                geometry  class_name  compactness
    0  POLYGON ((1394.01 0, 1395.01 1.99, 1398 3.99, ...  connective     0.578699
    1  POLYGON ((1391 2.01, 1387 2.01, 1384.01 3.01, ...  connective     0.947018
    2  POLYGON ((1382.99 156.01, 1380 156.01, 1376.01...  connective     0.604828

Source code in src/histolytics/utils/gdf.py

def gdf_apply(
    gdf: gpd.GeoDataFrame,
    func: Callable,
    axis: int = 1,
    parallel: bool = True,
    num_processes: Optional[int] = -1,
    pbar: bool = False,
    columns: Optional[Tuple[str, ...]] = None,
    **kwargs,
) -> gpd.GeoSeries:
    """Apply or parallel apply a function to any col or row of a GeoDataFrame.

    The function is invoked as `func(*x, **kwargs)`, where `x` is each row
    (`axis=1`) or each column (`axis=0`) of the (optionally column-subsetted)
    frame, i.e. the values of `x` are passed as positional arguments.

    Parameters:
        gdf (gpd.GeoDataFrame):
            Input GeoDataFrame.
        func (Callable):
            A callable function.
        axis (int):
            The gdf axis to apply the function on. axis=1 means row-wise,
            axis=0 means column-wise.
        parallel (bool):
            Flag, whether to parallelize the operation with `pandarallel`.
        num_processes (int):
            The number of processes to use when parallel=True. If -1 (or None),
            this will use all available physical cores as reported by `psutil`.
        pbar (bool):
            Show progress bar when executing in parallel mode. Ignored if
            `parallel=False`.
        columns (Optional[Tuple[str, ...]]):
            A tuple or list of column names to apply the function on. If None,
            this will apply the function to all columns.
        **kwargs (Dict[str, Any]): Arbitrary keyword args for the `func` callable.

    Raises:
        ValueError: If `columns` is given but is neither a tuple nor a list.

    Returns:
        gpd.GeoSeries:
            The result of the (parallel) apply call, containing the computed
            values for each row or col in the input gdf.

    Examples:
        Get the compactness of the polygons in a gdf
        >>> from histolytics.data import hgsc_cancer_nuclei
        >>> from histolytics.utils.gdf import gdf_apply
        >>> from histolytics.spatial_geom.morphometrics import compactness
        >>> gdf = hgsc_cancer_nuclei()
        >>> gdf["compactness"] = gdf_apply(
        ...     gdf, compactness, columns=["geometry"], parallel=True, num_processes=3
        ... )
        >>> print(gdf.head(3))
                                                        geometry  class_name  compactness
            0  POLYGON ((1394.01 0, 1395.01 1.99, 1398 3.99, ...  connective     0.578699
            1  POLYGON ((1391 2.01, 1387 2.01, 1384.01 3.01, ...  connective     0.947018
            2  POLYGON ((1382.99 156.01, 1380 156.01, 1376.01...  connective     0.604828
    """
    if columns is not None:
        if not isinstance(columns, (tuple, list)):
            raise ValueError(f"columns must be a tuple or list, got {type(columns)}")
        # A tuple key is interpreted by pandas as ONE (MultiIndex-style) label,
        # so `gdf[("geometry",)]` raises KeyError. Converting to a list makes
        # both accepted container types select a sub-frame of columns.
        gdf = gdf[list(columns)]

    if not parallel:
        return gdf.apply(lambda x: func(*x, **kwargs), axis=axis)

    if num_processes is None or num_processes == -1:
        # The physical core count can be undetermined (None) on some
        # platforms; fall back to the logical count and finally to 1 worker.
        cpus = psutil.cpu_count(logical=False) or psutil.cpu_count() or 1
    else:
        cpus = num_processes

    pandarallel.initialize(verbose=1, progress_bar=pbar, nb_workers=cpus)
    return gdf.parallel_apply(lambda x: func(*x, **kwargs), axis=axis)