Source code for thelper.data.geo.ogc

"""Data parsers & utilities module for OGC-related projects."""

import copy
import functools
import logging

import cv2 as cv
import numpy as np
import tqdm

import thelper.tasks
import thelper.utils
from thelper.data.geo.parsers import TileDataset, VectorCropDataset
from thelper.train.utils import DetectLogger

logger = logging.getLogger(__name__)


[docs]class TB15D104:
    """Wrapper class for OGC Testbed-15 (D104) identifiers."""

    TYPECE_RIVER = "10"
    TYPECE_LAKE = "21"

    BACKGROUND_ID = 0
    LAKE_ID = 1


class TB15D104Dataset(VectorCropDataset):
    """OGC Testbed-15 dataset parser for D104 (lake/river) segmentation task."""

    def __init__(self, raster_path, vector_path, px_size=None,
                 allow_outlying_vectors=True, clip_outlying_vectors=True,
                 lake_area_min=0.0, lake_area_max=float("inf"),
                 lake_river_max_dist=float("inf"), feature_buffer=1000,
                 master_roi=None, focus_lakes=True, srs_target="3857", force_parse=False,
                 reproj_rasters=False, reproj_all_cpus=True, display_debug=False,
                 keep_rasters_open=True, parallel=False, transforms=None):
        assert isinstance(lake_river_max_dist, (float, int)) and lake_river_max_dist >= 0, "unexpected dist type"
        self.lake_river_max_dist = float(lake_river_max_dist)
        assert isinstance(focus_lakes, bool), "unexpected flag type"
        self.focus_lakes = focus_lakes
        assert px_size is None or isinstance(px_size, (float, int)), "pixel size (resolution) must be float/int"
        px_size = (1.0, 1.0) if px_size is None else (float(px_size), float(px_size))
        # note: we wrap partial static functions for caching to see when internal parameters are changing
        cleaner = functools.partial(self.lake_cleaner, area_min=lake_area_min, area_max=lake_area_max,
                                    lake_river_max_dist=lake_river_max_dist, parallel=parallel)
        if self.focus_lakes:
            cropper = functools.partial(self.lake_cropper, px_size=px_size, skew=(0.0, 0.0),
                                        feature_buffer=feature_buffer, parallel=parallel)
        else:
            # TODO: implement river-focused cropper (i.e. river-length parsing?)
            raise NotImplementedError
        super().__init__(raster_path=raster_path, vector_path=vector_path, px_size=px_size, skew=None,
                         allow_outlying_vectors=allow_outlying_vectors, clip_outlying_vectors=clip_outlying_vectors,
                         vector_area_min=lake_area_min, vector_area_max=lake_area_max, vector_target_prop=None,
                         feature_buffer=feature_buffer, master_roi=master_roi, srs_target=srs_target,
                         raster_key="lidar", mask_key="hydro", cleaner=cleaner, cropper=cropper,
                         force_parse=force_parse, reproj_rasters=reproj_rasters, reproj_all_cpus=reproj_all_cpus,
                         keep_rasters_open=keep_rasters_open, transforms=transforms)
        meta_keys = self.task.meta_keys
        if "bboxes" in meta_keys:
            del meta_keys[meta_keys.index("bboxes")]  # placed in meta list by base class constr, moved to detect target below
        self.task = thelper.tasks.Detection(class_names={"background": TB15D104.BACKGROUND_ID, "lake": TB15D104.LAKE_ID},
                                            input_key="input", bboxes_key="bboxes",
                                            meta_keys=meta_keys, background=0, color_map={"lake": [255, 0, 0]})
        # update all already-created bboxes with new task ref
        for s in self.samples:
            for b in s["bboxes"]:
                b.task = self.task
        self.display_debug = display_debug
        self.parallel = parallel

    @staticmethod
    def lake_cleaner(features, area_min, area_max, lake_river_max_dist, parallel=False):
        """Flags geometric features as 'clean' based on type and distance to nearest river."""
        # note: we use a flag here instead of removing bad features so that end-users can still use them if needed
        for f in features:
            f["clean"] = False  # flag every as 'bad' by default, clear just the ones of interest below
        rivers = [f for f in features if f["properties"]["TYPECE"] == TB15D104.TYPECE_RIVER]
        lakes = [f for f in features if f["properties"]["TYPECE"] == TB15D104.TYPECE_LAKE]
        logger.info(f"labeling and cleaning {len(lakes)} lakes...")

        def clean_lake(lake):
            if area_min <= lake["geometry"].area <= area_max:
                if lake_river_max_dist == float("inf"):
                    return True
                else:
                    for river in rivers:
                        # note: distance check below seems to be "intelligent", i.e. it will
                        # first check bbox distance, then check chull distance, and finally use
                        # the full geometries (doing these steps here explicitly makes it slower)
                        if lake["geometry"].distance(river["geometry"]) < lake_river_max_dist:
                            return True
            return False

        if parallel:
            if not isinstance(parallel, int):
                import multiprocessing
                parallel = multiprocessing.cpu_count()
            assert parallel > 0, "unexpected min core count"
            import joblib
            flags = joblib.Parallel(n_jobs=parallel)(joblib.delayed(
                clean_lake)(lake) for lake in tqdm.tqdm(lakes, desc="labeling + cleaning lakes"))
            for flag, lake in zip(flags, lakes):
                lake["clean"] = flag
        else:
            for lake in tqdm.tqdm(lakes, desc="labeling + cleaning lakes"):
                lake["clean"] = clean_lake(lake)
        return features

    @staticmethod
    def lake_cropper(features, rasters_data, coverage, srs_target, px_size, skew, feature_buffer, parallel=False):
        """Returns the ROI information for a given feature (may be modified in derived classes)."""
        srs_target_wkt = srs_target.ExportToWkt()

        def crop_feature(feature):
            import thelper.data.geo as geo
            assert feature["clean"]  # should not get here with bad features
            roi, roi_tl, roi_br, crop_width, crop_height = \
                geo.utils.get_feature_roi(feature["geometry"], px_size, skew, feature_buffer)
            roi_geotransform = (roi_tl[0], px_size[0], skew[0],
                                roi_tl[1], skew[1], px_size[1])
            # test all raster regions that touch the selected feature
            raster_hits = []
            for raster_idx, raster_data in enumerate(rasters_data):
                if raster_data["target_roi"].intersects(roi):
                    raster_hits.append(raster_idx)
            # make list of all other features that may be included in the roi
            roi_centroid = feature["centroid"]
            roi_radius = np.linalg.norm(np.asarray(roi_tl) - np.asarray(roi_br)) / 2
            roi_features, bboxes = [], []
            # note: the 'image id' is in fact the id of the focal feature in the crop
            image_id = int(feature["properties"]["OBJECTID"])
            for f in features:
                # note: here, f may not be 'clean', test anyway
                if f["geometry"].distance(roi_centroid) > roi_radius:
                    continue
                inters = f["geometry"].intersection(roi)
                if inters.is_empty:
                    continue
                roi_features.append(f)
                if f["properties"]["TYPECE"] == TB15D104.TYPECE_RIVER:
                    continue
                # only lakes can generate bboxes; make sure to clip them to the roi bounds
                clip = f["clipped"] or not inters.equals(f["geometry"])
                if clip:
                    assert inters.geom_type in ["Polygon", "MultiPolygon"], "unexpected inters type"
                    corners = []
                    if inters.geom_type == "Polygon":
                        bounds = inters.bounds
                        corners.append((bounds[0:2], bounds[2:4]))
                    elif inters.geom_type == "MultiPolygon":
                        for poly in inters:
                            bounds = poly.bounds
                            corners.append((bounds[0:2], bounds[2:4]))
                else:
                    corners = [(f["tl"], f["br"])]
                for c in corners:
                    feat_tl_px = geo.utils.get_pxcoord(roi_geotransform, *c[0])
                    feat_br_px = geo.utils.get_pxcoord(roi_geotransform, *c[1])
                    bbox = [max(0, feat_tl_px[0]), max(0, feat_tl_px[1]),
                            min(crop_width - 1, feat_br_px[0]),
                            min(crop_height - 1, feat_br_px[1])]
                    if bbox[2] - bbox[0] <= 1 or bbox[3] - bbox[1] <= 1:
                        continue  # skip all bboxes smaller than 1 px (c'mon...)
                    # note: lake class id is 1 by definition
                    bboxes.append(thelper.tasks.detect.BoundingBox(TB15D104.LAKE_ID,
                                                                   bbox=bbox,
                                                                   include_margin=False,
                                                                   truncated=clip,
                                                                   image_id=image_id))
            # prepare actual 'sample' for crop generation at runtime
            return {
                "features": roi_features,
                "bboxes": bboxes,
                "focal": feature,
                "id": image_id,
                "roi": roi,
                "roi_tl": roi_tl,
                "roi_br": roi_br,
                "raster_hits": raster_hits,
                "crop_width": crop_width,
                "crop_height": crop_height,
                "geotransform": np.asarray(roi_geotransform),
                "srs": srs_target_wkt,
            }

        clean_feats = [f for f in features if f["clean"]]
        if parallel:
            if not isinstance(parallel, int):
                import multiprocessing
                parallel = multiprocessing.cpu_count()
            assert parallel > 0, "unexpected min core count"
            import joblib
            samples = joblib.Parallel(n_jobs=parallel)(joblib.delayed(
                crop_feature)(feat) for feat in tqdm.tqdm(clean_feats, desc="preparing crop regions"))
        else:
            samples = []
            for feature in tqdm.tqdm(clean_feats, desc="validating crop candidates"):
                samples.append(crop_feature(feature))
        return [s for s in samples if s is not None]

    def _show_stats_plots(self, show=False, block=False):
        """Draws and returns feature stats histograms using pyplot."""
        import matplotlib.pyplot as plt
        feature_categories = {}
        for feat in self.features:
            curr_cat = feat["properties"]["TYPECE"]
            if curr_cat not in feature_categories:
                feature_categories[curr_cat] = []
            feature_categories[curr_cat].append(feat)
        fig, axes = plt.subplots(len(feature_categories))
        for idx, (cat, features) in enumerate(feature_categories.items()):
            areas = [f["geometry"].area for f in features]
            axes[idx].hist(areas, density=True, bins=30,
                           range=(max(self.area_min, min(areas)), min(self.area_max, max(areas))))
            axes[idx].set_xlabel("Surface (m^2)")
            axes[idx].set_title(f"TYPECE = {cat}")
            axes[idx].set_xlim(xmin=0)
        if show:
            fig.show()
            if block:
                plt.show(block=block)
                return fig
            plt.pause(0.5)
        return fig, axes

    def __getitem__(self, idx):
        """Returns the data sample (a dictionary) for a specific (0-based) index."""
        if isinstance(idx, slice):
            return self._getitems(idx)
        assert idx < len(self.samples), "sample index is out-of-range"
        if idx < 0:
            idx = len(self.samples) + idx
        sample = self.samples[idx]
        crop, mask = self._process_crop(sample)
        assert crop.shape[2] == 1, "unexpected lidar raster band count"
        crop = crop[:, :, 0]
        dmap = cv.distanceTransform(np.where(mask, np.uint8(0), np.uint8(255)), cv.DIST_L2, cv.DIST_MASK_PRECISE)
        dmap_inv = cv.distanceTransform(np.where(mask, np.uint8(255), np.uint8(0)), cv.DIST_L2, cv.DIST_MASK_PRECISE)
        dmap = np.where(mask, -dmap_inv, dmap)
        dmap *= self.px_size[0]  # constructor enforces same px width/height size
        # dc mask is crop.mask, but most likely lost below
        if self.display_debug:
            crop = cv.normalize(crop, dst=crop, alpha=0, beta=255, norm_type=cv.NORM_MINMAX,
                                dtype=cv.CV_8U, mask=(~crop.mask).astype(np.uint8))
            mask = cv.normalize(mask, dst=mask, alpha=0, beta=255, norm_type=cv.NORM_MINMAX, dtype=cv.CV_8U)
            dmap = cv.normalize(dmap, dst=dmap, alpha=0, beta=255, norm_type=cv.NORM_MINMAX, dtype=cv.CV_8U)
        sample = {
            "input": np.stack([crop, mask, dmap], axis=-1),
            # note: bboxes are automatically added in the "cropper" preprocessing function
            "hydro": mask,
            **sample
        }
        if self.transforms:
            sample = self.transforms(sample)
        return sample


class TB15D104TileDataset(TileDataset):
    """OGC Testbed-15 dataset parser for D104 (lake/river) segmentation task."""

    def __init__(self, raster_path, vector_path, tile_size, tile_overlap,
                 px_size=None, allow_outlying_vectors=True, clip_outlying_vectors=True,
                 lake_area_min=0.0, lake_area_max=float("inf"), master_roi=None, srs_target="3857",
                 force_parse=False, reproj_rasters=False, reproj_all_cpus=True, display_debug=False,
                 keep_rasters_open=True, parallel=False, transforms=None):
        assert px_size is None or isinstance(px_size, (float, int)), "pixel size (resolution) must be float/int"
        px_size = (1.0, 1.0) if px_size is None else (float(px_size), float(px_size))
        # note: we wrap partial static functions for caching to see when internal parameters are changing
        cleaner = functools.partial(TB15D104Dataset.lake_cleaner, area_min=lake_area_min, area_max=lake_area_max,
                                    lake_river_max_dist=float("inf"), parallel=parallel)
        super().__init__(raster_path=raster_path, vector_path=vector_path, tile_size=tile_size,
                         tile_overlap=tile_overlap, skip_empty_tiles=True, skip_nodata_tiles=False,
                         px_size=px_size, allow_outlying_vectors=allow_outlying_vectors,
                         clip_outlying_vectors=clip_outlying_vectors, vector_area_min=lake_area_min,
                         vector_area_max=lake_area_max, vector_target_prop=None, master_roi=master_roi,
                         srs_target=srs_target, raster_key="lidar", mask_key="hydro", cleaner=cleaner,
                         force_parse=force_parse, reproj_rasters=reproj_rasters, reproj_all_cpus=reproj_all_cpus,
                         keep_rasters_open=keep_rasters_open, transforms=transforms)
        meta_keys = self.task.meta_keys
        self.task = thelper.tasks.Detection(class_names={"background": TB15D104.BACKGROUND_ID, "lake": TB15D104.LAKE_ID},
                                            input_key="input", bboxes_key="bboxes",
                                            meta_keys=meta_keys, background=0, color_map={"lake": [255, 0, 0]})
        self.display_debug = display_debug
        self.parallel = parallel

    def __getitem__(self, idx):
        """Returns the data sample (a dictionary) for a specific (0-based) index."""
        if isinstance(idx, slice):
            return self._getitems(idx)
        import thelper.data.geo as geo
        assert idx < len(self.samples), "sample index is out-of-range"
        if idx < 0:
            idx = len(self.samples) + idx
        sample = self.samples[idx]
        crop, mask = self._process_crop(sample)
        assert crop.shape[2] == 1, "unexpected lidar raster band count"
        crop = crop[:, :, 0]
        dmap = cv.distanceTransform(np.where(mask, np.uint8(0), np.uint8(255)), cv.DIST_L2, cv.DIST_MASK_PRECISE)
        dmap_inv = cv.distanceTransform(np.where(mask, np.uint8(255), np.uint8(0)), cv.DIST_L2, cv.DIST_MASK_PRECISE)
        dmap = np.where(mask, -dmap_inv, dmap)
        dmap *= self.px_size[0]  # constructor enforces same px width/height size
        # dc mask is crop.mask, but most likely lost below
        if self.display_debug:
            crop = cv.normalize(crop, dst=crop, alpha=0, beta=255, norm_type=cv.NORM_MINMAX,
                                dtype=cv.CV_8U, mask=(~crop.mask).astype(np.uint8))
            mask = cv.normalize(mask, dst=mask, alpha=0, beta=255, norm_type=cv.NORM_MINMAX, dtype=cv.CV_8U)
            dmap = cv.normalize(dmap, dst=dmap, alpha=0, beta=255, norm_type=cv.NORM_MINMAX, dtype=cv.CV_8U)
        # note1: contrarily to the 'smart' dataset parser above, we need to add bboxes here
        # note2: bboxes should only be added over areas that are not 'nodata'
        bboxes = []
        for f in sample["features"]:
            if f["properties"]["TYPECE"] == TB15D104.TYPECE_RIVER:
                continue
            # only lakes can generate bboxes; make sure to clip them to the crop bounds
            inters = f["geometry"].intersection(sample["roi"])
            clip = f["clipped"] or not inters.equals(f["geometry"])
            if clip:
                assert inters.geom_type in ["Polygon", "MultiPolygon"], "unexpected inters type"
                corners = []
                if inters.geom_type == "Polygon":
                    bounds = inters.bounds
                    corners.append((bounds[0:2], bounds[2:4]))
                elif inters.geom_type == "MultiPolygon":
                    for poly in inters:
                        bounds = poly.bounds
                        corners.append((bounds[0:2], bounds[2:4]))
            else:
                corners = [(f["tl"], f["br"])]
            for c in corners:
                feat_tl_px = geo.utils.get_pxcoord(sample["geotransform"], *c[0])
                feat_br_px = geo.utils.get_pxcoord(sample["geotransform"], *c[1])
                bbox = [max(0, feat_tl_px[0]), max(0, feat_tl_px[1]),
                        min(sample["crop_width"] - 1, feat_br_px[0]),
                        min(sample["crop_height"] - 1, feat_br_px[1])]
                if bbox[2] - bbox[0] <= 1 or bbox[3] - bbox[1] <= 1:
                    continue  # skip all bboxes smaller than 1 px (c'mon...)
                # note: lake class id is 1 by definition
                bboxes.append(thelper.tasks.detect.BoundingBox(TB15D104.LAKE_ID,
                                                               bbox=bbox,
                                                               include_margin=False,
                                                               truncated=clip,
                                                               image_id=sample["id"],
                                                               task=self.task))
        sample = {
            "input": np.stack([crop, mask, dmap], axis=-1),
            "hydro": mask,
            "bboxes": bboxes,
            **sample
        }
        if self.transforms:
            sample = self.transforms(sample)
        return sample


[docs]class TB15D104DetectLogger(DetectLogger):

[docs]    def __init__(self, conf_threshold=0.5):
        super().__init__(conf_threshold=conf_threshold, target_name="lake",
                         log_keys=["id", "geotransform"], format="geojson")

[docs]    def report_geojson(self):
        # here, we only care about reporting predictions, we ignore the (possibly missing) gt bboxes
        import shapely
        import geojson
        import thelper.data.geo as geo
        batch_size = len(self.bbox[0])
        bbox_lists = [bboxes for batch in self.bbox for bboxes in batch]  # one list per crop
        if batch_size > 1:
            geotransforms = [np.asarray(geot) for batch in self.meta["geotransform"] for geot in batch]
        else:
            # special check to avoid issues when unpacking (1,6)-dim tensor
            geotransforms = [np.asarray(geot) for geot in self.meta["geotransform"]]
        crop_ids = [id for batch in self.meta["id"] for id in batch]
        output_features = []
        for bboxes, geotransform, id in zip(bbox_lists, geotransforms, crop_ids):
            for bbox in bboxes:
                if geotransform.shape[0] == 1:
                    geotransform = geotransform[0]
                bbox_tl = geo.utils.get_geocoord(geotransform, *bbox.top_left)
                bbox_br = geo.utils.get_geocoord(geotransform, *bbox.bottom_right)
                bbox_geom = shapely.geometry.Polygon([bbox_tl, (bbox_br[0], bbox_tl[1]),
                                                      bbox_br, (bbox_tl[0], bbox_br[1])])
                output_features.append(geojson.Feature(geometry=bbox_geom, properties={
                    "image_id": id, "confidence": bbox.confidence}))
        return geojson.dumps(geojson.FeatureCollection(output_features), indent=2)


[docs]def postproc_features(input_file, bboxes_srs, orig_geoms_path, output_file,
                      final_srs=None, write_shapefile_copy=False):
    """Post-processes bounding box detections produced during an evaluation session into a GeoJSON file."""
    import ogr
    import osr
    import json
    import geojson
    import shapely
    import thelper.data.geo as geo
    logger.debug("importing bboxes SRS...")
    assert isinstance(bboxes_srs, (str, int, osr.SpatialReference)), \
        "target EPSG SRS must be given as int/str"
    if isinstance(bboxes_srs, (str, int)):
        if isinstance(bboxes_srs, str):
            bboxes_srs = int(bboxes_srs.replace("EPSG:", ""))
        bboxes_srs_obj = osr.SpatialReference()
        bboxes_srs_obj.ImportFromEPSG(bboxes_srs)
        bboxes_srs = bboxes_srs_obj
    logger.debug("importing lake bboxes geojson...")
    with open(input_file) as bboxes_fd:
        bboxes_geoms = geo.utils.parse_geojson(json.load(bboxes_fd))
    logger.debug("importing hydro features geojson...")
    with open(orig_geoms_path) as hydro_fd:
        hydro_geoms = geo.utils.parse_geojson(json.load(hydro_fd), srs_target=bboxes_srs)
    logger.debug("computing global cascade of lake bboxes...")
    detect_roi = shapely.ops.cascaded_union([bbox["geometry"] for bbox in bboxes_geoms])
    output_features = []

    def append_poly(feat, props, srs_transform=None):
        if feat.is_empty:
            return
        elif feat.type == "Polygon":
            if srs_transform is not None:
                ogr_geometry = ogr.CreateGeometryFromWkb(feat.wkb)
                ogr_geometry.Transform(srs_transform)
                feat = shapely.wkt.loads(ogr_geometry.ExportToWkt())
            output_features.append((geojson.Feature(geometry=feat, properties=props), feat))
        elif feat.type == "MultiPolygon" or feat.type == "GeometryCollection":
            for f in feat:
                append_poly(f, props)

    srs_transform = None
    if final_srs is not None:
        import osr
        logger.debug("importing output SRS...")
        assert isinstance(final_srs, (str, int, osr.SpatialReference)), \
            "target EPSG SRS must be given as int/str"
        if isinstance(final_srs, (str, int)):
            if isinstance(final_srs, str):
                final_srs = int(final_srs.replace("EPSG:", ""))
            final_srs_obj = osr.SpatialReference()
            final_srs_obj.ImportFromEPSG(final_srs)
            final_srs = final_srs_obj
        if not bboxes_srs.IsSame(final_srs):
            srs_transform = osr.CoordinateTransformation(bboxes_srs, final_srs)
    logger.debug("running hydro feature and lake bboxes intersection loop...")
    for hydro_feat in tqdm.tqdm(hydro_geoms, desc="computing bbox intersections"):
        # find intersection and append to list of 'lakes'
        intersection = hydro_feat["geometry"].intersection(detect_roi)
        hydro_feat["properties"]["TYPECE"] = TB15D104.TYPECE_LAKE
        append_poly(intersection, copy.deepcopy(hydro_feat["properties"]), srs_transform)
        if not intersection.is_empty:
            # subtract bbox region from feature if intersection found (leftovers at end will be 'rivers')
            hydro_feat["geometry"] = hydro_feat["geometry"].difference(detect_roi)
    logger.debug("running river cleanup loop...")
    for hydro_feat in tqdm.tqdm(hydro_geoms, desc="appending leftover geometries as rivers"):
        if not hydro_feat["geometry"].is_empty:
            # remark: hydro features outside the original ROI will appear as rivers despite never being processed
            hydro_feat["properties"]["TYPECE"] = TB15D104.TYPECE_RIVER
            append_poly(hydro_feat["geometry"], copy.deepcopy(hydro_feat["properties"]), srs_transform)
    logger.debug("exporting final geojson...")
    with open(output_file, "w") as fd:
        out_srs = final_srs if final_srs is not None else bboxes_srs
        fd.write(geo.utils.export_geojson_with_crs([o[0] for o in output_features], srs_target=out_srs))
    if write_shapefile_copy:
        logger.debug("exporting final shapefile...")
        driver = ogr.GetDriverByName("ESRI Shapefile")
        data_source = driver.CreateDataSource(output_file + ".shp")
        layer = data_source.CreateLayer("lakes", final_srs if final_srs is not None else bboxes_srs, ogr.wkbPolygon)
        for feat_tuple in output_features:
            feature = ogr.Feature(layer.GetLayerDefn())
            point = ogr.CreateGeometryFromWkt(feat_tuple[1].wkt)
            feature.SetGeometry(point)
            layer.CreateFeature(feature)
            feature = None  # noqa # flush
        data_source = None  # noqa # flush
Navigation

Source code for thelper.data.geo.ogc

Navigation