Source code for cvpods.data.detection_utils

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by BaseDetection, Inc. and its affiliates.

import logging

import cv2
import numpy as np
from PIL import Image
import pycocotools.mask as mask_util

import torch

from cvpods.structures import (
    BitMasks,
    Boxes,
    BoxMode,
    Instances,
    Keypoints,
    PolygonMasks,
    RotatedBoxes,
    polygons_to_bitmask
)
from cvpods.utils import PathManager

from . import transforms as T


[docs]class SizeMismatchError(ValueError): """ When loaded image has difference width/height compared with annotation. """
# https://en.wikipedia.org/wiki/YUV#SDTV_with_BT.601 _M_RGB2YUV = [[0.299, 0.587, 0.114], [-0.14713, -0.28886, 0.436], [0.615, -0.51499, -0.10001]] _M_YUV2RGB = [[1.0, 0.0, 1.13983], [1.0, -0.39465, -0.58060], [1.0, 2.03211, 0.0]] # https://www.exiv2.org/tags.html _EXIF_ORIENT = 274 # exif 'Orientation' tag
[docs]def convert_PIL_to_numpy(image, format): """ Convert PIL image to numpy array of target format. Args: image (PIL.Image): a PIL image format (str): the format of output image Returns: (np.ndarray): also see `read_image` """ if format is not None: # PIL only supports RGB, so convert to RGB and flip channels over below conversion_format = format if format in ["BGR", "YUV-BT.601"]: conversion_format = "RGB" image = image.convert(conversion_format) image = np.asarray(image) # PIL squeezes out the channel dimension for "L", so make it HWC if format == "L": image = np.expand_dims(image, -1) # handle formats not supported by PIL elif format == "BGR": # flip channels if needed image = image[:, :, ::-1] elif format == "YUV-BT.601": image = image / 255.0 image = np.dot(image, np.array(_M_RGB2YUV).T) return image
[docs]def convert_image_to_rgb(image, format): """ Convert an image from given format to RGB. Args: image (np.ndarray or Tensor): an HWC image format (str): the format of input image, also see `read_image` Returns: (np.ndarray): (H,W,3) RGB image in 0-255 range, can be either float or uint8 """ if isinstance(image, torch.Tensor): image = image.cpu().numpy() if format == "BGR": image = image[:, :, [2, 1, 0]] elif format == "YUV-BT.601": image = np.dot(image, np.array(_M_YUV2RGB).T) image = image * 255.0 else: if format == "L": image = image[:, :, 0] image = image.astype(np.uint8) image = np.asarray(Image.fromarray(image, mode=format).convert("RGB")) return image
def _apply_exif_orientation(image): """ Applies the exif orientation correctly. This code exists per the bug: https://github.com/python-pillow/Pillow/issues/3973 with the function `ImageOps.exif_transpose`. The Pillow source raises errors with various methods, especially `tobytes` Function based on: https://github.com/wkentaro/labelme/blob/v4.5.4/labelme/utils/image.py#L59 https://github.com/python-pillow/Pillow/blob/7.1.2/src/PIL/ImageOps.py#L527 Args: image (PIL.Image): a PIL image Returns: (PIL.Image): the PIL image with exif orientation applied, if applicable """ if not hasattr(image, "getexif"): return image try: exif = image.getexif() except Exception: # https://github.com/facebookresearch/detectron2/issues/1885 exif = None if exif is None: return image orientation = exif.get(_EXIF_ORIENT) method = { 2: Image.FLIP_LEFT_RIGHT, 3: Image.ROTATE_180, 4: Image.FLIP_TOP_BOTTOM, 5: Image.TRANSPOSE, 6: Image.ROTATE_270, 7: Image.TRANSVERSE, 8: Image.ROTATE_90, }.get(orientation) if method is not None: return image.transpose(method) return image
[docs]def read_image(file_name, format=None): """ Read an image into the given format. Will apply rotation and flipping if the image has such exif information. Args: file_name (str): image file path format (str): one of the supported image modes in PIL, or "BGR" or "YUV-BT.601". Returns: image (np.ndarray): an HWC image in the given format, which is 0-255, uint8 for supported image modes in PIL or "BGR"; float (0-1 for Y) for YUV-BT.601. """ with PathManager.open(file_name, "rb") as f: image = Image.open(f) if format == ("RGB"): image = image.convert("RGB") return np.array(image) else: # work around this bug: https://github.com/python-pillow/Pillow/issues/3973 image = _apply_exif_orientation(image) return convert_PIL_to_numpy(image, format)
[docs]def check_image_size(dataset_dict, image): """ Raise an error if the image does not match the size specified in the dict. """ if "width" in dataset_dict or "height" in dataset_dict: image_wh = (image.shape[1], image.shape[0]) expected_wh = (dataset_dict["width"], dataset_dict["height"]) if not image_wh == expected_wh: raise SizeMismatchError( "Mismatched image shape{}, got {}, expect {}.".format( " for image " + dataset_dict["file_name"] if "file_name" in dataset_dict else "", image_wh, expected_wh, ) + " Please check the width/height in your annotation." ) # To ensure bbox always remap to original image size if "width" not in dataset_dict: dataset_dict["width"] = image.shape[1] if "height" not in dataset_dict: dataset_dict["height"] = image.shape[0]
[docs]def transform_proposals(dataset_dict, image_shape, transforms, min_box_side_len, proposal_topk): """ Apply transformations to the proposals in dataset_dict, if any. Args: dataset_dict (dict): a dict read from the dataset, possibly contains fields "proposal_boxes", "proposal_objectness_logits", "proposal_bbox_mode" image_shape (tuple): height, width transforms (TransformList): min_box_side_len (int): keep proposals with at least this size proposal_topk (int): only keep top-K scoring proposals The input dict is modified in-place, with abovementioned keys removed. A new key "proposals" will be added. Its value is an `Instances` object which contains the transformed proposals in its field "proposal_boxes" and "objectness_logits". """ if "proposal_boxes" in dataset_dict: # Transform proposal boxes boxes = transforms.apply_box( BoxMode.convert( dataset_dict.pop("proposal_boxes"), dataset_dict.pop("proposal_bbox_mode"), BoxMode.XYXY_ABS, )) boxes = Boxes(boxes) objectness_logits = torch.as_tensor( dataset_dict.pop("proposal_objectness_logits").astype("float32")) boxes.clip(image_shape) keep = boxes.nonempty(threshold=min_box_side_len) boxes = boxes[keep] objectness_logits = objectness_logits[keep] proposals = Instances(image_shape) proposals.proposal_boxes = boxes[:proposal_topk] proposals.objectness_logits = objectness_logits[:proposal_topk] dataset_dict["proposals"] = proposals
[docs]def annotations_to_instances(annos, image_size, mask_format="polygon"): """ Create an :class:`Instances` object used by the models, from instance annotations in the dataset dict. Args: annos (list[dict]): a list of instance annotations in one image, each element for one instance. image_size (tuple): height, width Returns: Instances: It will contain fields "gt_boxes", "gt_classes", "gt_masks", "gt_keypoints", if they can be obtained from `annos`. This is the format that builtin models expect. """ boxes = [ BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos ] target = Instances(image_size) boxes = target.gt_boxes = Boxes(boxes) boxes.clip(image_size) classes = [obj["category_id"] for obj in annos] classes = torch.tensor(classes, dtype=torch.int64) target.gt_classes = classes if len(annos) and "segmentation" in annos[0]: segms = [obj["segmentation"] for obj in annos] if mask_format == "polygon": masks = PolygonMasks(segms) else: assert mask_format == "bitmask", mask_format masks = [] for segm in segms: if isinstance(segm, list): # polygon masks.append(polygons_to_bitmask(segm, *image_size)) elif isinstance(segm, dict): # COCO RLE masks.append(mask_util.decode(segm)) elif isinstance(segm, np.ndarray): assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( segm.ndim) # mask array masks.append(segm) else: raise ValueError( "Cannot convert segmentation of type '{}' to BitMasks!" "Supported types are: polygons as list[list[float] or ndarray]," " COCO-style RLE as a dict, or a full-image segmentation mask " "as a 2D ndarray.".format(type(segm))) # torch.from_numpy does not support array with negative stride. masks = BitMasks( torch.stack([ torch.from_numpy(np.ascontiguousarray(x)) for x in masks ])) target.gt_masks = masks if len(annos) and "keypoints" in annos[0]: kpts = np.array([obj.get("keypoints", []) for obj in annos]) # (N, K, 3) # Set all out-of-boundary points to "unlabeled" kpts_xy = kpts[:, :, :2] inside = (kpts_xy >= np.array([0, 0])) & (kpts_xy <= np.array(image_size[::-1])) inside = inside.all(axis=2) kpts[:, :, :2] = kpts_xy kpts[:, :, 2][~inside] = 0 target.gt_keypoints = Keypoints(kpts) return target
[docs]def annotations_to_instances_rotated(annos, image_size): """ Create an :class:`Instances` object used by the models, from instance annotations in the dataset dict. Compared to `annotations_to_instances`, this function is for rotated boxes only Args: annos (list[dict]): a list of instance annotations in one image, each element for one instance. image_size (tuple): height, width Returns: Instances: Containing fields "gt_boxes", "gt_classes", if they can be obtained from `annos`. This is the format that builtin models expect. """ boxes = [obj["bbox"] for obj in annos] target = Instances(image_size) boxes = target.gt_boxes = RotatedBoxes(boxes) boxes.clip(image_size) classes = [obj["category_id"] for obj in annos] classes = torch.tensor(classes, dtype=torch.int64) target.gt_classes = classes return target
[docs]def filter_empty_instances(instances, by_box=True, by_mask=True): """ Filter out empty instances in an `Instances` object. Args: instances (Instances): by_box (bool): whether to filter out instances with empty boxes by_mask (bool): whether to filter out instances with empty masks Returns: Instances: the filtered instances. """ assert by_box or by_mask r = [] if by_box: r.append(instances.gt_boxes.nonempty()) if instances.has("gt_masks") and by_mask: r.append(instances.gt_masks.nonempty()) # TODO: can also filter visible keypoints if not r: return instances m = r[0] for x in r[1:]: m = m & x return instances[m]
[docs]def create_keypoint_hflip_indices(dataset_names, meta): """ Args: dataset_names (list[str]): list of dataset names Returns: ndarray[int]: a vector of size=#keypoints, storing the horizontally-flipped keypoint indices. """ check_metadata_consistency("keypoint_names", dataset_names, meta) check_metadata_consistency("keypoint_flip_map", dataset_names, meta) names = meta["keypoint_names"] # TODO flip -> hflip flip_map = dict(meta["keypoint_flip_map"]) flip_map.update({v: k for k, v in flip_map.items()}) flipped_names = [i if i not in flip_map else flip_map[i] for i in names] flip_indices = [names.index(i) for i in flipped_names] return np.asarray(flip_indices)
[docs]def gen_crop_transform_with_instance(crop_size, image_size, instance): """ Generate a CropTransform so that the cropping region contains the center of the given instance. Args: crop_size (tuple): h, w in pixels image_size (tuple): h, w instance (dict): an annotation dict of one instance, in cvpods's dataset format. """ crop_size = np.asarray(crop_size, dtype=np.int32) bbox = BoxMode.convert(instance["bbox"], instance["bbox_mode"], BoxMode.XYXY_ABS) center_yx = (bbox[1] + bbox[3]) * 0.5, (bbox[0] + bbox[2]) * 0.5 assert (image_size[0] >= center_yx[0] and image_size[1] >= center_yx[1] ), "The annotation bounding box is outside of the image!" assert (image_size[0] >= crop_size[0] and image_size[1] >= crop_size[1] ), "Crop size is larger than image size!" min_yx = np.maximum(np.ceil(center_yx).astype(np.int32) - crop_size, 0) max_yx = np.maximum(np.asarray(image_size, dtype=np.int32) - crop_size, 0) max_yx = np.minimum(max_yx, np.floor(center_yx).astype(np.int32)) y0 = np.random.randint(min_yx[0], max_yx[0] + 1) x0 = np.random.randint(min_yx[1], max_yx[1] + 1) return T.CropTransform(x0, y0, crop_size[1], crop_size[0])
[docs]def check_metadata_consistency(key, dataset_names, meta): """ Check that the datasets have consistent metadata. Args: key (str): a metadata key dataset_names (list[str]): a list of dataset names Raises: AttributeError: if the key does not exist in the metadata ValueError: if the given datasets do not have the same metadata values defined by key """ if len(dataset_names) == 0: return logger = logging.getLogger(__name__) entries_per_dataset = [meta.get(key) for d in dataset_names] for idx, entry in enumerate(entries_per_dataset): if entry != entries_per_dataset[0]: logger.error("Metadata '{}' for dataset '{}' is '{}'".format( key, dataset_names[idx], str(entry))) logger.error("Metadata '{}' for dataset '{}' is '{}'".format( key, dataset_names[0], str(entries_per_dataset[0]))) raise ValueError( "Datasets have different metadata '{}'!".format(key))
[docs]def check_sample_valid(args): if args["sample_style"] == "range": assert ( len(args["short_edge_length"]) == 2 ), f"more than 2 ({len(args['short_edge_length'])}) " \ "short_edge_length(s) are provided for ranges"
[docs]def imdecode(data, *, require_chl3=True, require_alpha=False): """decode images in common formats (jpg, png, etc.) :param data: encoded image data :type data: :class:`bytes` :param require_chl3: whether to convert gray image to 3-channel BGR image :param require_alpha: whether to add alpha channel to BGR image :rtype: :class:`numpy.ndarray` """ img = cv2.imdecode(np.fromstring(data, np.uint8), cv2.IMREAD_UNCHANGED) if img is None and len(data) >= 3 and data[:3] == b'GIF': # cv2 doesn't support GIF, try PIL img = _gif_decode(data) assert img is not None, 'failed to decode' if img.ndim == 2 and require_chl3: img = img.reshape(img.shape + (1, )) if img.shape[2] == 1 and require_chl3: img = np.tile(img, (1, 1, 3)) if img.ndim == 3 and img.shape[2] == 3 and require_alpha: assert img.dtype == np.uint8 img = np.concatenate([img, np.ones_like(img[:, :, :1]) * 255], axis=2) return img
def _gif_decode(data): try: import io from PIL import Image im = Image.open(io.BytesIO(data)) im = im.convert('RGB') return cv2.cvtColor(np.array(im), cv2.COLOR_RGB2BGR) except Exception: return