Source code for cvpods.data.datasets.voc

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by BaseDetection, Inc. and its affiliates.

import copy
import logging
import os
import os.path as osp
import xml.etree.ElementTree as ET

import numpy as np

import torch

from cvpods.structures import BoxMode
from cvpods.utils import PathManager

from ..base_dataset import BaseDataset
from ..detection_utils import (
    annotations_to_instances,
    check_image_size,
    create_keypoint_hflip_indices,
    filter_empty_instances,
    read_image
)
from ..registry import DATASETS
from .paths_route import _PREDEFINED_SPLITS_VOC

"""
This file contains a dataset class that parses Pascal VOC-format (XML) annotations into dicts in "cvpods format".
"""

logger = logging.getLogger(__name__)


@DATASETS.register()
class VOCDataset(BaseDataset):
    """Pascal VOC detection dataset that yields samples in cvpods format.

    Annotations are parsed once at construction time from the XML files
    under ``<image_root>/Annotations``; images are read lazily in
    ``__getitem__``.
    """

    def __init__(self, cfg, dataset_name, transforms=None, is_train=True):
        """
        Args:
            cfg: config node. The fields read here are ``INPUT.FORMAT``,
                ``INPUT.MASK_FORMAT``, ``MODEL.MASK_ON``, ``MODEL.KEYPOINT_ON``,
                ``MODEL.LOAD_PROPOSALS``, ``MODEL.ROI_KEYPOINT_HEAD``,
                ``DATALOADER.FILTER_EMPTY_ANNOTATIONS``,
                ``DATASETS.PROPOSAL_FILES_TRAIN``, ``DATASETS.TRAIN`` and
                ``TEST.WITH_GT``.
            dataset_name (str): forwarded to ``BaseDataset``; ``self.name`` is
                then used as the key into ``_PREDEFINED_SPLITS_VOC["voc"]``.
            transforms (list, optional): forwarded to ``BaseDataset``.
                ``None`` (the default) is replaced by a new empty list.
            is_train (bool): when True, annotations are filtered and the
                group flag is computed.
        """
        # A fresh list per instance: a literal `[]` default would be shared
        # by every dataset constructed without an explicit `transforms`.
        if transforms is None:
            transforms = []
        super().__init__(cfg, dataset_name, transforms, is_train)

        # `self.name` and `self.data_root` are presumably set by
        # BaseDataset.__init__ -- they are not assigned in this class.
        image_root, split = _PREDEFINED_SPLITS_VOC["voc"][self.name]
        # Roots that already carry a scheme ("xxx://...") are used verbatim.
        self.image_root = osp.join(self.data_root, image_root) \
            if "://" not in image_root else image_root
        self.split = split

        self.meta = self._get_metadata()
        self.dataset_dicts = self._load_annotations()

        # fmt: off
        self.data_format     = cfg.INPUT.FORMAT
        self.mask_on         = cfg.MODEL.MASK_ON
        self.mask_format     = cfg.INPUT.MASK_FORMAT
        self.filter_empty    = cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS
        self.keypoint_on     = cfg.MODEL.KEYPOINT_ON
        self.load_proposals  = cfg.MODEL.LOAD_PROPOSALS
        self.proposal_files  = cfg.DATASETS.PROPOSAL_FILES_TRAIN
        # fmt: on

        if is_train:
            self.dataset_dicts = self._filter_annotations(
                filter_empty=self.filter_empty,
                min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
                if self.keypoint_on else 0,
                proposal_files=self.proposal_files if self.load_proposals else None,
            )
            self._set_group_flag()

        self.eval_with_gt = cfg.TEST.get("WITH_GT", False)

        if self.keypoint_on:
            # Flip only makes sense in training
            self.keypoint_hflip_indices = create_keypoint_hflip_indices(
                cfg.DATASETS.TRAIN)
        else:
            self.keypoint_hflip_indices = None

    def __getitem__(self, index):
        """Load data, apply transforms, convert to Instances.

        Args:
            index (int): position in ``self.dataset_dicts``.

        Returns:
            dict: a deep copy of the stored dataset dict with the
            "annotations" entry removed, an "image" entry holding a
            ``torch.Tensor`` laid out as (C, H, W), and -- when the stored
            dict had annotations -- an "instances" entry.
        """
        # Deep copy so that popping/transforming never touches the cached dict.
        dataset_dict = copy.deepcopy(self.dataset_dicts[index])

        # read image
        image = read_image(dataset_dict["file_name"], format=self.data_format)
        check_image_size(dataset_dict, image)

        if "annotations" in dataset_dict:
            annotations = dataset_dict.pop("annotations")
            # Crowd regions are dropped before the transforms are applied.
            annotations = [
                ann for ann in annotations if ann.get("iscrowd", 0) == 0]
        else:
            annotations = None

        # apply transform
        image, annotations = self._apply_transforms(image, annotations)

        if annotations is not None:
            image_shape = image.shape[:2]  # h, w
            instances = annotations_to_instances(
                annotations, image_shape, mask_format=self.mask_format
            )
            dataset_dict["instances"] = filter_empty_instances(instances)

        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of
        # pickle & mp.Queue. Therefore it's important to use torch.Tensor.
        # h, w, c -> c, h, w
        dataset_dict["image"] = torch.as_tensor(
            np.ascontiguousarray(image.transpose(2, 0, 1)))

        return dataset_dict

    def __len__(self):
        """Return the number of images currently held in ``self.dataset_dicts``."""
        return len(self.dataset_dicts)

    def _get_metadata(self):
        """Build the metadata dict describing this dataset.

        Returns:
            dict: "thing_classes", "evaluator_type", "dirname", "split"
            and "year".
        """
        meta = {
            # Copy of the module-level list, so category ids produced by
            # `_load_annotations` and these names can never drift apart.
            "thing_classes": list(CLASS_NAMES),
            "evaluator_type": _PREDEFINED_SPLITS_VOC["evaluator_type"]["voc"],
            "dirname": self.image_root,
            "split": self.split,
            # NOTE(review): the year is fixed to 2007 regardless of the dataset
            # name -- confirm this is intended for non-2007 splits.
            "year": 2007,
        }
        return meta

    def _load_annotations(self):
        """
        Load Pascal VOC detection annotations to cvpods format.

        The image ids are read from ``ImageSets/Main/<split>.txt`` under
        ``self.image_root``; for every id the matching
        ``Annotations/<id>.xml`` is parsed.

        Returns:
            list[dict]: one dict per image with the keys "file_name",
            "image_id", "height", "width" and "annotations". Each annotation
            holds "category_id" (index into ``CLASS_NAMES``), "bbox" and
            "bbox_mode" (``BoxMode.XYXY_ABS``).

        Raises:
            ValueError: if an object's class name is not in ``CLASS_NAMES``.
        """
        dirname = self.image_root
        split = self.split

        with PathManager.open(os.path.join(dirname, "ImageSets", "Main", split + ".txt")) as f:
            # The `np.str` alias was removed from NumPy; the builtin `str` is
            # what it pointed to. `ndmin=1` keeps a split that lists a single
            # id iterable (loadtxt would otherwise return a 0-d array).
            fileids = np.loadtxt(f, dtype=str, ndmin=1)

        dicts = []
        for fileid in fileids:
            anno_file = os.path.join(dirname, "Annotations", fileid + ".xml")
            jpeg_file = os.path.join(dirname, "JPEGImages", fileid + ".jpg")

            # Open through PathManager, like the split file above, so that
            # roots which are not plain local paths keep working.
            with PathManager.open(anno_file) as f:
                tree = ET.parse(f)

            r = {
                "file_name": jpeg_file,
                "image_id": fileid,
                "height": int(tree.findall("./size/height")[0].text),
                "width": int(tree.findall("./size/width")[0].text),
            }
            instances = []

            for obj in tree.findall("object"):
                cls = obj.find("name").text
                # We include "difficult" samples in training.
                # Based on limited experiments, they don't hurt accuracy.
                bbox = obj.find("bndbox")
                bbox = [float(bbox.find(x).text) for x in ["xmin", "ymin", "xmax", "ymax"]]
                # Original annotations are integers in the range [1, W or H]
                # Assuming they mean 1-based pixel indices (inclusive),
                # a box with annotation (xmin=1, xmax=W) covers the whole image.
                # In coordinate space this is represented by (xmin=0, xmax=W)
                bbox[0] -= 1.0
                bbox[1] -= 1.0
                instances.append({
                    "category_id": CLASS_NAMES.index(cls),
                    "bbox": bbox,
                    "bbox_mode": BoxMode.XYXY_ABS
                })
            r["annotations"] = instances
            dicts.append(r)

        return dicts
# The 20 Pascal VOC object categories. The position of a name in this list is
# the "category_id" assigned to annotations of that class.
# fmt: off
CLASS_NAMES = [
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
]
# fmt: on