# Source code for multimodal.features.bottomup

"""
Vision features for muldimodal tasks like Image Captioning, VQA or image retrieval
"""
# std
import os
import zipfile
import csv
import base64
import pickle
import sys

# packages
from tqdm import tqdm
import numpy as np
from pySmartDL import SmartDL
import tables as tb

# multimodal
from multimodal import DEFAULT_DATA_DIR

# The base64-encoded feature columns in the bottom-up TSV files are far larger
# than csv's default field size limit (~128 KiB), so lift the limit entirely.
csv.field_size_limit(sys.maxsize)
# Column order of the bottom-up-attention TSV files (the files have no header row).
FIELDNAMES = ["image_id", "image_w", "image_h", "num_boxes", "boxes", "features"]


class Metadata(tb.IsDescription):
    """PyTables row schema for per-image metadata.

    One row per COCO image. ``start_position`` is the row offset of this
    image's first detection in the flat ``features`` / ``boxes`` EArrays,
    so an image's data is the slice
    ``[start_position : start_position + num_boxes]``.
    """

    image_id = tb.Int32Col()        # COCO image id
    image_h = tb.Int32Col()         # image height in pixels
    image_w = tb.Int32Col()         # image width in pixels
    num_boxes = tb.Int32Col()       # number of detected objects for this image
    start_position = tb.Int32Col()  # row offset into the features/boxes arrays

class COCOBottomUpFeatures:
    """Bottom-up attention features for the COCO dataset.

    Args:
        features (str): one of [``trainval2014_36``, ``trainval2014``,
            ``test2014_36``, ``test2014``, ``test2015_36``, ``test2015``].
            Specifies the split, and the number of detected objects. _36 means
            36 objects are detected in every image, and otherwise, the number
            is based on a detection threshold, between 10 and 100 objects.
        dir_data (str): Directory where multimodal data will be downloaded. You
            need at least 60 GB for downloading and extracting the features.
    """

    name = "coco-bottom-up"

    urls = {
        "trainval2014_36": "https://imagecaption.blob.core.windows.net/imagecaption/trainval_36.zip",  # trainval2014
        "test2015_36": "https://imagecaption.blob.core.windows.net/imagecaption/test2015_36.zip",
        "test2014_36": "https://imagecaption.blob.core.windows.net/imagecaption/test2014_36.zip",
        "trainval2014": "https://imagecaption.blob.core.windows.net/imagecaption/trainval.zip",  # trainval2014
        "test2015": "https://imagecaption.blob.core.windows.net/imagecaption/test2015.zip",
        "test2014": "https://imagecaption.blob.core.windows.net/imagecaption/test2014.zip",
    }

    # Relative paths (inside the extracted archive) of the TSV file(s) for
    # each split. A split may be spread over several TSV files (trainval2014).
    tsv_paths = {
        "trainval2014_36": ["trainval_36/trainval_resnet101_faster_rcnn_genome_36.tsv"],
        "test2015_36": ["test2015_36/test2014_resnet101_faster_rcnn_genome_36.tsv"],
        "test2014_36": ["test2014_36/test2014_resnet101_faster_rcnn_genome_36.tsv"],
        "trainval2014": [
            "trainval/karpathy_train_resnet101_faster_rcnn_genome.tsv.0",
            "trainval/karpathy_train_resnet101_faster_rcnn_genome.tsv.1",
            "trainval/karpathy_test_resnet101_faster_rcnn_genome.tsv",
            "trainval/karpathy_val_resnet101_faster_rcnn_genome.tsv",
        ],
        "test2015": ["test2015/test2014_resnet101_faster_rcnn_genome.tsv"],
        "test2014": ["test2014/test2014_resnet101_faster_rcnn_genome.tsv"],
    }

    def __init__(self, features: str, dir_data: str = None):
        self.features_name = features
        self.db = None  # lazy opening of the pytables file (see _check_open)
        dir_data = dir_data or DEFAULT_DATA_DIR
        self.dir_data = os.path.join(dir_data, "features", self.name)
        os.makedirs(self.dir_data, exist_ok=True)
        self.featspath = os.path.join(self.dir_data, features + ".tables")
        # Download and convert the features on first use only.
        if not os.path.exists(self.featspath):
            path_download = self.download()
            print("Processing file")
            self._process_file(path_download, self.featspath)

    @classmethod
    def download_and_process(cls, name, dir_data):
        """Eagerly download and convert the features for split ``name``."""
        # BUG FIX: the original passed the nonexistent keyword ``feature=``,
        # which raised a TypeError (``__init__`` takes ``features``).
        cls(features=name, dir_data=dir_data)

    def download(self):
        """Download the zip archive for this split (if absent) and return its path."""
        url = self.urls[self.features_name]
        dl = SmartDL(url, self.dir_data)
        destination = dl.get_dest()
        if not os.path.exists(destination):
            dl.start()
        return destination

    def __getitem__(self, image_id: int):
        """
        Get the features.

        Args:
            image_id (str|int): The id of the image in COCO dataset.

        Returns:
            A dictionnary containing the following keys::

                {
                'image_id',
                'image_h': height
                'image_w': width
                'num_boxes': number of objects
                'boxes': Numpy array of shape (N, 4) containing bounding box coordinates
                'features': Numpy array of shape (N, 2048) containing features.
                }
        """
        self._check_open()
        # NOTE(review): read_where returns the first matching row; assumes
        # image ids are unique in the metadata table.
        data = self.db.metadata.read_where(f"image_id=={image_id}")[0]
        start_position = data["start_position"]
        num_boxes = data["num_boxes"]
        data = {
            field: data[field]
            for field in ["image_id", "image_w", "image_h", "num_boxes"]
        }
        data["features"] = self.db.features[start_position : start_position + num_boxes]
        data["boxes"] = self.db.boxes[start_position : start_position + num_boxes]
        return data

    def _check_open(self):
        # Open the pytables file on first access (kept open afterwards).
        if self.db is None:
            self.db = tb.open_file(self.featspath).root

    def keys(self):
        """
        Returns:
            list: List of all keys
        """
        self._check_open()
        return list(self.db.metadata.read(field="image_id"))

    def _process_file(self, path_infile: str, outpath: str):
        """Extract the downloaded zip and convert its TSV file(s) into a
        single PyTables file at ``outpath`` (metadata table + features/boxes
        EArrays)."""
        directory = os.path.dirname(path_infile)
        tsv_paths = self.tsv_paths[self.features_name]
        if isinstance(tsv_paths, str):
            tsv_paths = [tsv_paths]
        tsv_paths = [os.path.join(directory, path) for path in tsv_paths]
        last_tsv = tsv_paths[-1]
        try:
            if not os.path.exists(last_tsv):
                print(f"Unzipping file at {path_infile}")
                with zipfile.ZipFile(path_infile, "r") as zip_ref:
                    zip_ref.extractall(directory)
            names = set()
            num_duplicates = 0
            print(f"Processing files {tsv_paths}")
        except Exception:
            # BUG FIX: the original removed
            # os.path.join(self.dir_data, self.features_name), a path that
            # never exists, so the cleanup itself raised and masked the real
            # error. Remove the (possibly corrupt) archive instead, so the
            # next run re-downloads it.
            if os.path.exists(path_infile):
                os.remove(path_infile)
            raise
        try:
            outfile = tb.open_file(outpath, mode="w")
        except Exception:
            os.remove(outpath)
            raise
        try:
            table = outfile.create_table(
                outfile.root, "metadata", Metadata, expectedrows=123287
            )
            array_feats = outfile.create_earray(
                outfile.root,
                "features",
                shape=(0, 2048),
                atom=tb.Float32Atom(),
                expectedrows=36 * 123287,
            )
            array_boxes = outfile.create_earray(
                outfile.root,
                "boxes",
                shape=(0, 4),
                atom=tb.Float32Atom(),
                expectedrows=36 * 123287,
            )
            feat = table.row
            table.cols.image_id.create_index()
            pbar = tqdm(total=123287, desc="Converting features to PyTables")
            # BUG FIX: start_position was reset to 0 for every TSV file while
            # the EArrays kept growing across files, corrupting offsets for
            # multi-file splits (trainval2014). It is now a single running
            # offset over all files.
            start_position = 0
            for tsv_p in tsv_paths:
                with open(tsv_p, "r") as tsv_in_file:
                    reader = csv.DictReader(
                        tsv_in_file, delimiter="\t", fieldnames=FIELDNAMES
                    )
                    for item in reader:
                        pbar.update(1)
                        # BUG FIX: duplicates were never registered (no
                        # names.add), and start_position was advanced before
                        # the duplicate check, which would desynchronize all
                        # subsequent offsets. Check first, register, then
                        # write the row.
                        if item["image_id"] in names:
                            print(f"Duplicate {item['image_id']}")
                            num_duplicates += 1
                            continue
                        names.add(item["image_id"])
                        num_boxes = int(item["num_boxes"])
                        feat["image_id"] = int(item["image_id"])
                        feat["image_h"] = int(item["image_h"])
                        feat["image_w"] = int(item["image_w"])
                        feat["num_boxes"] = num_boxes
                        feat["start_position"] = start_position
                        start_position += num_boxes
                        for field in ["boxes", "features"]:
                            # Fields are base64-encoded little-endian float32
                            # buffers, reshaped to (num_boxes, -1).
                            item[field] = np.frombuffer(
                                base64.decodebytes(item[field].encode("ascii")),
                                dtype=np.float32,
                            ).reshape((num_boxes, -1))
                        array_boxes.append(item["boxes"])
                        array_feats.append(item["features"])
                        feat.append()
            table.flush()
            print(f"Num duplicates : {num_duplicates}")
            pbar.close()
            outfile.close()
        except Exception:
            # Remove the partially written output so the next run restarts
            # the conversion from scratch.
            outfile.close()
            os.remove(outpath)
            raise
        # remove tsv
        print("Deleting tsv from disk")
        # os.remove(tsv_path)