# Source code for multimodal.datasets.clevr

from genericpath import exists
import os
import json

from multimodal.utils import Task, download_and_unzip
from multimodal import DEFAULT_DATA_DIR

from torch.utils.data import Dataset
from PIL import Image
import torchvision
import torch


class CLEVR(Dataset):
    """
    CLEVR: A Diagnostic Dataset for
    Compositional Language and Elementary Visual Reasoning.

    See https://cs.stanford.edu/people/jcjohns/clevr/

    Warning: instantiating this class will download an 18 GB file to the
    multimodal data directory (by default in your applications data). You can
    specify the multimodal data directory by specifying the ``dir_data``
    argument, or specifying it in your path.

    Args:
        dir_data (str): dir for the multimodal cache (data will be downloaded
            in a ``datasets/clevr/`` folder inside this directory).
        split (str): either train, val or test.
        transform: torchvision transform applied to images. By default, only
            ToTensor. Pass ``None`` to get the raw PIL image.

    Attributes:
        questions (list): question records loaded from the split's JSON file.
        aid_to_ans (list): answer vocabulary, indexed by answer id.
        ans_to_aid (dict): reverse mapping, answer string to answer id.
    """

    url = "https://dl.fbaipublicfiles.com/clevr/CLEVR_v1.0.zip"

    def __init__(
        self, dir_data=None, split="train", transform=torchvision.transforms.ToTensor()
    ):
        super().__init__()
        if dir_data is None:
            dir_data = DEFAULT_DATA_DIR
        self.split = split
        self.dir_dataset = os.path.join(dir_data, "datasets", "clevr")
        self.transform = transform
        os.makedirs(self.dir_dataset, exist_ok=True)
        self.download_and_process(self.dir_dataset)

        # Load the questions of the requested split.
        path_questions = self._get_path_questions(self.dir_dataset, self.split)
        with open(path_questions, encoding="utf-8") as f:
            self.questions = json.load(f)["questions"]

        # Load the answer vocabulary produced by ``download_and_process``.
        path_answers = os.path.join(self.dir_dataset, "answers.json")
        with open(path_answers, encoding="utf-8") as f:
            self.aid_to_ans = json.load(f)
        self.ans_to_aid = {ans: aid for (aid, ans) in enumerate(self.aid_to_ans)}

    @classmethod
    def _get_path_questions(cls, dir_dataset, split):
        """Return the path of the questions JSON file for ``split``."""
        return os.path.join(
            dir_dataset, f"CLEVR_v1.0/questions/CLEVR_{split}_questions.json"
        )

    @classmethod
    def download_and_process(cls, dir_dataset):
        """
        Download CLEVR into ``dir_dataset`` (if not done yet) and build the
        answer vocabulary file ``answers.json`` (if it does not exist yet).

        The vocabulary is the sorted set of answers found in the train
        questions, so that answer ids are identical on every machine.
        An already existing ``answers.json`` is never rewritten.
        """
        task = Task(dir_dataset, "download")
        if not task.is_done():
            print("Downloading CLEVR")
            download_and_unzip(cls.url, directory=dir_dataset)
        # Kept unconditional, as in the original implementation.
        task.mark_done()

        path_answers = os.path.join(dir_dataset, "answers.json")
        if os.path.exists(path_answers):
            return

        print("Processing answers")
        path_train = cls._get_path_questions(dir_dataset, split="train")
        with open(path_train, encoding="utf-8") as f:
            train_questions = json.load(f)["questions"]

        # Sorting makes the answer -> id mapping deterministic: plain set
        # iteration order for strings changes between interpreter runs.
        all_answers = sorted({q["answer"] for q in train_questions})

        # Write to a temporary file first, so that an interrupted run never
        # leaves a truncated answers.json that later runs would try to load.
        path_tmp = path_answers + ".tmp"
        with open(path_tmp, "w", encoding="utf-8") as f:
            json.dump(all_answers, f)
        os.replace(path_tmp, path_answers)

    def __getitem__(self, index: int):
        """
        Returns a dictionary with the following keys:

        .. code-block::

            {
                "index",
                "question",
                "answer",
                "question_family_index",
                "image_filename",
                "image_index",
                "image",
                "label",
            }

        ``answer``, ``question_family_index`` and ``label`` are only present
        when the question record provides them (questions without annotations
        would otherwise raise a ``KeyError``).

        Note that you can recover the program for an example by using the index:

        .. code-block:: python

            index = item["index"][0]  #  first item of batch
            program = clevr.questions[index]["program"]

        """
        q = self.questions[index]
        img_path = os.path.join(
            self.dir_dataset, "CLEVR_v1.0", "images", self.split, q["image_filename"]
        )

        # Resolve the label before touching the image, so an unknown answer
        # fails fast (KeyError) without any image decoding.
        label = None
        if "answer" in q:
            label = torch.tensor(self.ans_to_aid[q["answer"]])

        im = Image.open(img_path)
        if self.transform is not None:
            im = self.transform(im)

        # Keys are inserted in the same order as the documented layout.
        item = {
            "index": index,
            "question": q["question"],
        }
        for key in ("answer", "question_family_index"):
            if key in q:
                item[key] = q[key]
        item["image_filename"] = q["image_filename"]
        item["image_index"] = q["image_index"]
        item["image"] = im
        if label is not None:
            item["label"] = label
        return item

    def __len__(self) -> int:
        """Return the number of questions in the split."""
        return len(self.questions)