Source code for multimodal.datasets.vqa

# stdlib
import itertools
import os
import json
from itertools import combinations
from statistics import mean
from collections import Counter
from typing import List
import pickle

# librairies
from tqdm import tqdm
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import default_collate
from torchtext.data.utils import get_tokenizer

# own
from multimodal.features import get_features
from multimodal.datasets import vqa_utils
from multimodal import DEFAULT_DATA_DIR
from multimodal.utils import download_and_unzip, download_file
from multimodal.datasets.vqa_utils import EvalAIAnswerProcessor


class AbstractVQA(Dataset):
    def get_all_tokens(self) -> List:
        raise NotImplementedError()

    def evaluate(self, predictions) -> float:
        raise NotImplementedError()


[docs]class VQA(AbstractVQA):
    """
    Pytorch Dataset implementation for the VQA v1 dataset (visual question answering). 
    See https://visualqa.org/ for more details about it.
    
    When this class is instanciated, data will be downloaded in the directory specified by the ``dir_data`` parameter.
    Pre-processing of questions and answers will take several minutes.

    When the ``features`` argument is specified, visual features will be downloaded as well. About 60Go will be 
    necessary for downloading and extracting features.

    Args:
        dir_data (str): dir for the multimodal cache (data will be downloaded in a vqa2/ folder inside this directory
        features (str|object): which visual features should be used. Choices: ``coco-bottomup`` or ``coco-bottomup-36``
            You can also give directly the feature instance.
        split (str): Which t [``train``, ``val``, ``test``]
        dir_features (str): directory to download features. If None, defaults to $dir_data/features
        label (str): either `multilabel`, or `best`. For `multilabel`, GT scores for questions are
            given by the score they are assigned by the VQA evaluation. 
            If `best`, GT is the label of the top answer.
        tokenize_questions (bool): If True, preprocessing will tokenize questions into tokens.
            The tokens are stored in item["question_tokens"].
        load (bool): default `True`. If false, then the questions annotations and questions will not be loaded
            in memory. This is useful if you want only to download and process the data.
    """

    SPLITS = ["train", "val", "test", "test-dev"]

    UNZIP = True

    name = "vqa"

    url_questions = {
        "train": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/Questions_Train_mscoco.zip",
        "val": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/Questions_Val_mscoco.zip",
        "test": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/Questions_Test_mscoco.zip",
    }
    url_annotations = {
        "train": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/Annotations_Train_mscoco.zip",
        "val": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/Annotations_Val_mscoco.zip",
    }

    filename_questions = {
        "train": "OpenEnded_mscoco_train2014_questions.json",
        "val": "OpenEnded_mscoco_val2014_questions.json",
        "test": "OpenEnded_mscoco_test2015_questions.json",
        "test-dev": "OpenEnded_mscoco_test-dev2015_questions.json",
    }

    filename_annotations = {
        "train": "mscoco_train2014_annotations.json",
        "val": "mscoco_val2014_annotations.json",
    }

    def __init__(
        self,
        dir_data=None,
        features=None,
        split="train",
        min_ans_occ=9,
        dir_features=None,
        label="multilabel",
        tokenize_questions=False,
        load=True,
    ):
        self.dir_data = dir_data
        if self.dir_data is None:
            self.dir_data = DEFAULT_DATA_DIR
        self.split = split
        self.features = features
        self.dir_features = dir_features or os.path.join(self.dir_data)
        self.label = label
        self.min_ans_occ = min_ans_occ
        self.tokenize_questions = tokenize_questions
        if self.tokenize_questions:
            self.tokenizer = get_tokenizer("basic_english")

        # Test split has no annotations.
        self.has_annotations = self.split in self.url_annotations

        self.dir_dataset = os.path.join(self.dir_data, "datasets", self.name)

        self.dir_splits = {
            split: os.path.join(self.dir_dataset, split)
            for split in self.filename_questions.keys()
        }  # vqa2/train/

        self.dir_splits["test-dev"] = self.dir_splits["test"]  # same directory

        for s in self.dir_splits:
            os.makedirs(self.dir_splits[s], exist_ok=True)

        # path download question
        self.path_questions = {
            split: os.path.join(self.dir_splits[split], self.filename_questions[split])
            for split in self.filename_questions.keys()
        }  # vqa2/train/OpenEnded_mscoco_train2014_questions.json

        # path download annotations
        self.path_original_annotations = {
            split: os.path.join(
                self.dir_splits[split], self.filename_annotations[split]
            )
            for split in self.filename_annotations.keys()
        }  # vqa2/val/mscoco_val2014_annotations.json

        # processed annotations contain answer_token and answer scores
        self.processed_dirs = {
            split: os.path.join(self.dir_splits[split], "processed")
            for split in self.filename_annotations.keys()
        }

        self.path_annotations_processed = {
            split: os.path.join(self.processed_dirs[split], "annotations.json")
            for split in self.processed_dirs
        }
        for k, d in self.processed_dirs.items():
            os.makedirs(d, exist_ok=True)

        self.path_answers = os.path.join(
            self.dir_dataset, f"aid_to_ans-{self.min_ans_occ}.json"
        )

        self._download()
        self._process_annotations()

        if self.features is not None and type(features) == str:
            self._load_features()
        elif self.features is not None and isinstance(self.features, type):
            # object is given, do nothing.
            pass

        if load:
            self._load()  # load questions and annotations

            if self.has_annotations:
                # This dictionnary will be used for evaluation
                self.qid_to_annot = {a["question_id"]: a for a in self.annotations}

            # aid_to_ans
            self.ans_to_aid = {ans: i for i, ans in enumerate(self.answers)}

    @classmethod
    def download_and_process(cls, dir_data):
        cls(dir_data=dir_data, split="train", load="False")

    def _load_questions(self, split):
        with open(self.path_questions[split]) as f:
            return json.load(f)["questions"]

    def _load_original_annotations(self, split):
        with open(self.path_original_annotations[split]) as f:
            return json.load(f)["annotations"]

    def _load_processed_annotations(self, split):
        with open(self.path_annotations_processed[split]) as f:
            return json.load(f)

    def _load_features(self):
        if self.split == "test":
            self.feats = get_features(
                self.features, split="test2015", dir_data=self.dir_features,
            )
        else:
            self.feats = get_features(
                self.features, split="trainval2014", dir_data=self.dir_features
            )

    def get_all_tokens(self):
        tokenizer = get_tokenizer("basic_english")
        return list(
            set((token for q in self.questions for token in tokenizer(q["question"])))
        )

    def get_all_questions(self):
        return (q["question"] for q in self.questions)

    def _process_annotations(self):
        """Process answers to create answer tokens,
        and precompute VQA score for faster evaluation.
        This follows the official VQA evaluation tool.
        """
        paths = [self.path_annotations_processed[split] for split in self.url_annotations]
        # path_train = self.path_annotations_processed["train"]
        # path_val = self.path_annotations_processed["val"]
        if any(not os.path.exists(p) for p in paths):
            annotations = [self._load_original_annotations(split) for split in self.url_annotations]
            all_annotations = list(itertools.chain(*annotations))
            # annotations_train = self._load_original_annotations("train")
            # annotations_val = self._load_original_annotations("val")
            # all_annotations = annotations_train + annotations_val

            print("Processing annotations")
            processor = EvalAIAnswerProcessor()

            print("\tPre-Processing answer punctuation")
            for annot in tqdm(all_annotations):
                if "multiple_choice_answer" in annot:
                    annot["multiple_choice_answer"] = processor(
                        annot["multiple_choice_answer"]
                    )
                # vqa_utils.processPunctuation(
                #     annot["multiple_choice_answer"]
                # )
                for ansDic in annot["answers"]:
                    ansDic["answer"] = processor(ansDic["answer"])
            qid_to_scores = dict()
            print("\tPre-Computing answer scores")
            for annot in tqdm(all_annotations):
                annot["scores"] = {}
                unique_answers = set([a["answer"] for a in annot["answers"]])
                for ans in unique_answers:
                    scores = []
                    # score is average of 9/10 answers
                    for items in combinations(annot["answers"], 9):
                        matching_ans = [item for item in items if item["answer"] == ans]
                        score = min(1, float(len(matching_ans)) / 3)
                        scores.append(score)
                    annot["scores"][ans] = mean(scores)
                qid_to_scores[annot["question_id"]] = annot["scores"]

            for i, split in enumerate(self.url_annotations):
                print(f"Saving processed annotations for split {split} at path {self.path_annotations_processed[split]}")
                with open(self.path_annotations_processed[split], "w") as f:
                    json.dump(annotations[i], f)

            with open(os.path.join(self.dir_dataset, "qid_to_scores.json"), "w") as f:
                json.dump(qid_to_scores, f)

        #####################################
        # Processing min occurences of answer
        #####################################
        annotations = [self._load_processed_annotations(split) for split in self.url_annotations]
        all_annotations = itertools.chain(*annotations)
        if not os.path.exists(self.path_answers):
            if "multiple_choice_answer" in annotations[0][0]:
                print(f"Removing uncommon answers")
                occ = Counter(annot["multiple_choice_answer"] for annot in all_annotations)
                self.answers = [ans for ans in occ if occ[ans] >= self.min_ans_occ]
                print(
                    f"Num answers after keeping occ >= {self.min_ans_occ}: {len(self.answers)}."
                )
                print(f"Saving answers at {self.path_answers}")
            else:
                self.answers = list(set(ans["answer"] for a in all_annotations for ans in a["answers"]))
            
            with open(self.path_answers, "w") as f:
                json.dump(self.answers, f)
        else:
            with open(self.path_answers) as f:
                self.answers = json.load(f)

    def _load(self):
        print("Loading questions")
        with open(self.path_questions[self.split]) as f:
            self.questions = json.load(f)["questions"]

        print("Loading annotations")
        if self.has_annotations:
            with open(self.path_annotations_processed[self.split]) as f:
                self.annotations = json.load(f)

        print(f"Loading aid_to_ans")
        with open(self.path_answers) as f:
            self.answers = json.load(f)

    def _download(self):
        # download all splits
        for split in self.url_questions.keys():
            url_questions = self.url_questions[split]
            directory = self.dir_splits[split]
            path_questions = self.path_questions[split]
            if not os.path.exists(path_questions):
                print(f"Downloading questions at {url_questions} to {directory}")
                if self.UNZIP:
                    download_and_unzip(url_questions, directory=directory)
                else:
                    download_file(url_questions, directory=directory)

        for split in self.url_annotations.keys():
            url_annotations = self.url_annotations[split]
            directory = self.dir_splits[split]
            path_annotations = self.path_original_annotations[split]
            if not os.path.exists(path_annotations):
                print(f"Downloading annotations {url_annotations} to {directory}")
                if self.UNZIP:
                    download_and_unzip(url_annotations, directory=directory)
                else:
                    download_file(url_annotations, directory=directory)

    def __len__(self):
        """
        Returns the number of (question-image-answer) items in the dataset.
        """
        return len(self.questions)

[docs]    def __getitem__(self, index):
        """
        Returns a dictionnary with the following keys 

        .. code-block::

            {
                'image_id', 
                'question_id',
                'question', 
                'answer_type',
                'multiple_choice_answer',
                'answers',
                'image_id',
                'question_type',
                'question_id',
                'scores',
                'label'   # ground truth label to be used for the loss
            }

        Aditionnaly, if visual features are used, keys from the features will be added.
        """
        item = {"index": index}
        item.update(self.questions[index])
        if self.has_annotations:
            item.update(self.annotations[index])
            if self.label == "multilabel":
                label = torch.zeros(len(self.answers))
                for ans, score in self.annotations[index]["scores"].items():
                    if ans in self.ans_to_aid:
                        aid = self.ans_to_aid[ans]
                        label[aid] = score
                item["label"] = label
            elif self.label == "best":
                scores = self.annotations[index]["scores"]
                best_ans = max(scores, key=scores.get)
                ans_id = self.ans_to_aid[best_ans]
                item["label"] = torch.tensor(ans_id)

        if self.features is not None:
            image_id = item["image_id"]
            item.update(self.feats[image_id])

        if self.tokenize_questions:
            item["question_tokens"] = self.tokenizer(item["question"])
        return item

[docs]    @staticmethod
    def collate_fn(batch):
        """
        Use this method to collate batches of data.
        """
        no_collate_keys = ["scores", "question_id", "question"]
        result_batch = {}
        for key in batch[0]:
            if key not in no_collate_keys:
                result_batch[key] = default_collate([item[key] for item in batch])
            else:
                result_batch[key] = [item[key] for item in batch]
        return result_batch

[docs]    def evaluate(self, predictions):
        """
        Evaluates a list of predictions, according to the VQA evaluation protocol. See https://visualqa.org/evaluation.html.

        Args:
            predictions (list): List of dictionnaries containing ``question_id`` and ``answer`` keys. The answer must be specified 
                as a string.

        Returns:
            A dict of floats containing scores for "overall", "yes/no", number", and "other" questions.
        """
        scores = {"overall": [], "yes/no": [], "number": [], "other": []}
        for p in predictions:
            qid = p["question_id"]
            ans = p["answer"]
            annot = self.qid_to_annot[qid]
            score = annot["scores"].get(ans, 0.0)  # default score is 0
            ans_type = annot["answer_type"]
            scores["overall"].append(score)
            scores[ans_type].append(score)
        return {
            key: mean(score_list) if len(score_list) else 0.0
            for key, score_list in scores.items()
        }


[docs]class VQA2(VQA):
    """
    Pytorch Dataset implementation for the VQA v2 dataset (visual question answering). 
    See https://visualqa.org/ for more details about it.
    
    When this class is instanciated, data will be downloaded in the directory specified by the ``dir_data`` parameter.
    Pre-processing of questions and answers will take several minutes.

    When the ``features`` argument is specified, visual features will be downloaded as well. About 60Go will be 
    necessary for downloading and extracting features.

    Args:
        dir_data (str): dir for the multimodal cache (data will be downloaded in a vqa2/ folder inside this directory
        features (str): which visual features should be used. Choices: ``coco-bottomup`` or ``coco-bottomup-36``
        split (str): Which t [``train``, ``val``, ``test``]
        dir_features (str): directory to download features. If None, defaults to $dir_data/features
        label (str): either `multilabel`, or `best`. For `multilabel`, GT scores for questions are
            given by the score they are assigned by the VQA evaluation. 
            If `best`, GT is the label of the top answer.
        tokenize_questions (bool): If True, preprocessing will tokenize questions into tokens.
            The tokens are stored in item["question_tokens"].
    """

    name = "vqa2"

    url_questions = {
        "train": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Train_mscoco.zip",
        "val": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip",
        "test": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Test_mscoco.zip",
    }
    url_annotations = {
        "train": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Train_mscoco.zip",
        "val": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip",
    }

    filename_questions = {
        "train": "v2_OpenEnded_mscoco_train2014_questions.json",
        "val": "v2_OpenEnded_mscoco_val2014_questions.json",
        "test": "v2_OpenEnded_mscoco_test2015_questions.json",
        "test-dev": "v2_OpenEnded_mscoco_test-dev2015_questions.json",
    }

    filename_annotations = {
        "train": "v2_mscoco_train2014_annotations.json",
        "val": "v2_mscoco_val2014_annotations.json",
    }


[docs]class VQACP(VQA):
    """  Pytorch Dataset implementation for the VQA-CP v1 dataset (visual question answering). 
    See https://www.cc.gatech.edu/grads/a/aagrawal307/vqa-cp/ for more details about it.
    
    When this class is instanciated, data will be downloaded in the directory specified by the ``dir_data`` parameter.
    Pre-processing of questions and answers will take several minutes.

    When the ``features`` argument is specified, visual features will be downloaded as well. About 60Go will be 
    necessary for downloading and extracting features.

    Args:
        dir_data (str): dir for the multimodal cache (data will be downloaded in a vqa2/ folder inside this directory
        features (str): which visual features should be used. Choices: ``coco-bottomup`` or ``coco-bottomup-36``
        split (str): Which t [``train``, ``val``, ``test``]
        dir_features (str): directory to download features. If None, defaults to $dir_data/features
        label (str): either `multilabel`, or `best`. For `multilabel`, GT scores for questions are
            given by the score they are assigned by the VQA evaluation. 
            If `best`, GT is the label of the top answer.
        tokenize_questions (bool): If True, preprocessing will tokenize questions into tokens.
            The tokens are stored in item["question_tokens"].
    """

    DOWNLOAD_SPLITS = ["train", "test"]
    UNZIP = False

    name = "vqacp"

    url_questions = {
        "train": "https://computing.ece.vt.edu/~aish/vqacp/vqacp_v1_train_questions.json",
        "test": "https://computing.ece.vt.edu/~aish/vqacp/vqacp_v1_test_questions.json",
    }

    url_annotations = {
        "train": "https://computing.ece.vt.edu/~aish/vqacp/vqacp_v1_train_annotations.json",
        "test": "https://computing.ece.vt.edu/~aish/vqacp/vqacp_v1_test_annotations.json",
    }

    filename_questions = {
        "train": "vqacp_v1_train_questions.json",
        "test": "vqacp_v1_test_questions.json",
    }

    filename_annotations = {
        "train": "vqacp_v1_train_annotations.json",
        "test": "vqacp_v1_test_annotations.json",
    }


    def _load_questions(self, split):
        with open(self.path_questions[split]) as f:
            return json.load(f)

    def _load_original_annotations(self, split):
        with open(self.path_original_annotations[split]) as f:
            return json.load(f)


    def _load(self):
        print("Loading questions")
        with open(self.path_questions[self.split]) as f:
            self.questions = json.load(f)

        print("Loading annotations")
        if self.has_annotations:
            with open(self.path_annotations_processed[self.split]) as f:
                self.annotations = json.load(f)

        print(f"Loading aid_to_ans")
        with open(self.path_answers) as f:
            self.answers = json.load(f)

[docs]class VQACP2(VQACP):
    """  Pytorch Dataset implementation for the VQA-CP v2 dataset (visual question answering). 
    See https://www.cc.gatech.edu/grads/a/aagrawal307/vqa-cp/ for more details about it.
    
    When this class is instanciated, data will be downloaded in the directory specified by the ``dir_data`` parameter.
    Pre-processing of questions and answers will take several minutes.

    When the ``features`` argument is specified, visual features will be downloaded as well. About 60Go will be 
    necessary for downloading and extracting features.

    Args:
        dir_data (str): dir for the multimodal cache (data will be downloaded in a vqa2/ folder inside this directory
        features (str): which visual features should be used. Choices: ``coco-bottomup`` or ``coco-bottomup-36``
        split (str): Which t [``train``, ``val``, ``test``]
        dir_features (str): directory to download features. If None, defaults to $dir_data/features
        label (str): either `multilabel`, or `best`. For `multilabel`, GT scores for questions are
            given by the score they are assigned by the VQA evaluation. 
            If `best`, GT is the label of the top answer.
        tokenize_questions (bool): If True, preprocessing will tokenize questions into tokens.
            The tokens are stored in item["question_tokens"].
    """

    name = "vqacp2"

    url_questions = {
        "train": "https://computing.ece.vt.edu/~aish/vqacp/vqacp_v2_train_questions.json",
        "test": "https://computing.ece.vt.edu/~aish/vqacp/vqacp_v2_test_questions.json",
    }

    url_annotations = {
        "train": "https://computing.ece.vt.edu/~aish/vqacp/vqacp_v2_train_annotations.json",
        "test": "https://computing.ece.vt.edu/~aish/vqacp/vqacp_v2_test_annotations.json",
    }

    filename_questions = {
        "train": "vqacp_v2_train_questions.json",
        "test": "vqacp_v2_test_questions.json",
    }

    filename_annotations = {
        "train": "vqacp_v2_train_annotations.json",
        "test": "vqacp_v2_test_annotations.json",
    }

class AdVQA(VQA):
    UNZIP = False
    name = "AdVQA"

    url_questions = {
        "val": "https://dl.fbaipublicfiles.com/advqa/v1_OpenEnded_mscoco_val2017_advqa_questions.json",
        "test": "https://dl.fbaipublicfiles.com/advqa/v1_OpenEnded_mscoco_testdev2015_advqa_questions.json",
    }

    url_annotations = {
        "val": "https://dl.fbaipublicfiles.com/advqa/v1_mscoco_val2017_advqa_annotations.json",
    }

    filename_questions = {
        "val": "v1_OpenEnded_mscoco_val2017_advqa_questions.json",
        "test": "v1_OpenEnded_mscoco_testdev2015_advqa_questions.json",
    }
    filename_annotations = {
        "val": "v1_mscoco_val2017_advqa_annotations.json",
    }