# stdlib
import itertools
import os
import json
from itertools import combinations
from statistics import mean
from collections import Counter
from typing import List
import pickle
# libraries
from tqdm import tqdm
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import default_collate
from torchtext.data.utils import get_tokenizer
# own
from multimodal.features import get_features
from multimodal.datasets import vqa_utils
from multimodal import DEFAULT_DATA_DIR
from multimodal.utils import download_and_unzip, download_file
from multimodal.datasets.vqa_utils import EvalAIAnswerProcessor
class AbstractVQA(Dataset):
    """Base class of the VQA-style datasets defined in this module.

    Both methods are placeholders that only raise ``NotImplementedError``;
    concrete datasets are expected to override them.
    """

    def get_all_tokens(self) -> List:
        """Return the tokens of the dataset questions (must be overridden)."""
        raise NotImplementedError()

    def evaluate(self, predictions) -> float:
        """Score ``predictions`` against the ground truth (must be overridden)."""
        raise NotImplementedError()
[docs]class VQA(AbstractVQA):
"""
Pytorch Dataset implementation for the VQA v1 dataset (visual question answering).
See https://visualqa.org/ for more details about it.
When this class is instantiated, data will be downloaded in the directory specified by the ``dir_data`` parameter.
Pre-processing of questions and answers will take several minutes.
When the ``features`` argument is specified, visual features will be downloaded as well. About 60 GB will be
necessary for downloading and extracting features.
Args:
dir_data (str): dir for the multimodal cache (data will be stored in a ``datasets/vqa/`` folder inside this
directory). When None, ``DEFAULT_DATA_DIR`` is used.
features (str|object): which visual features should be used. Choices: ``coco-bottomup`` or ``coco-bottomup-36``
You can also directly give the feature instance.
split (str): Which split to load, among [``train``, ``val``, ``test``, ``test-dev``].
min_ans_occ (int): Minimum number of times an answer must appear as ``multiple_choice_answer``
(counted over all annotated splits) to be kept in the answer vocabulary. Defaults to 9.
dir_features (str): directory given to the feature loader. If None, defaults to ``dir_data``.
label (str): either `multilabel`, or `best`. For `multilabel`, GT scores for questions are
given by the score they are assigned by the VQA evaluation.
If `best`, GT is the label of the top answer.
tokenize_questions (bool): If True, preprocessing will tokenize questions into tokens.
The tokens are stored in item["question_tokens"].
load (bool): default `True`. If False, then the questions and annotations will not be loaded
in memory. This is useful if you only want to download and process the data.
"""
# Names of the dataset splits.
SPLITS = ["train", "val", "test", "test-dev"]
# When True, ``_download`` fetches with ``download_and_unzip``; otherwise with ``download_file``.
UNZIP = True
# Name of the dataset folder created under ``<dir_data>/datasets``.
name = "vqa"
# Download URL of the questions, per split.
url_questions = {
"train": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/Questions_Train_mscoco.zip",
"val": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/Questions_Val_mscoco.zip",
"test": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/Questions_Test_mscoco.zip",
}
# Download URL of the annotations, per split. Splits missing here are treated as un-annotated.
url_annotations = {
"train": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/Annotations_Train_mscoco.zip",
"val": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/Annotations_Val_mscoco.zip",
}
# Name of the questions JSON file expected on disk, per split.
filename_questions = {
"train": "OpenEnded_mscoco_train2014_questions.json",
"val": "OpenEnded_mscoco_val2014_questions.json",
"test": "OpenEnded_mscoco_test2015_questions.json",
"test-dev": "OpenEnded_mscoco_test-dev2015_questions.json",
}
# Name of the annotations JSON file expected on disk, per annotated split.
filename_annotations = {
"train": "mscoco_train2014_annotations.json",
"val": "mscoco_val2014_annotations.json",
}
def __init__(
    self,
    dir_data=None,
    features=None,
    split="train",
    min_ans_occ=9,
    dir_features=None,
    label="multilabel",
    tokenize_questions=False,
    load=True,
):
    """Download, pre-process and (optionally) load one split of the dataset.

    Creates the on-disk directory layout, downloads missing question and
    annotation files, pre-processes the annotations, then loads the
    requested split in memory unless ``load`` is False.

    Args:
        dir_data (str): root of the multimodal cache; ``DEFAULT_DATA_DIR`` when None.
        features (str|object): name of the visual features to load through
            ``_load_features``, or an already-built feature object that can be
            indexed by image id.
        split (str): split exposed by this instance.
        min_ans_occ (int): occurrence threshold used to build the answer vocabulary.
        dir_features (str): directory handed to the feature loader;
            ``dir_data`` when not given.
        label (str): ``"multilabel"`` or ``"best"`` (see ``__getitem__``).
        tokenize_questions (bool): when True, a ``basic_english`` tokenizer is
            created and applied to each question in ``__getitem__``.
        load (bool): when False, questions and annotations are not read in memory.
    """
    self.dir_data = DEFAULT_DATA_DIR if dir_data is None else dir_data
    self.split = split
    self.features = features
    self.dir_features = dir_features or self.dir_data
    self.label = label
    self.min_ans_occ = min_ans_occ
    self.tokenize_questions = tokenize_questions
    if self.tokenize_questions:
        self.tokenizer = get_tokenizer("basic_english")

    # Splits without an annotation URL (e.g. test) have no ground truth.
    self.has_annotations = self.split in self.url_annotations

    self.dir_dataset = os.path.join(self.dir_data, "datasets", self.name)
    # One directory per split, e.g. <dir_dataset>/train/
    self.dir_splits = {
        name: os.path.join(self.dir_dataset, name)
        for name in self.filename_questions
    }
    if "test" in self.dir_splits:
        # test-dev files live in the same directory as the test files.
        self.dir_splits["test-dev"] = self.dir_splits["test"]
    for directory in self.dir_splits.values():
        os.makedirs(directory, exist_ok=True)

    # Where the downloaded question files are expected.
    self.path_questions = {
        name: os.path.join(self.dir_splits[name], filename)
        for name, filename in self.filename_questions.items()
    }
    # Where the downloaded (raw) annotation files are expected.
    self.path_original_annotations = {
        name: os.path.join(self.dir_splits[name], filename)
        for name, filename in self.filename_annotations.items()
    }
    # Processed annotations additionally hold the pre-computed answer scores.
    self.processed_dirs = {
        name: os.path.join(self.dir_splits[name], "processed")
        for name in self.filename_annotations
    }
    self.path_annotations_processed = {
        name: os.path.join(directory, "annotations.json")
        for name, directory in self.processed_dirs.items()
    }
    for directory in self.processed_dirs.values():
        os.makedirs(directory, exist_ok=True)

    # The answer vocabulary depends on the occurrence threshold, hence the suffix.
    self.path_answers = os.path.join(
        self.dir_dataset, f"aid_to_ans-{self.min_ans_occ}.json"
    )

    self._download()
    self._process_annotations()

    if isinstance(self.features, str):
        self._load_features()
    elif self.features is not None:
        # A ready-made feature object was given: expose it under the attribute
        # read by __getitem__ (it was previously never set in this case).
        self.feats = self.features

    if load:
        self._load()  # load questions and annotations
        if self.has_annotations:
            # This dictionary will be used for evaluation.
            self.qid_to_annot = {a["question_id"]: a for a in self.annotations}

    # Answer string -> answer id. ``self.answers`` is always set by
    # ``_process_annotations``, so the mapping is built whether or not ``load`` is set.
    self.ans_to_aid = {ans: i for i, ans in enumerate(self.answers)}
@classmethod
def download_and_process(cls, dir_data):
    """Download and pre-process the dataset under ``dir_data`` without loading it.

    Args:
        dir_data (str): root of the multimodal cache.
    """
    # ``load`` must be the boolean False: the string "False" is truthy and
    # made the constructor load everything in memory anyway.
    cls(dir_data=dir_data, split="train", load=False)
def _load_questions(self, split):
    """Return the value stored under ``"questions"`` in the questions file of ``split``.

    Args:
        split (str): key of ``self.path_questions``.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(self.path_questions[split], encoding="utf-8") as f:
        return json.load(f)["questions"]
def _load_original_annotations(self, split):
    """Return the value stored under ``"annotations"`` in the raw annotation file of ``split``.

    Args:
        split (str): key of ``self.path_original_annotations``.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(self.path_original_annotations[split], encoding="utf-8") as f:
        return json.load(f)["annotations"]
def _load_processed_annotations(self, split):
    """Read back the processed annotations of ``split`` from their JSON file.

    Args:
        split (str): key of ``self.path_annotations_processed``.
    """
    processed_path = self.path_annotations_processed[split]
    with open(processed_path) as handle:
        content = json.load(handle)
    return content
def _load_features(self):
    """Load the visual features matching ``self.split`` into ``self.feats``.

    The ``test`` and ``test-dev`` splits use the ``test2015`` features; every
    other split uses the ``trainval2014`` features.
    """
    # test-dev questions come from the test-dev2015 file and share the test
    # directory, so they need the test2015 features as well (they previously
    # fell through to trainval2014).
    if self.split in ("test", "test-dev"):
        feature_split = "test2015"
    else:
        feature_split = "trainval2014"
    self.feats = get_features(
        self.features, split=feature_split, dir_data=self.dir_features
    )
def get_all_tokens(self):
    """Return the sorted list of unique tokens found in the loaded questions.

    Questions are tokenized with the ``basic_english`` tokenizer.
    """
    tokenizer = get_tokenizer("basic_english")
    vocabulary = {
        token for q in self.questions for token in tokenizer(q["question"])
    }
    # Set iteration order over strings changes between interpreter runs (hash
    # randomization); sorting keeps token indices reproducible.
    return sorted(vocabulary)
def get_all_questions(self):
    """Return a generator over the question strings of the loaded split."""
    entries = self.questions
    return (entry["question"] for entry in entries)
def _process_annotations(self):
    """Process answers to create answer tokens,
    and precompute VQA score for faster evaluation.
    This follows the official VQA evaluation tool.

    Two artefacts are produced (each only when missing on disk):

    * the processed annotations of every annotated split, where answers are
      normalized with ``EvalAIAnswerProcessor`` and a ``"scores"`` dict
      (answer -> score) is added to each annotation, together with a
      ``qid_to_scores.json`` file;
    * the answer vocabulary, stored at ``self.path_answers`` and exposed as
      ``self.answers``.
    """
    annotated_splits = list(self.url_annotations)
    processed_paths = [
        self.path_annotations_processed[split] for split in annotated_splits
    ]
    if not all(os.path.exists(path) for path in processed_paths):
        annotations = [
            self._load_original_annotations(split) for split in annotated_splits
        ]
        all_annotations = list(itertools.chain(*annotations))
        print("Processing annotations")
        processor = EvalAIAnswerProcessor()

        print("\tPre-Processing answer punctuation")
        for annot in tqdm(all_annotations):
            if "multiple_choice_answer" in annot:
                annot["multiple_choice_answer"] = processor(
                    annot["multiple_choice_answer"]
                )
            for answer in annot["answers"]:
                answer["answer"] = processor(answer["answer"])

        qid_to_scores = {}
        print("\tPre-Computing answer scores")
        for annot in tqdm(all_annotations):
            answers = annot["answers"]
            # The score is averaged over every subset of 9 answers. With fewer
            # than 9 answers there is no such subset (mean() of an empty list
            # raised), so the single subset made of all answers is used.
            subset_size = min(9, len(answers))
            annot["scores"] = {}
            # dict.fromkeys keeps first-appearance order, unlike a set whose
            # order changes between runs; this keeps the saved files stable.
            for ans in dict.fromkeys(a["answer"] for a in answers):
                scores = [
                    min(1, sum(item["answer"] == ans for item in subset) / 3)
                    for subset in combinations(answers, subset_size)
                ]
                annot["scores"][ans] = mean(scores)
            qid_to_scores[annot["question_id"]] = annot["scores"]

        for split, split_annotations in zip(annotated_splits, annotations):
            path = self.path_annotations_processed[split]
            print(f"Saving processed annotations for split {split} at path {path}")
            with open(path, "w") as f:
                json.dump(split_annotations, f)
        with open(os.path.join(self.dir_dataset, "qid_to_scores.json"), "w") as f:
            json.dump(qid_to_scores, f)

    #####################################
    # Processing min occurrences of answer
    #####################################
    if os.path.exists(self.path_answers):
        # Vocabulary already built for this threshold: no need to read the annotations.
        with open(self.path_answers) as f:
            self.answers = json.load(f)
        return

    all_annotations = [
        annot
        for split in annotated_splits
        for annot in self._load_processed_annotations(split)
    ]
    if all_annotations and "multiple_choice_answer" in all_annotations[0]:
        print("Removing uncommon answers")
        occ = Counter(annot["multiple_choice_answer"] for annot in all_annotations)
        self.answers = [
            ans for ans, count in occ.items() if count >= self.min_ans_occ
        ]
        print(
            f"Num answers after keeping occ >= {self.min_ans_occ}: {len(self.answers)}."
        )
        print(f"Saving answers at {self.path_answers}")
    else:
        # No "multiple_choice_answer" field: keep every annotated answer.
        # Sorted so that answer ids do not depend on set iteration order.
        self.answers = sorted(
            {ans["answer"] for annot in all_annotations for ans in annot["answers"]}
        )
    with open(self.path_answers, "w") as f:
        json.dump(self.answers, f)
def _load(self):
    """Load the current split in memory.

    Sets ``self.questions``, ``self.annotations`` (only when the split has
    annotations) and ``self.answers``.
    """
    print("Loading questions")
    # Go through the per-split helpers instead of re-opening the files here,
    # so subclasses that override them for another file layout are honoured.
    self.questions = self._load_questions(self.split)
    print("Loading annotations")
    if self.has_annotations:
        self.annotations = self._load_processed_annotations(self.split)
    print("Loading aid_to_ans")
    with open(self.path_answers) as f:
        self.answers = json.load(f)
def _download(self):
    """Fetch every question and annotation file that is not on disk yet.

    All splits are handled, not only ``self.split``. Files are fetched with
    ``download_and_unzip`` when ``UNZIP`` is set, ``download_file`` otherwise.
    """
    fetch = download_and_unzip if self.UNZIP else download_file

    # Questions first, for every split that has a download URL.
    for split, url in self.url_questions.items():
        target_dir = self.dir_splits[split]
        if os.path.exists(self.path_questions[split]):
            continue
        print(f"Downloading questions at {url} to {target_dir}")
        fetch(url, directory=target_dir)

    # Then annotations, for the annotated splits only.
    for split, url in self.url_annotations.items():
        target_dir = self.dir_splits[split]
        if os.path.exists(self.path_original_annotations[split]):
            continue
        print(f"Downloading annotations {url} to {target_dir}")
        fetch(url, directory=target_dir)
def __len__(self):
    """
    Returns the number of (question-image-answer) items in the dataset,
    i.e. the number of loaded questions.
    """
    n_items = len(self.questions)
    return n_items
def __getitem__(self, index):
    """
    Returns a dictionary made of ``index`` plus every key of the question entry
    and, for annotated splits, every key of the processed annotation, typically

    .. code-block::

        {
            'index',
            'image_id',
            'question_id',
            'question',
            'question_type',
            'answer_type',
            'multiple_choice_answer',
            'answers',
            'scores',
            'label'  # ground truth label to be used for the loss
        }

    Additionally, if visual features are used, keys from the features will be added,
    and ``question_tokens`` is added when ``tokenize_questions`` is set.

    ``label`` depends on ``self.label``:

    * ``"multilabel"``: float tensor with one entry per answer of the vocabulary,
      holding the score of each annotated answer (answers outside the vocabulary
      are skipped);
    * ``"best"``: scalar tensor with the id of the highest-scoring answer that
      belongs to the vocabulary, or ``-100`` when no annotated answer does.
    """
    item = {"index": index}
    item.update(self.questions[index])
    if self.has_annotations:
        annotation = self.annotations[index]
        item.update(annotation)
        if self.label == "multilabel":
            label = torch.zeros(len(self.answers))
            for ans, score in annotation["scores"].items():
                aid = self.ans_to_aid.get(ans)
                if aid is not None:
                    label[aid] = score
            item["label"] = label
        elif self.label == "best":
            scores = annotation["scores"]
            # The top answer may have been dropped from the vocabulary by the
            # occurrence threshold (this used to raise KeyError): only rank the
            # answers that have an id.
            known = [ans for ans in scores if ans in self.ans_to_aid]
            if known:
                ans_id = self.ans_to_aid[max(known, key=scores.get)]
            else:
                # -100 is the default ``ignore_index`` of torch.nn.CrossEntropyLoss.
                ans_id = -100
            item["label"] = torch.tensor(ans_id)
    if self.features is not None:
        item.update(self.feats[item["image_id"]])
    if self.tokenize_questions:
        item["question_tokens"] = self.tokenizer(item["question"])
    return item
@staticmethod
def collate_fn(batch):
    """
    Use this method to collate batches of data.

    Args:
        batch (list): non-empty list of items as returned by ``__getitem__``.

    Returns:
        A dict with the keys of the first item. ``scores``, ``question_id``,
        ``question`` and ``question_tokens`` are returned as plain lists (one
        entry per item); every other key goes through ``default_collate``.
    """
    # question_tokens has a different length for each question, which
    # default_collate cannot handle, so it is kept as a list of lists.
    no_collate_keys = {"scores", "question_id", "question", "question_tokens"}
    result_batch = {}
    for key in batch[0]:
        values = [item[key] for item in batch]
        if key in no_collate_keys:
            result_batch[key] = values
        else:
            result_batch[key] = default_collate(values)
    return result_batch
def evaluate(self, predictions):
    """
    Evaluates a list of predictions, according to the VQA evaluation protocol. See https://visualqa.org/evaluation.html.

    Args:
        predictions (list): List of dictionaries containing ``question_id`` and ``answer`` keys. The answer must be specified
            as a string.

    Returns:
        A dict of floats containing the mean score for "overall", "yes/no", "number", and "other" questions
        (0.0 for a category without any prediction).
    """
    per_category = {name: [] for name in ("overall", "yes/no", "number", "other")}
    for prediction in predictions:
        qid = prediction["question_id"]
        answer = prediction["answer"]
        annot = self.qid_to_annot[qid]
        # An answer that no annotator gave scores 0.
        value = annot["scores"].get(answer, 0.0)
        for category in ("overall", annot["answer_type"]):
            per_category[category].append(value)

    summary = {}
    for name, values in per_category.items():
        summary[name] = mean(values) if values else 0.0
    return summary
class VQA2(VQA):
    """
    Pytorch Dataset implementation for the VQA v2 dataset (visual question answering).
    See https://visualqa.org/ for more details about it.

    Same behaviour as :class:`VQA`; only the dataset name, download URLs and
    file names differ. Instantiating the class downloads the data under
    ``dir_data`` and pre-processes questions and answers, which takes several
    minutes. When the ``features`` argument is specified, visual features are
    downloaded as well (about 60 GB are needed to download and extract them).

    Args:
        dir_data (str): dir for the multimodal cache (data is stored in a ``vqa2`` folder inside it).
        features (str): which visual features should be used. Choices: ``coco-bottomup`` or ``coco-bottomup-36``
        split (str): which split to load, among [``train``, ``val``, ``test``].
        dir_features (str): directory used for the features.
        label (str): either `multilabel`, or `best`. For `multilabel`, GT scores for questions are
            given by the score they are assigned by the VQA evaluation.
            If `best`, GT is the label of the top answer.
        tokenize_questions (bool): If True, preprocessing will tokenize questions into tokens.
            The tokens are stored in item["question_tokens"].
    """

    name = "vqa2"

    # Download URL of the questions archive, per split.
    url_questions = {
        "train": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Train_mscoco.zip",
        "val": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip",
        "test": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Test_mscoco.zip",
    }
    # Download URL of the annotations archive, per annotated split.
    url_annotations = {
        "train": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Train_mscoco.zip",
        "val": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip",
    }
    # Name of the questions JSON file expected on disk, per split.
    filename_questions = {
        "train": "v2_OpenEnded_mscoco_train2014_questions.json",
        "val": "v2_OpenEnded_mscoco_val2014_questions.json",
        "test": "v2_OpenEnded_mscoco_test2015_questions.json",
        "test-dev": "v2_OpenEnded_mscoco_test-dev2015_questions.json",
    }
    # Name of the annotations JSON file expected on disk, per annotated split.
    filename_annotations = {
        "train": "v2_mscoco_train2014_annotations.json",
        "val": "v2_mscoco_val2014_annotations.json",
    }
class VQACP(VQA):
    """Pytorch Dataset implementation for the VQA-CP v1 dataset (visual question answering).
    See https://www.cc.gatech.edu/grads/a/aagrawal307/vqa-cp/ for more details about it.

    When this class is instantiated, data will be downloaded in the directory specified by the ``dir_data`` parameter.
    Pre-processing of questions and answers will take several minutes.
    When the ``features`` argument is specified, visual features will be downloaded as well. About 60 GB will be
    necessary for downloading and extracting features.

    Unlike :class:`VQA`, files are downloaded as plain JSON (no archive) and both
    splits (``train`` and ``test``) have annotations.

    Args:
        dir_data (str): dir for the multimodal cache (data is stored in a ``vqacp`` folder inside it).
        features (str): which visual features should be used. Choices: ``coco-bottomup`` or ``coco-bottomup-36``
        split (str): which split to load, among [``train``, ``test``].
        dir_features (str): directory used for the features.
        label (str): either `multilabel`, or `best`. For `multilabel`, GT scores for questions are
            given by the score they are assigned by the VQA evaluation.
            If `best`, GT is the label of the top answer.
        tokenize_questions (bool): If True, preprocessing will tokenize questions into tokens.
            The tokens are stored in item["question_tokens"].
    """

    DOWNLOAD_SPLITS = ["train", "test"]
    UNZIP = False
    name = "vqacp"

    url_questions = {
        "train": "https://computing.ece.vt.edu/~aish/vqacp/vqacp_v1_train_questions.json",
        "test": "https://computing.ece.vt.edu/~aish/vqacp/vqacp_v1_test_questions.json",
    }
    url_annotations = {
        "train": "https://computing.ece.vt.edu/~aish/vqacp/vqacp_v1_train_annotations.json",
        "test": "https://computing.ece.vt.edu/~aish/vqacp/vqacp_v1_test_annotations.json",
    }
    filename_questions = {
        "train": "vqacp_v1_train_questions.json",
        "test": "vqacp_v1_test_questions.json",
    }
    filename_annotations = {
        "train": "vqacp_v1_train_annotations.json",
        "test": "vqacp_v1_test_annotations.json",
    }

    def _load_questions(self, split):
        """Return the whole content of the questions file of ``split``.

        The JSON document is returned as-is: no ``"questions"`` key is read.
        """
        with open(self.path_questions[split], encoding="utf-8") as f:
            return json.load(f)

    def _load_original_annotations(self, split):
        """Return the whole content of the raw annotation file of ``split``.

        The JSON document is returned as-is: no ``"annotations"`` key is read.
        """
        with open(self.path_original_annotations[split], encoding="utf-8") as f:
            return json.load(f)

    def _load_features(self):
        """Load the ``trainval2014`` visual features into ``self.feats``.

        VQA-CP re-splits the VQA train and val sets, so its ``test`` questions
        still refer to COCO train/val 2014 images; the inherited rule
        (``test`` -> ``test2015`` features) does not apply here.
        """
        self.feats = get_features(
            self.features, split="trainval2014", dir_data=self.dir_features
        )

    def _load(self):
        """Load questions, processed annotations (if any) and answers of the current split."""
        print("Loading questions")
        self.questions = self._load_questions(self.split)
        print("Loading annotations")
        if self.has_annotations:
            self.annotations = self._load_processed_annotations(self.split)
        print("Loading aid_to_ans")
        with open(self.path_answers) as f:
            self.answers = json.load(f)
class VQACP2(VQACP):
    """Pytorch Dataset implementation for the VQA-CP v2 dataset (visual question answering).
    See https://www.cc.gatech.edu/grads/a/aagrawal307/vqa-cp/ for more details about it.

    Same behaviour as :class:`VQACP`; only the dataset name, download URLs and
    file names differ. Instantiating the class downloads the data under
    ``dir_data`` and pre-processes questions and answers, which takes several
    minutes. When the ``features`` argument is specified, visual features are
    downloaded as well (about 60 GB are needed to download and extract them).

    Args:
        dir_data (str): dir for the multimodal cache (data is stored in a ``vqacp2`` folder inside it).
        features (str): which visual features should be used. Choices: ``coco-bottomup`` or ``coco-bottomup-36``
        split (str): which split to load, among [``train``, ``test``].
        dir_features (str): directory used for the features.
        label (str): either `multilabel`, or `best`. For `multilabel`, GT scores for questions are
            given by the score they are assigned by the VQA evaluation.
            If `best`, GT is the label of the top answer.
        tokenize_questions (bool): If True, preprocessing will tokenize questions into tokens.
            The tokens are stored in item["question_tokens"].
    """

    name = "vqacp2"

    # Download URL of the questions file, per split.
    url_questions = {
        "train": "https://computing.ece.vt.edu/~aish/vqacp/vqacp_v2_train_questions.json",
        "test": "https://computing.ece.vt.edu/~aish/vqacp/vqacp_v2_test_questions.json",
    }
    # Download URL of the annotations file, per split.
    url_annotations = {
        "train": "https://computing.ece.vt.edu/~aish/vqacp/vqacp_v2_train_annotations.json",
        "test": "https://computing.ece.vt.edu/~aish/vqacp/vqacp_v2_test_annotations.json",
    }
    # Name of the questions JSON file expected on disk, per split.
    filename_questions = {
        "train": "vqacp_v2_train_questions.json",
        "test": "vqacp_v2_test_questions.json",
    }
    # Name of the annotations JSON file expected on disk, per split.
    filename_annotations = {
        "train": "vqacp_v2_train_annotations.json",
        "test": "vqacp_v2_test_annotations.json",
    }
class AdVQA(VQA):
    """Pytorch Dataset implementation for the AdVQA dataset.

    Behaves like :class:`VQA` with different download URLs and file names.
    Files are downloaded as plain JSON (no archive). The ``val`` split has
    questions and annotations; the ``test`` split has questions only.
    """

    UNZIP = False
    name = "AdVQA"

    # Download URL of the questions file, per split.
    url_questions = {
        "val": "https://dl.fbaipublicfiles.com/advqa/v1_OpenEnded_mscoco_val2017_advqa_questions.json",
        "test": "https://dl.fbaipublicfiles.com/advqa/v1_OpenEnded_mscoco_testdev2015_advqa_questions.json",
    }
    # Download URL of the annotations file; only ``val`` is annotated.
    url_annotations = {
        "val": "https://dl.fbaipublicfiles.com/advqa/v1_mscoco_val2017_advqa_annotations.json",
    }
    # Name of the questions JSON file expected on disk, per split.
    filename_questions = {
        "val": "v1_OpenEnded_mscoco_val2017_advqa_questions.json",
        "test": "v1_OpenEnded_mscoco_testdev2015_advqa_questions.json",
    }
    # Name of the annotations JSON file expected on disk, per annotated split.
    filename_annotations = {
        "val": "v1_mscoco_val2017_advqa_annotations.json",
    }