[SYSTEMDS-3835] Add Modality Data Type
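This patch adds a data type to all modalities: every data loader now takes a
data_type argument, and the modality created from it (as well as any modality
transformed from it) carries that data type. It also renames the Multiplication
fusion operator to Hadamard and the window representation module to
window_aggregation, changes TransformedModality so that it is constructed from
its source modality, and adds an ImageBind representation.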
Closes #2270.
diff --git a/src/main/python/systemds/scuro/__init__.py b/src/main/python/systemds/scuro/__init__.py
index 4b21853..1c3cfe9 100644
--- a/src/main/python/systemds/scuro/__init__.py
+++ b/src/main/python/systemds/scuro/__init__.py
@@ -39,7 +39,7 @@
from systemds.scuro.representations.max import RowMax
from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
from systemds.scuro.representations.mfcc import MFCC
-from systemds.scuro.representations.multiplication import Multiplication
+from systemds.scuro.representations.hadamard import Hadamard
from systemds.scuro.representations.optical_flow import OpticalFlow
from systemds.scuro.representations.representation import Representation
from systemds.scuro.representations.representation_dataloader import NPY
@@ -52,7 +52,7 @@
from systemds.scuro.representations.tfidf import TfIdf
from systemds.scuro.representations.unimodal import UnimodalRepresentation
from systemds.scuro.representations.wav2vec import Wav2Vec
-from systemds.scuro.representations.window import WindowAggregation
+from systemds.scuro.representations.window_aggregation import WindowAggregation
from systemds.scuro.representations.word2vec import W2V
from systemds.scuro.representations.x3d import X3D
from systemds.scuro.models.model import Model
@@ -94,7 +94,7 @@
"RowMax",
"MelSpectrogram",
"MFCC",
- "Multiplication",
+ "Hadamard",
"OpticalFlow",
"Representation",
"NPY",
diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py
index a008962..a1dad30 100644
--- a/src/main/python/systemds/scuro/dataloader/audio_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py
@@ -21,6 +21,8 @@
from typing import List, Optional, Union
import librosa
+import numpy as np
+
from systemds.scuro.dataloader.base_loader import BaseLoader
from systemds.scuro.modality.type import ModalityType
@@ -30,15 +32,27 @@
self,
source_path: str,
indices: List[str],
+ data_type: Union[np.dtype, str] = np.float32,
chunk_size: Optional[int] = None,
normalize: bool = True,
+ load=True,
):
- super().__init__(source_path, indices, chunk_size, ModalityType.AUDIO)
+ super().__init__(
+ source_path, indices, data_type, chunk_size, ModalityType.AUDIO
+ )
self.normalize = normalize
+ self.load_data_from_file = load
def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
self.file_sanity_check(file)
- audio, sr = librosa.load(file)
+ # if not self.load_data_from_file:
+ # import numpy as np
+ #
+ # self.metadata[file] = self.modality_type.create_audio_metadata(
+ # 1000, np.array([0])
+ # )
+ # else:
+ audio, sr = librosa.load(file, dtype=self._data_type)
if self.normalize:
audio = librosa.util.normalize(audio)
diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py
index ea2b25b..f21f212 100644
--- a/src/main/python/systemds/scuro/dataloader/base_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/base_loader.py
@@ -21,6 +21,9 @@
import os
from abc import ABC, abstractmethod
from typing import List, Optional, Union
+import math
+
+import numpy as np
class BaseLoader(ABC):
@@ -28,6 +31,7 @@
self,
source_path: str,
indices: List[str],
+ data_type: Union[np.dtype, str],
chunk_size: Optional[int] = None,
modality_type=None,
):
@@ -48,6 +52,7 @@
self._next_chunk = 0
self._num_chunks = 1
self._chunk_size = None
+ self._data_type = data_type
if chunk_size:
self.chunk_size = chunk_size
@@ -59,7 +64,7 @@
@chunk_size.setter
def chunk_size(self, value):
self._chunk_size = value
- self._num_chunks = int(len(self.indices) / self._chunk_size)
+ self._num_chunks = int(math.ceil(len(self.indices) / self._chunk_size))
@property
def num_chunks(self):
@@ -69,6 +74,14 @@
def next_chunk(self):
return self._next_chunk
+ @property
+ def data_type(self):
+ return self._data_type
+
+ @data_type.setter
+ def data_type(self, data_type):
+ self._data_type = self.resolve_data_type(data_type)
+
def reset(self):
self._next_chunk = 0
self.data = []
@@ -110,17 +123,26 @@
return self._load(next_chunk_indices)
def _load(self, indices: List[str]):
- is_dir = True if os.path.isdir(self.source_path) else False
-
- if is_dir:
- _, ext = os.path.splitext(os.listdir(self.source_path)[0])
- for index in indices:
- self.extract(self.source_path + index + ext)
+ file_names = self.get_file_names(indices)
+ if isinstance(file_names, str):
+ self.extract(file_names, indices)
else:
- self.extract(self.source_path, indices)
+ for file_name in file_names:
+ self.extract(file_name)
return self.data, self.metadata
+ def get_file_names(self, indices=None):
+ is_dir = True if os.path.isdir(self.source_path) else False
+ file_names = []
+ if is_dir:
+ _, ext = os.path.splitext(os.listdir(self.source_path)[0])
+ for index in self.indices if indices is None else indices:
+ file_names.append(self.source_path + index + ext)
+ return file_names
+ else:
+ return self.source_path
+
@abstractmethod
def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
pass
@@ -137,3 +159,30 @@
if file_size == 0:
raise ("File {0} is empty".format(file))
+
+ @staticmethod
+ def resolve_data_type(data_type):
+ if isinstance(data_type, str):
+ if data_type.lower() in [
+ "float16",
+ "float32",
+ "float64",
+ "int16",
+ "int32",
+ "int64",
+ ]:
+ return np.dtype(data_type)
+ else:
+ raise ValueError(f"Unsupported data_type string: {data_type}")
+ elif data_type in [
+ np.float16,
+ np.float32,
+ np.float64,
+ np.int16,
+ np.int32,
+ np.int64,
+ str,
+ ]:
+ return data_type
+ else:
+ raise ValueError(f"Unsupported data_type: {data_type}")
diff --git a/src/main/python/systemds/scuro/dataloader/json_loader.py b/src/main/python/systemds/scuro/dataloader/json_loader.py
index edef7f2..a355edd 100644
--- a/src/main/python/systemds/scuro/dataloader/json_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/json_loader.py
@@ -20,6 +20,8 @@
# -------------------------------------------------------------
import json
+import numpy as np
+
from systemds.scuro.modality.type import ModalityType
from systemds.scuro.dataloader.base_loader import BaseLoader
from typing import Optional, List, Union
@@ -31,9 +33,10 @@
source_path: str,
indices: List[str],
field: str,
+ data_type: Union[np.dtype, str] = str,
chunk_size: Optional[int] = None,
):
- super().__init__(source_path, indices, chunk_size, ModalityType.TEXT)
+ super().__init__(source_path, indices, data_type, chunk_size, ModalityType.TEXT)
self.field = field
def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
diff --git a/src/main/python/systemds/scuro/dataloader/text_loader.py b/src/main/python/systemds/scuro/dataloader/text_loader.py
index 3f87155..6689fb6 100644
--- a/src/main/python/systemds/scuro/dataloader/text_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/text_loader.py
@@ -29,10 +29,11 @@
self,
source_path: str,
indices: List[str],
+ data_type: str = str,
chunk_size: Optional[int] = None,
prefix: Optional[Pattern[str]] = None,
):
- super().__init__(source_path, indices, chunk_size, ModalityType.TEXT)
+ super().__init__(source_path, indices, data_type, chunk_size, ModalityType.TEXT)
self.prefix = prefix
def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py
index 333960e..96ea5f1 100644
--- a/src/main/python/systemds/scuro/dataloader/video_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/video_loader.py
@@ -32,12 +32,22 @@
self,
source_path: str,
indices: List[str],
+ data_type: Union[np.dtype, str] = np.float16,
chunk_size: Optional[int] = None,
+ load=True,
):
- super().__init__(source_path, indices, chunk_size, ModalityType.VIDEO)
+ super().__init__(
+ source_path, indices, data_type, chunk_size, ModalityType.VIDEO
+ )
+ self.load_data_from_file = load
def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
self.file_sanity_check(file)
+ # if not self.load_data_from_file:
+ # self.metadata[file] = self.modality_type.create_video_metadata(
+ # 30, 10, 100, 100, 3
+ # )
+ # else:
cap = cv2.VideoCapture(file)
if not cap.isOpened():
@@ -60,8 +70,8 @@
if not ret:
break
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
- frame = frame.astype(np.float32) / 255.0
+ frame = frame.astype(self._data_type) / 255.0
frames.append(frame)
- self.data.append(frames)
+ self.data.append(np.stack(frames))
diff --git a/src/main/python/systemds/scuro/drsearch/operator_registry.py b/src/main/python/systemds/scuro/drsearch/operator_registry.py
index 942e5bb..cfd313e 100644
--- a/src/main/python/systemds/scuro/drsearch/operator_registry.py
+++ b/src/main/python/systemds/scuro/drsearch/operator_registry.py
@@ -58,6 +58,7 @@
return self._representations[modality]
def get_context_operators(self):
+ # TODO: return modality specific context operations
return self._context_operators
def get_fusion_operators(self):
diff --git a/src/main/python/systemds/scuro/drsearch/representation_cache.py b/src/main/python/systemds/scuro/drsearch/representation_cache.py
index fc78167..4df4782 100644
--- a/src/main/python/systemds/scuro/drsearch/representation_cache.py
+++ b/src/main/python/systemds/scuro/drsearch/representation_cache.py
@@ -112,7 +112,8 @@
metadata = pickle.load(f)
transformed_modality = TransformedModality(
- modality.modality_type, op_names, modality.modality_id, metadata
+ modality,
+ op_names,
)
data = None
with open(f"{filename}.pkl", "rb") as f:
diff --git a/src/main/python/systemds/scuro/modality/joined_transformed.py b/src/main/python/systemds/scuro/modality/joined_transformed.py
index a0ab8c4..6c6190e 100644
--- a/src/main/python/systemds/scuro/modality/joined_transformed.py
+++ b/src/main/python/systemds/scuro/modality/joined_transformed.py
@@ -25,7 +25,7 @@
from systemds.scuro.modality.modality import Modality
from systemds.scuro.representations.utils import pad_sequences
-from systemds.scuro.representations.window import WindowAggregation
+from systemds.scuro.representations.window_aggregation import WindowAggregation
class JoinedTransformedModality(Modality):
@@ -70,7 +70,7 @@
self.data = pad_sequences(self.data)
return self
- def window(self, window_size, aggregation):
+ def window_aggregation(self, window_size, aggregation):
w = WindowAggregation(window_size, aggregation)
self.left_modality.data = w.execute(self.left_modality)
self.right_modality.data = w.execute(self.right_modality)
diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py
index c16db00..87d5b5e 100644
--- a/src/main/python/systemds/scuro/modality/modality.py
+++ b/src/main/python/systemds/scuro/modality/modality.py
@@ -29,7 +29,9 @@
class Modality:
- def __init__(self, modalityType: ModalityType, modality_id=-1, metadata={}):
+ def __init__(
+ self, modalityType: ModalityType, modality_id=-1, metadata={}, data_type=None
+ ):
"""
Parent class of the different Modalities (unimodal & multimodal)
:param modality_type: Type of the modality
@@ -38,7 +40,7 @@
self.schema = modalityType.get_schema()
self.metadata = metadata
self.data = []
- self.data_type = None
+ self.data_type = data_type
self.cost = None
self.shape = None
self.modality_id = modality_id
@@ -67,7 +69,9 @@
"""
Create a copy of the modality instance
"""
- return type(self)(self.modality_type, self.metadata)
+ return type(self)(
+ self.modality_type, self.modality_id, self.metadata, self.data_type
+ )
def update_metadata(self):
"""
diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py
index aba59c1..362764d 100644
--- a/src/main/python/systemds/scuro/modality/transformed.py
+++ b/src/main/python/systemds/scuro/modality/transformed.py
@@ -24,24 +24,28 @@
from systemds.scuro.modality.type import ModalityType
from systemds.scuro.modality.joined import JoinedModality
from systemds.scuro.modality.modality import Modality
-from systemds.scuro.representations.window import WindowAggregation
+from systemds.scuro.representations.window_aggregation import WindowAggregation
class TransformedModality(Modality):
- def __init__(self, modality_type, transformation, modality_id, metadata):
+ def __init__(self, modality, transformation, new_modality_type=None):
"""
Parent class of the different Modalities (unimodal & multimodal)
:param modality_type: Type of the original modality(ies)
:param transformation: Representation to be applied on the modality
"""
- super().__init__(modality_type, modality_id, metadata)
+ if new_modality_type is None:
+ new_modality_type = modality.modality_type
+
+ metadata = modality.metadata.copy() if modality.metadata is not None else None
+ super().__init__(
+ new_modality_type, modality.modality_id, metadata, modality.data_type
+ )
self.transformation = transformation
def copy_from_instance(self):
- return type(self)(
- self.modality_type, self.transformation, self.modality_id, self.metadata
- )
+ return type(self)(self, self.transformation)
def join(self, right, join_condition):
chunked_execution = False
@@ -65,19 +69,15 @@
return joined_modality
- def window(self, windowSize, aggregation):
- transformed_modality = TransformedModality(
- self.modality_type, "window", self.modality_id, self.metadata
- )
+ def window_aggregation(self, windowSize, aggregation):
w = WindowAggregation(windowSize, aggregation)
+ transformed_modality = TransformedModality(self, w)
transformed_modality.data = w.execute(self)
return transformed_modality
def context(self, context_operator):
- transformed_modality = TransformedModality(
- self.modality_type, context_operator.name, self.modality_id, self.metadata
- )
+ transformed_modality = TransformedModality(self, context_operator)
transformed_modality.data = context_operator.execute(self)
return transformed_modality
@@ -94,10 +94,7 @@
:param fusion_method: The fusion method to be used to combine modalities
"""
fused_modality = TransformedModality(
- ModalityType.EMBEDDING,
- fusion_method,
- self.modality_id,
- self.metadata,
+ self, fusion_method, ModalityType.EMBEDDING
)
modalities = [self]
if isinstance(other, list):
diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py
index 4b59c26..a479e07 100644
--- a/src/main/python/systemds/scuro/modality/type.py
+++ b/src/main/python/systemds/scuro/modality/type.py
@@ -191,6 +191,14 @@
def update_metadata(self, md, data):
return ModalitySchemas.update_metadata(self.name, md, data)
+ def add_alignment(self, md, alignment_timestamps):
+ md["alignment_timestamps"] = alignment_timestamps
+ return md
+
+ def add_field(self, md, field, data):
+ md[field] = data
+ return md
+
def create_audio_metadata(self, sampling_rate, data):
md = deepcopy(self.get_schema())
md = ModalitySchemas.update_base_metadata(md, data, True)
diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py
index 714fe42..c0ee705 100644
--- a/src/main/python/systemds/scuro/modality/unimodal_modality.py
+++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py
@@ -37,7 +37,12 @@
:param data_loader: Defines how the raw data should be loaded
:param modality_type: Type of the modality
"""
- super().__init__(data_loader.modality_type, ModalityIdentifier().new_id(), None)
+ super().__init__(
+ data_loader.modality_type,
+ ModalityIdentifier().new_id(),
+ {},
+ data_loader.data_type,
+ )
self.data_loader = data_loader
def copy_from_instance(self):
@@ -84,9 +89,7 @@
if not self.has_data():
self.extract_raw_data()
- transformed_modality = TransformedModality(
- self.modality_type, context_operator.name, self.modality_id, self.metadata
- )
+ transformed_modality = TransformedModality(self, context_operator)
transformed_modality.data = context_operator.execute(self)
return transformed_modality
@@ -101,10 +104,8 @@
def apply_representation(self, representation):
new_modality = TransformedModality(
- self.modality_type,
- representation.name,
- self.modality_id,
- self.data_loader.metadata.copy(),
+ self,
+ representation,
)
new_modality.data = []
diff --git a/src/main/python/systemds/scuro/representations/aggregated_representation.py b/src/main/python/systemds/scuro/representations/aggregated_representation.py
index 46e6b8b..9412c5b 100644
--- a/src/main/python/systemds/scuro/representations/aggregated_representation.py
+++ b/src/main/python/systemds/scuro/representations/aggregated_representation.py
@@ -28,8 +28,6 @@
self.aggregation = aggregation
def transform(self, modality):
- aggregated_modality = TransformedModality(
- modality.modality_type, self.name, modality.modality_id, modality.metadata
- )
+ aggregated_modality = TransformedModality(modality, self)
aggregated_modality.data = self.aggregation.execute(modality)
return aggregated_modality
diff --git a/src/main/python/systemds/scuro/representations/average.py b/src/main/python/systemds/scuro/representations/average.py
index 4c6b0e1..8a7e6b9 100644
--- a/src/main/python/systemds/scuro/representations/average.py
+++ b/src/main/python/systemds/scuro/representations/average.py
@@ -37,6 +37,8 @@
Combines modalities using averaging
"""
super().__init__("Average")
+ self.associative = True
+ self.commutative = True
def transform(self, modalities: List[Modality]):
for modality in modalities:
diff --git a/src/main/python/systemds/scuro/representations/bert.py b/src/main/python/systemds/scuro/representations/bert.py
index 802d7e3..8d8d40f 100644
--- a/src/main/python/systemds/scuro/representations/bert.py
+++ b/src/main/python/systemds/scuro/representations/bert.py
@@ -22,11 +22,15 @@
from systemds.scuro.modality.transformed import TransformedModality
from systemds.scuro.representations.unimodal import UnimodalRepresentation
import torch
-from transformers import BertTokenizer, BertModel
+from transformers import BertTokenizerFast, BertModel
from systemds.scuro.representations.utils import save_embeddings
from systemds.scuro.modality.type import ModalityType
from systemds.scuro.drsearch.operator_registry import register_representation
+import os
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
@register_representation(ModalityType.TEXT)
class Bert(UnimodalRepresentation):
@@ -38,17 +42,15 @@
self.output_file = output_file
def transform(self, modality):
- transformed_modality = TransformedModality(
- modality.modality_type, self, modality.modality_id, modality.metadata
- )
+ transformed_modality = TransformedModality(modality, self)
model_name = "bert-base-uncased"
- tokenizer = BertTokenizer.from_pretrained(
+ tokenizer = BertTokenizerFast.from_pretrained(
model_name, clean_up_tokenization_spaces=True
)
model = BertModel.from_pretrained(model_name)
- embeddings = self.create_embeddings(modality.data, model, tokenizer)
+ embeddings = self.create_embeddings(modality, model, tokenizer)
if self.output_file is not None:
save_embeddings(embeddings, self.output_file)
@@ -56,15 +58,29 @@
transformed_modality.data = embeddings
return transformed_modality
- def create_embeddings(self, data, model, tokenizer):
+ def create_embeddings(self, modality, model, tokenizer):
embeddings = []
- for d in data:
- inputs = tokenizer(d, return_tensors="pt", padding=True, truncation=True)
+ for i, d in enumerate(modality.data):
+ inputs = tokenizer(
+ d,
+ return_offsets_mapping=True,
+ return_tensors="pt",
+ padding=True,
+ truncation=True,
+ )
+
+ ModalityType.TEXT.add_field(
+ list(modality.metadata.values())[i],
+ "token_to_character_mapping",
+ inputs.data["offset_mapping"][0].tolist(),
+ )
+
+ del inputs.data["offset_mapping"]
with torch.no_grad():
outputs = model(**inputs)
- cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
- embeddings.append(cls_embedding.reshape(1, -1))
+ cls_embedding = outputs.last_hidden_state[0].numpy()
+ embeddings.append(cls_embedding)
return embeddings
diff --git a/src/main/python/systemds/scuro/representations/bow.py b/src/main/python/systemds/scuro/representations/bow.py
index e2bc940..6778811 100644
--- a/src/main/python/systemds/scuro/representations/bow.py
+++ b/src/main/python/systemds/scuro/representations/bow.py
@@ -39,9 +39,7 @@
self.output_file = output_file
def transform(self, modality):
- transformed_modality = TransformedModality(
- modality.modality_type, self, modality.modality_id, modality.metadata
- )
+ transformed_modality = TransformedModality(modality, self)
vectorizer = CountVectorizer(
ngram_range=(1, self.ngram_range), min_df=self.min_df
)
diff --git a/src/main/python/systemds/scuro/representations/concatenation.py b/src/main/python/systemds/scuro/representations/concatenation.py
index 1265563..c7ce33a 100644
--- a/src/main/python/systemds/scuro/representations/concatenation.py
+++ b/src/main/python/systemds/scuro/representations/concatenation.py
@@ -58,7 +58,9 @@
[
data,
pad_sequences(
- modality.data, maxlen=max_emb_size, dtype="float32"
+ modality.data,
+ maxlen=max_emb_size,
+ dtype=modality.data.dtype,
),
],
axis=-1,
diff --git a/src/main/python/systemds/scuro/representations/fusion.py b/src/main/python/systemds/scuro/representations/fusion.py
index 7734523..cbbb560 100644
--- a/src/main/python/systemds/scuro/representations/fusion.py
+++ b/src/main/python/systemds/scuro/representations/fusion.py
@@ -33,6 +33,9 @@
:param name: Name of the fusion type
"""
super().__init__(name, parameters)
+ self.associative = False
+ self.commutative = False
+ self.needs_alignment = False
def transform(self, modalities: List[Modality]):
"""
diff --git a/src/main/python/systemds/scuro/representations/glove.py b/src/main/python/systemds/scuro/representations/glove.py
index 66a6847..d948567 100644
--- a/src/main/python/systemds/scuro/representations/glove.py
+++ b/src/main/python/systemds/scuro/representations/glove.py
@@ -21,7 +21,7 @@
import numpy as np
from gensim.utils import tokenize
-
+from systemds.scuro.modality.transformed import TransformedModality
from systemds.scuro.representations.unimodal import UnimodalRepresentation
from systemds.scuro.representations.utils import save_embeddings
from systemds.scuro.modality.type import ModalityType
@@ -46,11 +46,12 @@
self.glove_path = glove_path
self.output_file = output_file
- def transform(self, data):
+ def transform(self, modality):
+ transformed_modality = TransformedModality(modality, self)
glove_embeddings = load_glove_embeddings(self.glove_path)
embeddings = []
- for sentences in data:
+ for sentences in modality.data:
tokens = list(tokenize(sentences.lower()))
embeddings.append(
np.mean(
@@ -66,4 +67,5 @@
if self.output_file is not None:
save_embeddings(np.array(embeddings), self.output_file)
- return np.array(embeddings)
+ transformed_modality.data = np.array(embeddings)
+ return transformed_modality
diff --git a/src/main/python/systemds/scuro/representations/multiplication.py b/src/main/python/systemds/scuro/representations/hadamard.py
similarity index 67%
rename from src/main/python/systemds/scuro/representations/multiplication.py
rename to src/main/python/systemds/scuro/representations/hadamard.py
index 8d1e7f8..138003b 100644
--- a/src/main/python/systemds/scuro/representations/multiplication.py
+++ b/src/main/python/systemds/scuro/representations/hadamard.py
@@ -24,7 +24,6 @@
import numpy as np
from systemds.scuro.modality.modality import Modality
-from systemds.scuro.representations.utils import pad_sequences
from systemds.scuro.representations.fusion import Fusion
@@ -32,23 +31,18 @@
@register_fusion_operator()
-class Multiplication(Fusion):
+class Hadamard(Fusion):
def __init__(self):
"""
- Combines modalities using elementwise multiply
+ Combines modalities using elementwise multiply (Hadamard product)
"""
- super().__init__("Multiplication")
+ super().__init__("Hadamard")
+ self.needs_alignment = True # zero padding falsifies the result
+ self.commutative = True
+ self.associative = True
def transform(self, modalities: List[Modality], train_indices=None):
- max_emb_size = self.get_max_embedding_size(modalities)
+ # TODO: check for alignment in the metadata
+ fused_data = np.prod([m.data for m in modalities], axis=0)
- data = pad_sequences(modalities[0].data, maxlen=max_emb_size, dtype="float32")
-
- for m in range(1, len(modalities)):
- # scaled = self.scale_data(modalities[m].data, train_indices)
- data = np.multiply(
- data,
- pad_sequences(modalities[m].data, maxlen=max_emb_size, dtype="float32"),
- )
-
- return data
+ return fused_data
diff --git a/src/main/python/systemds/scuro/representations/image_bind.py b/src/main/python/systemds/scuro/representations/image_bind.py
new file mode 100644
index 0000000..e934d52
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/image_bind.py
@@ -0,0 +1,100 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import torch
+import imagebind.data as data
+
+from imagebind.models.imagebind_model import ModalityType as IBModalityType
+
+from imagebind.models import imagebind_model
+from systemds.scuro.modality.transformed import TransformedModality
+from systemds.scuro.representations.unimodal import UnimodalRepresentation
+from systemds.scuro.representations.utils import save_embeddings
+
+from systemds.scuro.modality.type import ModalityType
+from systemds.scuro.drsearch.operator_registry import register_representation
+
+if torch.backends.mps.is_available():
+ DEVICE = torch.device("mps")
+# elif torch.cuda.is_available():
+# DEVICE = torch.device("cuda")
+else:
+ DEVICE = torch.device("cpu")
+
+
+# @register_representation([ModalityType.TEXT, ModalityType.AUDIO, ModalityType.VIDEO])
+class ImageBind(UnimodalRepresentation):
+ def __init__(self):
+ parameters = {}
+ super().__init__("ImageBind", ModalityType.EMBEDDING, parameters)
+ self.model = imagebind_model.imagebind_huge(pretrained=True)
+ for param in self.model.parameters():
+ param.requires_grad = False
+ self.model.eval()
+ self.model.to(DEVICE)
+
+ def transform(self, modality):
+ transformed_modality = TransformedModality(
+ modality, self, ModalityType.EMBEDDING
+ )
+
+ result = []
+ if modality.modality_type == ModalityType.TEXT:
+ for i, instance in enumerate(modality.data):
+ text_inputs = data.load_and_transform_text(instance, DEVICE)
+ text_embeddings = self.model({IBModalityType.TEXT: text_inputs})[
+ IBModalityType.TEXT
+ ]
+ result.append(text_embeddings.mean(axis=0).cpu().detach().numpy())
+ if modality.modality_type == ModalityType.AUDIO:
+ audio_inputs = data.load_and_transform_audio_data(
+ list(modality.metadata)[
+ (modality.data_loader.next_chunk - 1)
+ * (modality.data_loader.chunk_size) : (
+ modality.data_loader.next_chunk - 1
+ )
+ * (modality.data_loader.chunk_size)
+ + (modality.data_loader.chunk_size)
+ ],
+ DEVICE,
+ )
+ audio_embeddings = self.model({IBModalityType.AUDIO: audio_inputs})[
+ IBModalityType.AUDIO
+ ]
+ result.extend(audio_embeddings.cpu().detach().numpy())
+ if modality.modality_type == ModalityType.VIDEO:
+ video_inputs = data.load_and_transform_video_data(
+ list(modality.metadata)[
+ (modality.data_loader.next_chunk - 1)
+ * (modality.data_loader.chunk_size) : (
+ modality.data_loader.next_chunk - 1
+ )
+ * (modality.data_loader.chunk_size)
+ + (modality.data_loader.chunk_size)
+ ],
+ DEVICE,
+ )
+ video_embeddings = self.model({IBModalityType.VISION: video_inputs})[
+ IBModalityType.VISION
+ ]
+ result.extend(video_embeddings.cpu().detach().numpy())
+
+ transformed_modality.data = result
+ return transformed_modality
diff --git a/src/main/python/systemds/scuro/representations/lstm.py b/src/main/python/systemds/scuro/representations/lstm.py
index a82a1e2..cbab0f6 100644
--- a/src/main/python/systemds/scuro/representations/lstm.py
+++ b/src/main/python/systemds/scuro/representations/lstm.py
@@ -18,6 +18,9 @@
# under the License.
#
# -------------------------------------------------------------
+import os
+import random
+
import torch
from torch import nn
@@ -31,6 +34,8 @@
from systemds.scuro.drsearch.operator_registry import register_fusion_operator
+# TODO: concatenate before embedding
+# Make this a hyperparameter
@register_fusion_operator()
class LSTM(Fusion):
def __init__(self, width=128, depth=1, dropout_rate=0.1):
@@ -42,8 +47,18 @@
self.width = width
self.dropout_rate = dropout_rate
self.unimodal_embeddings = {}
+ seed = 42
+
+ os.environ["PYTHONHASHSEED"] = str(seed)
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+ torch.cuda.manual_seed(seed)
+ torch.backends.cudnn.deterministic = True
+ torch.backends.cudnn.benchmark = False
def transform(self, modalities: List[Modality]):
+ self.unimodal_embeddings = {}
size = len(modalities[0].data)
result = np.zeros((size, 0))
@@ -60,6 +75,9 @@
return result
def run_lstm(self, data):
+ if isinstance(data, list):
+ data = np.array(data)
+
d = data.astype(np.float32)
dim = d.shape[-1]
d = torch.from_numpy(d)
diff --git a/src/main/python/systemds/scuro/representations/max.py b/src/main/python/systemds/scuro/representations/max.py
index 5a787dc..6ecf5fd 100644
--- a/src/main/python/systemds/scuro/representations/max.py
+++ b/src/main/python/systemds/scuro/representations/max.py
@@ -18,14 +18,11 @@
# under the License.
#
# -------------------------------------------------------------
-import itertools
from typing import List
import numpy as np
from systemds.scuro.modality.modality import Modality
-from systemds.scuro.representations.utils import pad_sequences
-
from systemds.scuro.representations.fusion import Fusion
from systemds.scuro.drsearch.operator_registry import register_fusion_operator
@@ -33,52 +30,21 @@
@register_fusion_operator()
class RowMax(Fusion):
- def __init__(self, split=4):
+ def __init__(self):
"""
Combines modalities by computing the outer product of a modality combination and
taking the row max
"""
super().__init__("RowMax")
- self.split = split
+ self.needs_alignment = True
+ self.associative = True
+ self.commutative = True
def transform(
self,
modalities: List[Modality],
):
- if len(modalities) < 2:
- return np.array(modalities[0].data)
+ # TODO: need to check if data is aligned - same number of dimension
+ fused_data = np.maximum.reduce([m.data for m in modalities])
- max_emb_size = self.get_max_embedding_size(modalities)
-
- padded_modalities = []
- for modality in modalities:
- d = pad_sequences(modality.data, maxlen=max_emb_size, dtype="float32")
- padded_modalities.append(d)
-
- split_rows = int(len(modalities[0].data) / self.split)
-
- data = []
-
- for combination in itertools.combinations(padded_modalities, 2):
- combined = None
- for i in range(0, self.split):
- start = split_rows * i
- end = (
- split_rows * (i + 1)
- if i < (self.split - 1)
- else len(modalities[0].data)
- )
- m = np.einsum(
- "bi,bo->bio", combination[0][start:end], combination[1][start:end]
- )
- m = m.max(axis=2)
- if combined is None:
- combined = m
- else:
- combined = np.concatenate((combined, m), axis=0)
- data.append(combined)
-
- data = np.stack(data)
- data = data.max(axis=0)
-
- return np.array(data)
+ return fused_data
diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
index 4095cee..8c14c03 100644
--- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py
+++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
@@ -43,7 +43,7 @@
def transform(self, modality):
transformed_modality = TransformedModality(
- self.output_modality_type, self, modality.modality_id, modality.metadata
+ modality, self, self.output_modality_type
)
result = []
max_length = 0
diff --git a/src/main/python/systemds/scuro/representations/mfcc.py b/src/main/python/systemds/scuro/representations/mfcc.py
index 75cc00d..234e932 100644
--- a/src/main/python/systemds/scuro/representations/mfcc.py
+++ b/src/main/python/systemds/scuro/representations/mfcc.py
@@ -45,7 +45,7 @@
def transform(self, modality):
transformed_modality = TransformedModality(
- self.output_modality_type, self, modality.modality_id, modality.metadata
+ modality, self, self.output_modality_type
)
result = []
max_length = 0
diff --git a/src/main/python/systemds/scuro/representations/optical_flow.py b/src/main/python/systemds/scuro/representations/optical_flow.py
index 1fb922d7..2781730 100644
--- a/src/main/python/systemds/scuro/representations/optical_flow.py
+++ b/src/main/python/systemds/scuro/representations/optical_flow.py
@@ -48,10 +48,7 @@
def transform(self, modality):
transformed_modality = TransformedModality(
- self.output_modality_type,
- "opticalFlow",
- modality.modality_id,
- modality.metadata,
+ modality, self, self.output_modality_type
)
for video_id, instance in enumerate(modality.data):
diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py
index 68771ec..bdfbfb1 100644
--- a/src/main/python/systemds/scuro/representations/resnet.py
+++ b/src/main/python/systemds/scuro/representations/resnet.py
@@ -18,10 +18,11 @@
# under the License.
#
# -------------------------------------------------------------
+from systemds.scuro.utils.converter import numpy_dtype_to_torch_dtype
from systemds.scuro.utils.torch_dataset import CustomDataset
from systemds.scuro.modality.transformed import TransformedModality
from systemds.scuro.representations.unimodal import UnimodalRepresentation
-from typing import Callable, Dict, Tuple, Any
+from typing import Tuple, Any
from systemds.scuro.drsearch.operator_registry import register_representation
import torch.utils.data
import torch
@@ -42,6 +43,7 @@
)
class ResNet(UnimodalRepresentation):
def __init__(self, layer="avgpool", model_name="ResNet18", output_file=None):
+ self.data_type = torch.bfloat16
self.model_name = model_name
parameters = self._get_parameters()
super().__init__(
@@ -68,25 +70,38 @@
def model_name(self, model_name):
self._model_name = model_name
if model_name == "ResNet18":
- self.model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT).to(
- DEVICE
+ self.model = (
+ models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
+ .to(DEVICE)
+ .to(self.data_type)
)
+
elif model_name == "ResNet34":
self.model = models.resnet34(weights=models.ResNet34_Weights.DEFAULT).to(
DEVICE
)
+ self.model = self.model.to(self.data_type)
elif model_name == "ResNet50":
- self.model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT).to(
- DEVICE
+ self.model = (
+ models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
+ .to(DEVICE)
+ .to(self.data_type)
)
+
elif model_name == "ResNet101":
- self.model = models.resnet101(weights=models.ResNet101_Weights.DEFAULT).to(
- DEVICE
+ self.model = (
+ models.resnet101(weights=models.ResNet101_Weights.DEFAULT)
+ .to(DEVICE)
+ .to(self.data_type)
)
+
elif model_name == "ResNet152":
- self.model = models.resnet152(weights=models.ResNet152_Weights.DEFAULT).to(
- DEVICE
+ self.model = (
+ models.resnet152(weights=models.ResNet152_Weights.DEFAULT)
+ .to(DEVICE)
+ .to(self.data_type)
)
+
else:
raise NotImplementedError
@@ -110,7 +125,11 @@
return parameters
def transform(self, modality):
- dataset = CustomDataset(modality.data)
+ self.data_type = numpy_dtype_to_torch_dtype(modality.data_type)
+ if next(self.model.parameters()).dtype != self.data_type:
+ self.model = self.model.to(self.data_type)
+
+ dataset = CustomDataset(modality.data, self.data_type, DEVICE)
embeddings = {}
res5c_output = None
@@ -132,7 +151,7 @@
for instance in torch.utils.data.DataLoader(dataset):
video_id = instance["id"][0]
- frames = instance["data"][0].to(DEVICE)
+ frames = instance["data"][0]
embeddings[video_id] = []
batch_size = 64
@@ -146,13 +165,18 @@
pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1))
embeddings[video_id].extend(
- torch.flatten(pooled, 1).detach().cpu().numpy()
+ torch.flatten(pooled, 1)
+ .detach()
+ .cpu()
+ .float()
+ .numpy()
+ .astype(modality.data_type)
)
embeddings[video_id] = np.array(embeddings[video_id])
transformed_modality = TransformedModality(
- self.output_modality_type, "resnet", modality.modality_id, modality.metadata
+ modality, self, self.output_modality_type
)
transformed_modality.data = list(embeddings.values())
diff --git a/src/main/python/systemds/scuro/representations/spectrogram.py b/src/main/python/systemds/scuro/representations/spectrogram.py
index b5558b1..6a713a3 100644
--- a/src/main/python/systemds/scuro/representations/spectrogram.py
+++ b/src/main/python/systemds/scuro/representations/spectrogram.py
@@ -38,7 +38,7 @@
def transform(self, modality):
transformed_modality = TransformedModality(
- self.output_modality_type, self, modality.modality_id, modality.metadata
+ modality, self, self.output_modality_type
)
result = []
max_length = 0
diff --git a/src/main/python/systemds/scuro/representations/tfidf.py b/src/main/python/systemds/scuro/representations/tfidf.py
index c17527b..1df5a1f 100644
--- a/src/main/python/systemds/scuro/representations/tfidf.py
+++ b/src/main/python/systemds/scuro/representations/tfidf.py
@@ -38,9 +38,7 @@
self.output_file = output_file
def transform(self, modality):
- transformed_modality = TransformedModality(
- modality.modality_type, self, modality.modality_id, modality.metadata
- )
+ transformed_modality = TransformedModality(modality, self)
vectorizer = TfidfVectorizer(min_df=self.min_df)
diff --git a/src/main/python/systemds/scuro/representations/wav2vec.py b/src/main/python/systemds/scuro/representations/wav2vec.py
index bf251b1..29f5bcb 100644
--- a/src/main/python/systemds/scuro/representations/wav2vec.py
+++ b/src/main/python/systemds/scuro/representations/wav2vec.py
@@ -46,7 +46,7 @@
def transform(self, modality):
transformed_modality = TransformedModality(
- self.output_modality_type, self, modality.modality_id, modality.metadata
+ modality, self, self.output_modality_type
)
result = []
diff --git a/src/main/python/systemds/scuro/representations/window.py b/src/main/python/systemds/scuro/representations/window_aggregation.py
similarity index 100%
rename from src/main/python/systemds/scuro/representations/window.py
rename to src/main/python/systemds/scuro/representations/window_aggregation.py
diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py
index e1d1669..0210207 100644
--- a/src/main/python/systemds/scuro/representations/word2vec.py
+++ b/src/main/python/systemds/scuro/representations/word2vec.py
@@ -54,9 +54,7 @@
self.output_file = output_file
def transform(self, modality):
- transformed_modality = TransformedModality(
- modality.modality_type, self, modality.modality_id, modality.metadata
- )
+ transformed_modality = TransformedModality(modality, self)
t = [list(tokenize(s.lower())) for s in modality.data]
model = Word2Vec(
sentences=t,
diff --git a/src/main/python/systemds/scuro/representations/x3d.py b/src/main/python/systemds/scuro/representations/x3d.py
index bb5d1ec..1629ac6 100644
--- a/src/main/python/systemds/scuro/representations/x3d.py
+++ b/src/main/python/systemds/scuro/representations/x3d.py
@@ -30,11 +30,12 @@
import numpy as np
from systemds.scuro.modality.type import ModalityType
from systemds.scuro.drsearch.operator_registry import register_representation
+import math
if torch.backends.mps.is_available():
DEVICE = torch.device("mps")
-# elif torch.cuda.is_available():
-# DEVICE = torch.device("cuda")
+elif torch.cuda.is_available():
+ DEVICE = torch.device("cuda")
else:
DEVICE = torch.device("cpu")
@@ -127,7 +128,74 @@
embeddings[video_id] = np.array(embeddings[video_id])
transformed_modality = TransformedModality(
- self.output_modality_type, "x3d", modality.modality_id, modality.metadata
+ modality, self, self.output_modality_type
+ )
+
+ transformed_modality.data = list(embeddings.values())
+
+ return transformed_modality
+
+
+class I3D(UnimodalRepresentation):
+ def __init__(self, layer="avgpool", model_name="i3d", output_file=None):
+ self.model_name = model_name
+ parameters = self._get_parameters()
+ self.model = torch.hub.load(
+ "facebookresearch/pytorchvideo", "i3d_r50", pretrained=True
+ ).to(DEVICE)
+ super().__init__("I3D", ModalityType.TIMESERIES, parameters)
+
+ self.output_file = output_file
+ self.layer_name = layer
+ self.model.eval()
+ for param in self.model.parameters():
+ param.requires_grad = False
+
+ def _get_parameters(self, high_level=True):
+ parameters = {"model_name": [], "layer_name": []}
+ for m in ["r3d", "s3d"]:
+ parameters["model_name"].append(m)
+
+ if high_level:
+ parameters["layer_name"] = [
+ "conv1",
+ "layer1",
+ "layer2",
+ "layer3",
+ "layer4",
+ "avgpool",
+ ]
+ else:
+ for name, layer in self.model.named_modules():
+ parameters["layer_name"].append(name)
+ return parameters
+
+ def transform(self, modality):
+ dataset = CustomDataset(modality.data, torch.float32, DEVICE)
+ embeddings = {}
+
+ features = None
+
+ def hook(module, input, output):
+ pooled = torch.nn.functional.adaptive_avg_pool3d(output, 1).squeeze()
+ nonlocal features
+ features = pooled.detach().cpu().numpy()
+
+ handle = self.model.blocks[6].dropout.register_forward_hook(hook)
+
+ for instance in dataset:
+ video_id = instance["id"]
+ frames = instance["data"].to(DEVICE)
+ embeddings[video_id] = []
+
+ batch = torch.transpose(frames, 1, 0)
+ batch = batch.unsqueeze(0)
+ _ = self.model(batch)
+
+ embeddings[video_id] = features
+
+ transformed_modality = TransformedModality(
+ modality, self, self.output_modality_type
)
transformed_modality.data = list(embeddings.values())
diff --git a/src/main/python/systemds/scuro/utils/converter.py b/src/main/python/systemds/scuro/utils/converter.py
new file mode 100644
index 0000000..030fc4a
--- /dev/null
+++ b/src/main/python/systemds/scuro/utils/converter.py
@@ -0,0 +1,49 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import numpy as np
+import torch
+
+
+def numpy_dtype_to_torch_dtype(dtype):
+    """
+    Translate a NumPy dtype (or any spec accepted by np.dtype) into a torch dtype.
+    A torch dtype is returned unchanged; np.float16 is mapped to torch.bfloat16.
+    Raises ValueError when the NumPy dtype has no entry in the lookup table.
+    """
+    if isinstance(dtype, torch.dtype):
+        return dtype
+
+    resolved = np.dtype(dtype)
+    torch_by_numpy = {
+        np.float32: torch.float32,
+        np.float64: torch.float64,
+        np.float16: torch.bfloat16,
+        np.uint8: torch.uint8,
+        np.int8: torch.int8,
+        np.int16: torch.int16,
+        np.int32: torch.int32,
+        np.int64: torch.int64,
+    }
+
+    if resolved.type not in torch_by_numpy:
+        raise ValueError(f"No corresponding torch dtype for NumPy dtype {resolved}")
+    return torch_by_numpy[resolved.type]
diff --git a/src/main/python/systemds/scuro/utils/torch_dataset.py b/src/main/python/systemds/scuro/utils/torch_dataset.py
index a0f3d88..c04be0e 100644
--- a/src/main/python/systemds/scuro/utils/torch_dataset.py
+++ b/src/main/python/systemds/scuro/utils/torch_dataset.py
@@ -20,20 +20,26 @@
# -------------------------------------------------------------
from typing import Dict
-import numpy as np
import torch
import torchvision.transforms as transforms
class CustomDataset(torch.utils.data.Dataset):
- def __init__(self, data):
+ def __init__(self, data, data_type, device, size=None):
self.data = data
+ self.data_type = data_type
+ self.device = device
+ self.size = size
+ if size is None:
+ self.size = (256, 224)
+
self.tf = transforms.Compose(
[
transforms.ToPILImage(),
- transforms.Resize(256),
- transforms.CenterCrop(224),
+ transforms.Resize(self.size[0]),
+ transforms.CenterCrop(self.size[1]),
transforms.ToTensor(),
+ transforms.ConvertImageDtype(dtype=self.data_type),
transforms.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
),
@@ -42,20 +48,18 @@
def __getitem__(self, index) -> Dict[str, object]:
data = self.data[index]
- if type(data) is np.ndarray:
- output = torch.empty((1, 3, 224, 224))
- d = torch.tensor(data)
- d = d.repeat(3, 1, 1)
- output[0] = self.tf(d)
- else:
- output = torch.empty((len(data), 3, 224, 224))
+ output = torch.empty(
+ (len(data), 3, self.size[1], self.size[1]),
+ dtype=self.data_type,
+ device=self.device,
+ )
- for i, d in enumerate(data):
- if data[0].ndim < 3:
- d = torch.tensor(d)
- d = d.repeat(3, 1, 1)
+ for i, d in enumerate(data):
+ if data[0].ndim < 3:
+ d = torch.tensor(d)
+ d = d.repeat(3, 1, 1)
- output[i] = self.tf(d)
+ output[i] = self.tf(d)
return {"id": index, "data": output}
diff --git a/src/main/python/systemds/utils/helpers.py b/src/main/python/systemds/utils/helpers.py
index 05c9bf0..887b314 100644
--- a/src/main/python/systemds/utils/helpers.py
+++ b/src/main/python/systemds/utils/helpers.py
@@ -23,7 +23,7 @@
from importlib.util import find_spec
from itertools import chain
from typing import Dict, Iterable
-
+import torch
from systemds.utils.consts import MODULE_NAME
diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py
index e31887f..fbb50ac 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -26,6 +26,7 @@
import random
import os
+from systemds.scuro.dataloader.base_loader import BaseLoader
from systemds.scuro.dataloader.video_loader import VideoLoader
from systemds.scuro.dataloader.audio_loader import AudioLoader
from systemds.scuro.dataloader.text_loader import TextLoader
@@ -34,10 +35,31 @@
from systemds.scuro.modality.type import ModalityType
+class TestDataLoader(BaseLoader):
+    """BaseLoader for tests that serves in-memory data instead of reading files."""
+    __test__ = False  # "Test*" name: keep pytest from collecting this helper class
+
+    def __init__(self, indices, chunk_size, modality_type, data, data_type, metadata):
+        super().__init__("", indices, data_type, chunk_size, modality_type)
+        self.metadata = metadata
+        self.test_data = data
+
+    def reset(self):
+        self._next_chunk = 0
+        self.data = []
+
+    def extract(self, file, indices):
+        src = self.test_data  # lists are picked element-wise, anything else is indexed
+        self.data = [src[i] for i in indices] if isinstance(src, list) else src[indices]
+
+
class ModalityRandomDataGenerator:
def __init__(self):
- self._modality_id = 0
+ self.modality_id = 0
+ self.modality_type = None
+ self.metadata = {}
+ self.data_type = np.float32
def create1DModality(
self,
@@ -45,32 +67,125 @@
num_features,
modality_type,
):
- data = np.random.rand(num_instances, num_features)
+ data = np.random.rand(num_instances, num_features).astype(self.data_type)
+ data.dtype = self.data_type
+
# TODO: write a dummy method to create the same metadata for all instances to avoid the for loop
- metadata = {}
+ self.modality_type = modality_type
for i in range(num_instances):
if modality_type == ModalityType.AUDIO:
- metadata[i] = modality_type.create_audio_metadata(
+ self.metadata[i] = modality_type.create_audio_metadata(
num_features / 10, data[i]
)
elif modality_type == ModalityType.TEXT:
- metadata[i] = modality_type.create_text_metadata(
+ self.metadata[i] = modality_type.create_text_metadata(
num_features / 10, data[i]
)
elif modality_type == ModalityType.VIDEO:
- metadata[i] = modality_type.create_video_metadata(
+ self.metadata[i] = modality_type.create_video_metadata(
num_features / 30, 10, 0, 0, 1
)
else:
raise NotImplementedError
- tf_modality = TransformedModality(
- modality_type, "test_transformation", self._modality_id, metadata
- )
+ tf_modality = TransformedModality(self, "test_transformation")
tf_modality.data = data
- self._modality_id += 1
+ self.modality_id += 1
return tf_modality
+    def create_audio_data(self, num_instances, num_features):
+        """Create random float32 audio data plus per-instance audio metadata."""
+        samples = np.random.rand(num_instances, num_features)
+        samples = samples.astype(np.float32)
+        metadata = {}
+        for idx in range(num_instances):
+            metadata[idx] = ModalityType.AUDIO.create_audio_metadata(16000, samples[idx])
+        return samples, metadata
+
+    def create_text_data(self, num_instances):
+        """
+        Build num_instances random "<subject> [<adverb>] <verb> <object>" sentences
+        (adverb kept with probability 0.7) and return them with their text metadata.
+        """
+        subjects = [
+            "The cat",
+            "A dog",
+            "The student",
+            "The teacher",
+            "The bird",
+            "The child",
+            "The programmer",
+            "The scientist",
+            "A researcher",
+        ]
+        verbs = [
+            "reads",
+            "writes",
+            "studies",
+            "analyzes",
+            "creates",
+            "develops",
+            "designs",
+            "implements",
+            "examines",
+        ]
+        objects = [
+            "the document",
+            "the code",
+            "the data",
+            "the problem",
+            "the solution",
+            "the project",
+            "the research",
+            "the paper",
+        ]
+        adverbs = [
+            "carefully",
+            "quickly",
+            "efficiently",
+            "thoroughly",
+            "diligently",
+            "precisely",
+            "methodically",
+        ]
+
+        sentences = []
+        for _ in range(num_instances):
+            include_adverb = np.random.random() < 0.7
+            subject = np.random.choice(subjects)
+            verb = np.random.choice(verbs)
+            obj = np.random.choice(objects)
+            adverb = np.random.choice(adverbs) if include_adverb else ""
+            # Skip the empty adverb so the sentence never contains a double space.
+            sentences.append(" ".join(w for w in (subject, adverb, verb, obj) if w))
+
+        metadata = {
+            i: ModalityType.TEXT.create_text_metadata(len(sentences[i]), sentences[i])
+            for i in range(num_instances)
+        }
+        return sentences, metadata
+
+    def create_visual_modality(self, num_instances, num_frames=1, height=28, width=28):
+        """
+        Create random float16 pixel data of shape (num_instances, num_frames, height,
+        width) with values in [0, 256) and the matching per-instance metadata.
+        """
+        if num_frames == 1:
+            # TODO: create image metadata; the empty dict keeps `metadata` defined.
+            metadata = {}
+        else:
+            metadata = {
+                i: ModalityType.VIDEO.create_video_metadata(
+                    30, num_frames, width, height, 1
+                )
+                for i in range(num_instances)
+            }
+
+        pixels = np.random.randint(
+            0, 256, (num_instances, num_frames, height, width)
+        ).astype(np.float16)
+        return pixels, metadata
+
def setup_data(modalities, num_instances, path):
if os.path.isdir(path):
@@ -202,7 +317,7 @@
def __create_audio_data(self, idx, duration, speed_factor):
path = f"{self.path}/AUDIO/{idx}.wav"
- sample_rate = 44100
+ sample_rate = 16000
t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
frequency_variation = random.uniform(200.0, 500.0)
diff --git a/src/main/python/tests/scuro/test_dr_search.py b/src/main/python/tests/scuro/test_dr_search.py
index 521ff3f..50f57ee 100644
--- a/src/main/python/tests/scuro/test_dr_search.py
+++ b/src/main/python/tests/scuro/test_dr_search.py
@@ -38,10 +38,10 @@
from systemds.scuro.representations.lstm import LSTM
from systemds.scuro.representations.max import RowMax
from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
-from systemds.scuro.representations.multiplication import Multiplication
+from systemds.scuro.representations.hadamard import Hadamard
from systemds.scuro.representations.resnet import ResNet
from systemds.scuro.representations.sum import Sum
-from tests.scuro.data_generator import setup_data
+from tests.scuro.data_generator import ModalityRandomDataGenerator
import warnings
@@ -91,36 +91,27 @@
@classmethod
def setUpClass(cls):
- cls.test_file_path = "test_data_dr_search"
cls.num_instances = 20
- modalities = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
+ cls.data_generator = ModalityRandomDataGenerator()
- cls.data_generator = setup_data(
- modalities, cls.num_instances, cls.test_file_path
- )
- os.makedirs(f"{cls.test_file_path}/embeddings")
-
+ cls.labels = np.random.choice([0, 1], size=cls.num_instances)
# TODO: adapt the representation so they return non aggregated values. Apply windowing operation instead
- cls.bert = cls.data_generator.modalities_by_type[
- ModalityType.TEXT
- ].apply_representation(Bert())
- cls.mel_spe = (
- cls.data_generator.modalities_by_type[ModalityType.AUDIO]
- .apply_representation(MelSpectrogram())
- .flatten()
+ cls.video = cls.data_generator.create1DModality(
+ cls.num_instances, 100, ModalityType.VIDEO
)
- cls.resnet = (
- cls.data_generator.modalities_by_type[ModalityType.VIDEO]
- .apply_representation(ResNet())
- .window(10, "mean")
- .flatten()
+ cls.text = cls.data_generator.create1DModality(
+ cls.num_instances, 100, ModalityType.TEXT
)
- cls.mods = [cls.bert, cls.mel_spe, cls.resnet]
+ cls.audio = cls.data_generator.create1DModality(
+ cls.num_instances, 100, ModalityType.AUDIO
+ )
+
+ cls.mods = [cls.video, cls.audio, cls.text]
split = train_test_split(
- cls.data_generator.indices,
- cls.data_generator.labels,
+ np.array(range(cls.num_instances)),
+ cls.labels,
test_size=0.2,
random_state=42,
)
@@ -134,22 +125,17 @@
cls.representations = [
Concatenation(),
Average(),
- RowMax(100),
- Multiplication(),
+ RowMax(),
+ Hadamard(),
Sum(),
LSTM(width=256, depth=3),
]
- @classmethod
- def tearDownClass(cls):
- print("Cleaning up test data")
- shutil.rmtree(cls.test_file_path)
-
def test_enumerate_all(self):
task = Task(
"TestTask",
TestSVM(),
- self.data_generator.labels,
+ self.labels,
self.train_indizes,
self.val_indizes,
)
@@ -164,7 +150,7 @@
task = Task(
"TestTask",
TestSVM(),
- self.data_generator.labels,
+ self.labels,
self.train_indizes,
self.val_indizes,
)
diff --git a/src/main/python/tests/scuro/test_fusion_orders.py b/src/main/python/tests/scuro/test_fusion_orders.py
new file mode 100644
index 0000000..eb01d18
--- /dev/null
+++ b/src/main/python/tests/scuro/test_fusion_orders.py
@@ -0,0 +1,95 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import os
+import shutil
+import unittest
+import numpy as np
+
+from systemds.scuro import Concatenation, RowMax, Hadamard
+from systemds.scuro.modality.unimodal_modality import UnimodalModality
+from systemds.scuro.representations.bert import Bert
+from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
+from systemds.scuro.representations.average import Average
+from tests.scuro.data_generator import ModalityRandomDataGenerator
+from systemds.scuro.modality.type import ModalityType
+
+
+class TestFusionOrders(unittest.TestCase):
+    """
+    Verifies how fused data depends on the order and grouping in which three
+    random 1D modalities are combined by different fusion operators.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        cls.num_instances = 40
+        cls.data_generator = ModalityRandomDataGenerator()
+        make = cls.data_generator.create1DModality
+        # Three random modalities with 40 instances and 100 features each.
+        cls.r_1 = make(40, 100, ModalityType.AUDIO)
+        cls.r_2 = make(40, 100, ModalityType.TEXT)
+        cls.r_3 = make(40, 100, ModalityType.TEXT)
+
+    def _fuse_all_orders(self, fusion):
+        """
+        Combine r_1, r_2 and r_3 in every order under test, building a fresh
+        operator via fusion() for each combine call.
+
+        Returns (r1+r2, r2+r1, (r1+r2)+r3, (r2+r1)+r3, r1+[r2, r3]).
+        """
+        first_second = self.r_1.combine(self.r_2, fusion())
+        second_first = self.r_2.combine(self.r_1, fusion())
+        first_second_third = first_second.combine(self.r_3, fusion())
+        second_first_third = second_first.combine(self.r_3, fusion())
+        joint = self.r_1.combine([self.r_2, self.r_3], fusion())
+        return (
+            first_second,
+            second_first,
+            first_second_third,
+            second_first_third,
+            joint,
+        )
+
+    def _check_orders(self, fusion, swap_equal, regroup_equal):
+        """
+        Assert that swapping the operands gives equal data iff swap_equal, that
+        pairwise-then-third equals the joint three-way fusion iff regroup_equal,
+        and that the two-way result always differs from the three-way result.
+        """
+        ab, ba, ab_c, ba_c, abc = self._fuse_all_orders(fusion)
+        self.assertEqual(np.array_equal(ab.data, ba.data), swap_equal)
+        self.assertEqual(np.array_equal(ab_c.data, ba_c.data), swap_equal)
+        self.assertEqual(np.array_equal(ab_c.data, abc.data), regroup_equal)
+        self.assertFalse(np.array_equal(ab.data, abc.data))
+
+    def test_fusion_order_avg(self):
+        self._check_orders(Average, swap_equal=True, regroup_equal=False)
+
+    def test_fusion_order_concat(self):
+        # Concatenation is order-sensitive: no pair of orderings yields equal data.
+        self._check_orders(Concatenation, swap_equal=False, regroup_equal=False)
+
+    def test_fusion_order_max(self):
+        self._check_orders(RowMax, swap_equal=True, regroup_equal=True)
+
+    def test_fusion_order_hadamard(self):
+        self._check_orders(Hadamard, swap_equal=True, regroup_equal=True)
diff --git a/src/main/python/tests/scuro/test_multimodal_fusion.py b/src/main/python/tests/scuro/test_multimodal_fusion.py
index 8456279..77f0305 100644
--- a/src/main/python/tests/scuro/test_multimodal_fusion.py
+++ b/src/main/python/tests/scuro/test_multimodal_fusion.py
@@ -42,7 +42,11 @@
from systemds.scuro.representations.word2vec import W2V
from systemds.scuro.modality.unimodal_modality import UnimodalModality
from systemds.scuro.representations.resnet import ResNet
-from tests.scuro.data_generator import setup_data
+from tests.scuro.data_generator import (
+ setup_data,
+ TestDataLoader,
+ ModalityRandomDataGenerator,
+)
from systemds.scuro.dataloader.audio_loader import AudioLoader
from systemds.scuro.dataloader.video_loader import VideoLoader
@@ -109,15 +113,14 @@
@classmethod
def setUpClass(cls):
- cls.test_file_path = "fusion_optimizer_test_data"
-
cls.num_instances = 10
cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
+ cls.labels = np.random.choice([0, 1], size=cls.num_instances)
+ cls.indices = np.array(range(cls.num_instances))
- cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path)
split = train_test_split(
- cls.data_generator.indices,
- cls.data_generator.labels,
+ cls.indices,
+ cls.labels,
test_size=0.2,
random_state=42,
)
@@ -129,48 +132,52 @@
Task(
"UnimodalRepresentationTask1",
TestSVM(),
- cls.data_generator.labels,
+ cls.labels,
cls.train_indizes,
cls.val_indizes,
),
Task(
"UnimodalRepresentationTask2",
TestCNN(),
- cls.data_generator.labels,
+ cls.labels,
cls.train_indizes,
cls.val_indizes,
),
]
- @classmethod
- def tearDownClass(cls):
- shutil.rmtree(cls.test_file_path)
-
def test_multimodal_fusion(self):
task = Task(
"UnimodalRepresentationTask1",
TestSVM(),
- self.data_generator.labels,
+ self.labels,
self.train_indizes,
self.val_indizes,
)
- audio_data_loader = AudioLoader(
- self.data_generator.get_modality_path(ModalityType.AUDIO),
- self.data_generator.indices,
- )
- audio = UnimodalModality(audio_data_loader)
- text_data_loader = TextLoader(
- self.data_generator.get_modality_path(ModalityType.TEXT),
- self.data_generator.indices,
+ audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data(
+ self.num_instances, 100
)
- text = UnimodalModality(text_data_loader)
-
- video_data_loader = VideoLoader(
- self.data_generator.get_modality_path(ModalityType.VIDEO),
- self.data_generator.indices,
+ text_data, text_md = ModalityRandomDataGenerator().create_text_data(
+ self.num_instances
)
- video = UnimodalModality(video_data_loader)
+ video_data, video_md = ModalityRandomDataGenerator().create_visual_modality(
+ self.num_instances, 60
+ )
+ audio = UnimodalModality(
+ TestDataLoader(
+ self.indices, None, ModalityType.AUDIO, audio_data, np.float32, audio_md
+ )
+ )
+ video = UnimodalModality(
+ TestDataLoader(
+ self.indices, None, ModalityType.VIDEO, video_data, np.float32, video_md
+ )
+ )
+ text = UnimodalModality(
+ TestDataLoader(
+ self.indices, None, ModalityType.TEXT, text_data, str, text_md
+ )
+ )
with patch.object(
Registry,
@@ -200,3 +207,7 @@
debug=False,
)
multimodal_optimizer.optimize()
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/src/main/python/tests/scuro/test_multimodal_join.py b/src/main/python/tests/scuro/test_multimodal_join.py
index a5e3a7c..9e3a16f 100644
--- a/src/main/python/tests/scuro/test_multimodal_join.py
+++ b/src/main/python/tests/scuro/test_multimodal_join.py
@@ -23,11 +23,13 @@
import shutil
import unittest
+import numpy as np
+import copy
from systemds.scuro.modality.joined import JoinCondition
from systemds.scuro.modality.unimodal_modality import UnimodalModality
from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
from systemds.scuro.representations.resnet import ResNet
-from tests.scuro.data_generator import setup_data
+from tests.scuro.data_generator import TestDataLoader, ModalityRandomDataGenerator
from systemds.scuro.dataloader.audio_loader import AudioLoader
from systemds.scuro.dataloader.video_loader import VideoLoader
@@ -46,16 +48,15 @@
@classmethod
def setUpClass(cls):
- cls.test_file_path = "join_test_data"
cls.num_instances = 4
- cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO]
+ cls.indices = np.array(range(cls.num_instances))
+ cls.audio_data, cls.audio_md = ModalityRandomDataGenerator().create_audio_data(
+ cls.num_instances, 32000
+ )
- cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path)
-
- @classmethod
- def tearDownClass(cls):
- print("Cleaning up test data")
- shutil.rmtree(cls.test_file_path)
+ cls.video_data, cls.video_md = (
+ ModalityRandomDataGenerator().create_visual_modality(cls.num_instances, 60)
+ )
def test_video_audio_join(self):
self._execute_va_join()
@@ -91,19 +92,26 @@
self._join(audio, video, 2)
def _prepare_data(self, l_chunk_size=None, r_chunk_size=None):
- video_data_loader = VideoLoader(
- self.data_generator.get_modality_path(ModalityType.VIDEO),
- self.data_generator.indices,
- chunk_size=l_chunk_size,
+ audio = UnimodalModality(
+ TestDataLoader(
+ self.indices,
+ r_chunk_size,
+ ModalityType.AUDIO,
+ copy.deepcopy(self.audio_data),
+ np.float32,
+ copy.deepcopy(self.audio_md),
+ )
)
- video = UnimodalModality(video_data_loader)
-
- audio_data_loader = AudioLoader(
- self.data_generator.get_modality_path(ModalityType.AUDIO),
- self.data_generator.indices,
- r_chunk_size,
+ video = UnimodalModality(
+ TestDataLoader(
+ self.indices,
+ l_chunk_size,
+ ModalityType.VIDEO,
+ copy.deepcopy(self.video_data),
+ np.float32,
+ copy.deepcopy(self.video_md),
+ )
)
- audio = UnimodalModality(audio_data_loader)
mel_audio = audio.apply_representation(MelSpectrogram())
@@ -114,8 +122,8 @@
left_modality.join(
right_modality, JoinCondition("timestamp", "timestamp", "<")
)
- .apply_representation(ResNet(layer="layer1.0.conv2", model_name="ResNet50"))
- .window(window_size, "mean")
+ .apply_representation(ResNet(layer="layer1.0.conv2", model_name="ResNet18"))
+ .window_aggregation(window_size, "mean")
.combine("concat")
)
diff --git a/src/main/python/tests/scuro/test_operator_registry.py b/src/main/python/tests/scuro/test_operator_registry.py
index aaecde2..7f2a752 100644
--- a/src/main/python/tests/scuro/test_operator_registry.py
+++ b/src/main/python/tests/scuro/test_operator_registry.py
@@ -23,7 +23,7 @@
from systemds.scuro.representations.mfcc import MFCC
from systemds.scuro.representations.wav2vec import Wav2Vec
-from systemds.scuro.representations.window import WindowAggregation
+from systemds.scuro.representations.window_aggregation import WindowAggregation
from systemds.scuro.representations.bow import BoW
from systemds.scuro.representations.word2vec import W2V
from systemds.scuro.representations.tfidf import TfIdf
@@ -36,7 +36,7 @@
from systemds.scuro.representations.max import RowMax
from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
from systemds.scuro.representations.spectrogram import Spectrogram
-from systemds.scuro.representations.multiplication import Multiplication
+from systemds.scuro.representations.hadamard import Hadamard
from systemds.scuro.representations.resnet import ResNet
from systemds.scuro.representations.sum import Sum
diff --git a/src/main/python/tests/scuro/test_unimodal_optimizer.py b/src/main/python/tests/scuro/test_unimodal_optimizer.py
index bfc52f0..9ed034e 100644
--- a/src/main/python/tests/scuro/test_unimodal_optimizer.py
+++ b/src/main/python/tests/scuro/test_unimodal_optimizer.py
@@ -39,7 +39,7 @@
from systemds.scuro.representations.word2vec import W2V
from systemds.scuro.modality.unimodal_modality import UnimodalModality
from systemds.scuro.representations.resnet import ResNet
-from tests.scuro.data_generator import setup_data
+from tests.scuro.data_generator import ModalityRandomDataGenerator, TestDataLoader
from systemds.scuro.dataloader.audio_loader import AudioLoader
from systemds.scuro.dataloader.video_loader import VideoLoader
@@ -101,21 +101,19 @@
class TestUnimodalRepresentationOptimizer(unittest.TestCase):
- test_file_path = None
data_generator = None
num_instances = 0
@classmethod
def setUpClass(cls):
- cls.test_file_path = "unimodal_optimizer_test_data"
-
cls.num_instances = 10
cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
+ cls.labels = np.random.choice([0, 1], size=cls.num_instances)
+ cls.indices = np.array(range(cls.num_instances))
- cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path)
split = train_test_split(
- cls.data_generator.indices,
- cls.data_generator.labels,
+ cls.indices,
+ cls.labels,
test_size=0.2,
random_state=42,
)
@@ -127,46 +125,51 @@
Task(
"UnimodalRepresentationTask1",
TestSVM(),
- cls.data_generator.labels,
+ cls.labels,
cls.train_indizes,
cls.val_indizes,
),
Task(
"UnimodalRepresentationTask2",
TestCNN(),
- cls.data_generator.labels,
+ cls.labels,
cls.train_indizes,
cls.val_indizes,
),
]
- @classmethod
- def tearDownClass(cls):
- shutil.rmtree(cls.test_file_path)
-
def test_unimodal_optimizer_for_audio_modality(self):
- audio_data_loader = AudioLoader(
- self.data_generator.get_modality_path(ModalityType.AUDIO),
- self.data_generator.indices,
+ audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data(
+ self.num_instances, 100
)
- audio = UnimodalModality(audio_data_loader)
+ audio = UnimodalModality(
+ TestDataLoader(
+ self.indices, None, ModalityType.AUDIO, audio_data, np.float32, audio_md
+ )
+ )
self.optimize_unimodal_representation_for_modality(audio)
def test_unimodal_optimizer_for_text_modality(self):
- text_data_loader = TextLoader(
- self.data_generator.get_modality_path(ModalityType.TEXT),
- self.data_generator.indices,
+ text_data, text_md = ModalityRandomDataGenerator().create_text_data(
+ self.num_instances
)
- text = UnimodalModality(text_data_loader)
+ text = UnimodalModality(
+ TestDataLoader(
+ self.indices, None, ModalityType.TEXT, text_data, str, text_md
+ )
+ )
self.optimize_unimodal_representation_for_modality(text)
def test_unimodal_optimizer_for_video_modality(self):
- video_data_loader = VideoLoader(
- self.data_generator.get_modality_path(ModalityType.VIDEO),
- self.data_generator.indices,
+ video_data, video_md = ModalityRandomDataGenerator().create_visual_modality(
+ self.num_instances, 60
)
- video = UnimodalModality(video_data_loader)
+ video = UnimodalModality(
+ TestDataLoader(
+ self.indices, None, ModalityType.VIDEO, video_data, np.float32, video_md
+ )
+ )
self.optimize_unimodal_representation_for_modality(video)
def optimize_unimodal_representation_for_modality(self, modality):
@@ -201,3 +204,7 @@
)
>= 1
)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/src/main/python/tests/scuro/test_unimodal_representations.py b/src/main/python/tests/scuro/test_unimodal_representations.py
index ac167e8..2f2e64e 100644
--- a/src/main/python/tests/scuro/test_unimodal_representations.py
+++ b/src/main/python/tests/scuro/test_unimodal_representations.py
@@ -29,6 +29,7 @@
from systemds.scuro.modality.unimodal_modality import UnimodalModality
from systemds.scuro.representations.bert import Bert
from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
+from systemds.scuro.representations.mfcc import MFCC
from systemds.scuro.representations.resnet import ResNet
from tests.scuro.data_generator import setup_data
@@ -63,7 +64,7 @@
shutil.rmtree(cls.test_file_path)
def test_audio_representations(self):
- audio_representations = [MelSpectrogram()] # TODO: add FFT, TFN, 1DCNN
+ audio_representations = [MFCC()] # TODO: add FFT, TFN, 1DCNN
audio_data_loader = AudioLoader(
self.data_generator.get_modality_path(ModalityType.AUDIO),
self.data_generator.indices,
diff --git a/src/main/python/tests/scuro/test_window_operations.py b/src/main/python/tests/scuro/test_window_operations.py
index d7210dd..ea1b0f4 100644
--- a/src/main/python/tests/scuro/test_window_operations.py
+++ b/src/main/python/tests/scuro/test_window_operations.py
@@ -51,7 +51,7 @@
def run_window_operations_for_modality(self, modality_type, window_size):
r = self.data_generator.create1DModality(40, 100, modality_type)
for aggregation in self.aggregations:
- windowed_modality = r.window(window_size, aggregation)
+ windowed_modality = r.window_aggregation(window_size, aggregation)
self.verify_window_operation(aggregation, r, windowed_modality, window_size)