Commit e7123489 authored by Oleh Astappiev

feat: better way to store embeddings compressed

parent 5fb38925
@@ -2,6 +2,7 @@ import sys
 sys.path.append("..")
 from src.model.alexnet import AlexNetModel, TARGET_SHAPE
+from src.data.simple3 import Simple3
 from src.data.imagenette import Imagenette
 from src.data.cifar10 import Cifar10
 from src.utils.embeddings import project_embeddings, load_weights_of, get_embeddings_of, save_vectors
@@ -9,6 +10,7 @@ from src.model.siamese import SiameseModel
 dataset = Imagenette(image_size=TARGET_SHAPE, map_fn=AlexNetModel.preprocess_input)
 # dataset = Cifar10(image_size=TARGET_SHAPE, map_fn=AlexNetModel.preprocess_input)
+# dataset = Simple3(image_size=TARGET_SHAPE, map_fn=AlexNetModel.preprocess_input)
 model = AlexNetModel()
 model.compile()
...
@@ -26,7 +26,6 @@ class Cifar10(AsbDataset):
     label_mode='int',
     batch_size=batch_size,
     image_size=image_size,
-    shuffle=False,
     interpolation='nearest'
 )
...
@@ -26,7 +26,6 @@ class Imagenette(AsbDataset):
     label_mode='int',
     batch_size=batch_size,
     image_size=image_size,
-    shuffle=False,
     interpolation='nearest'
 )
...
@@ -15,25 +15,14 @@ TRAIN_BATCH_SIZE = 128
 STEPS_PER_EPOCH = 100  # TODO: try restore 1000
 
-@tf.function
-def make_label_for_pair(embeddings, labels):
-    # embedding_1, label_1 = tuple_1
-    # embedding_2, label_2 = tuple_2
-    return (embeddings[0, :], embeddings[1, :]), tf.cast(labels[0] == labels[1], tf.float32)
-
 class SiameseModel(Model):
     """ Filippo's Siamese model
-    The `projection_model` is the part of the network that generates the final image vector (currently, a simple Dense layer with tanh activation, but it can be as complex as needed).
-    The `siamese` model is the one we train. It applies the projection model to two embeddings, calculates the euclidean distance between the two generated image vectors and calculates the contrastive loss.
-    As a note, [here](https://towardsdatascience.com/contrastive-loss-explaned-159f2d4a87ec) they mention that cosine distance is preferable to euclidean distance:
-    > in a large dimensional space, all points tend to be far apart by the euclidean measure. In higher dimensions, the angle between vectors is a more effective measure.
-    Note that, when using cosine distance, the margin needs to be reduced from its default value of 1 (see below).
+    The `embedding_model` is a model used to extract embeddings, also used to create the `inference_model`.
+    The `projection_model` is the part of the network that generates the final image vector; it uses embeddings as input.
+    The `inference_model` is the combined model, using `embedding_model`'s input and `projection_model`'s output.
+    The `siamese` model is the one we train. It applies the projection model to two embeddings,
+    calculates the euclidean distance between the two generated image vectors and calculates the contrastive loss.
     """
 
     def __init__(self, embedding_model, image_vector_dimensions=IMAGE_VECTOR_DIMENSIONS):
@@ -46,7 +35,6 @@ class SiameseModel(Model):
         emb_input_1 = layers.Input(self.embedding_vector_dimension)
         emb_input_2 = layers.Input(self.embedding_vector_dimension)
 
-        """ Projection model is a model from embeddings to image vector """
         # projection model is the one to use for queries (put in a sequence after the embedding-generator model above)
         self.projection_model = tf.keras.models.Sequential([
             # layers.Dense(image_vector_dimensions, activation=ACTIVATION_FN, input_shape=(embedding_vector_dimension,))
@@ -57,6 +45,11 @@ class SiameseModel(Model):
         v1 = self.projection_model(emb_input_1)
         v2 = self.projection_model(emb_input_2)
 
+        # As a note, [here](https://towardsdatascience.com/contrastive-loss-explaned-159f2d4a87ec) they mention that
+        # cosine distance is preferable to euclidean distance: in a large dimensional space, all points tend to be far
+        # apart by the euclidean measure. In higher dimensions, the angle between vectors is a more effective measure.
+        # Note that, when using cosine distance, the margin needs to be reduced from its default value of 1 (see below).
         computed_distance = layers.Lambda(cosine_distance)([v1, v2])
         # computed_distance = layers.Lambda(euclidean_distance)([v1, v2])
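The comment block added above explains the choice of cosine distance and the need to shrink the contrastive-loss margin. For reference, the snippet below is a generic textbook formulation of contrastive loss, not this repository's implementation; `contrastive_loss` and its arguments are illustrative names, shown only to make explicit where the margin enters for non-matching pairs.

```python
import tensorflow as tf

def contrastive_loss(y_true, distance, margin=1.0):
    """Textbook contrastive loss: y_true = 1 for matching pairs, 0 otherwise."""
    y_true = tf.cast(y_true, distance.dtype)
    positive_term = y_true * tf.square(distance)  # pulls matching pairs together
    # pushes non-matching pairs apart, but only while they are closer than the margin
    negative_term = (1.0 - y_true) * tf.square(tf.maximum(margin - distance, 0.0))
    return tf.reduce_mean(positive_term + negative_term)

# one matching pair at distance 0.1 and one non-matching pair at distance 1.5
print(contrastive_loss(tf.constant([1.0, 0.0]), tf.constant([0.1, 1.5])).numpy())
```

Pairs whose distance already exceeds the margin contribute nothing to the negative term, which is why the margin has to be chosen relative to the range of the distance function.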
@@ -84,26 +77,26 @@ class SiameseModel(Model):
     @staticmethod
     def prepare_dataset(emb_vectors, emb_labels):
         """
-        We already have the embeddings precomputed in `embeddings` and their matching `labels`. To train the siamese networks, we need to generate random pairs of embeddings, assigning as target `1` if the two come from the same class and `0` otherwise.
-        In order to keep the training balanced, we can't simply select two random `(embedding, label)` tuples from the dataset, because this is heavily unbalanced towards the negative class. To keep things simple, we'll randomly select two samples and then use `rejection_resample` to rebalance the classes.
-        **NOTE**: rejection resampling works only if the number of classes is reasonably low: with 10 classes there's a 90% probability that a sample will be rejected, it can get very inefficient very quickly if the number of classes is too great.
+        To train the siamese networks, we need to generate random pairs of embeddings,
+        assigning as target `1` if the two come from the same class and `0` otherwise.
         """
-        # zip together embeddings and their labels, cache in memory (maybe not necessary or maybe faster this way), shuffle, repeat forever.
+        # zip together embeddings and labels, cache in memory (maybe not necessary), shuffle, repeat forever
         embeddings_ds = tf.data.Dataset.zip((
             tf.data.Dataset.from_tensor_slices(emb_vectors),
             tf.data.Dataset.from_tensor_slices(emb_labels)
         )).cache().shuffle(1000).repeat()
 
+        @tf.function
+        def make_label_for_pair(embeddings, labels):
+            return (embeddings[0, :], embeddings[1, :]), tf.cast(labels[0] == labels[1], tf.uint8)
+
         # because of shuffling, we can take two adjacent tuples as a randomly matched pair
-        train_ds = embeddings_ds.window(2, drop_remainder=True)
-        train_ds = train_ds.flat_map(lambda w1, w2: tf.data.Dataset.zip((w1.batch(2), w2.batch(2))))  # see https://stackoverflow.com/questions/55429307/how-to-use-windows-created-by-the-dataset-window-method-in-tensorflow-2-0
+        # each "window" is a dataset that contains a subset of elements of the input dataset
+        windows_ds = embeddings_ds.window(2, drop_remainder=True)
+        # https://stackoverflow.com/questions/55429307/how-to-use-windows-created-by-the-dataset-window-method-in-tensorflow-2-0
+        flat_ds = windows_ds.flat_map(lambda w1, w2: tf.data.Dataset.zip((w1.batch(2), w2.batch(2))))
 
         # generate the target label depending on whether the labels match or not
-        train_ds = train_ds.map(make_label_for_pair, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
-
-        # resample to the desired distribution
-        # train_ds = train_ds.rejection_resample(lambda embs, target: tf.cast(target, tf.int32), [0.5, 0.5], initial_dist=[0.9, 0.1])
-        # train_ds = train_ds.map(lambda _, vals: vals)  # discard the prepended "selected" class from the rejection resample, since we already have it available
-
-        train_ds = train_ds.batch(TRAIN_BATCH_SIZE)  # .prefetch(tf.data.AUTOTUNE)
+        map_ds = flat_ds.map(make_label_for_pair, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
+        train_ds = map_ds.batch(TRAIN_BATCH_SIZE)  # .prefetch(tf.data.AUTOTUNE)
 
         return train_ds
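The refactored `prepare_dataset` keeps the same pairing trick: shuffle, group two consecutive elements with `window(2)`, flatten each window back into regular tensors, then label the pair by comparing classes. Below is a minimal standalone sketch of that pipeline on toy data; all names and sizes are illustrative and not taken from the repository.

```python
import numpy as np
import tensorflow as tf

vectors = np.random.rand(8, 4).astype('float32')           # fake embeddings
labels = np.array([0, 1, 0, 1, 2, 2, 0, 1], dtype='uint8')  # fake class labels

ds = tf.data.Dataset.zip((
    tf.data.Dataset.from_tensor_slices(vectors),
    tf.data.Dataset.from_tensor_slices(labels),
)).shuffle(8)

# window(2) groups two consecutive (already shuffled) samples into sub-datasets;
# flat_map + batch(2) turns each window back into plain tensors of shape (2, ...)
pairs = ds.window(2, drop_remainder=True).flat_map(
    lambda v, l: tf.data.Dataset.zip((v.batch(2), l.batch(2))))

# target is 1 when the two labels match, 0 otherwise
pairs = pairs.map(
    lambda v, l: ((v[0, :], v[1, :]), tf.cast(l[0] == l[1], tf.uint8)))

for (e1, e2), same in pairs:
    print(e1.shape, e2.shape, int(same))
```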
@@ -31,7 +31,6 @@ def cosine_distance(vects):
     (as floating point value) between vectors.
     """
     # NOTE: cosine_distance = 1 - cosine_similarity
-    # Cosine distance is defined betwen [0,2] where 0 is vectors with the same direction and verse,
+    # Cosine distance is defined between [0,2] where 0 is vectors with the same direction and verse,
     # 1 is perpendicular vectors and 2 is opposite vectors
-    cosine_similarity = layers.Dot(axes=1, normalize=True)(vects)
-    return 1 - cosine_similarity
+    return 1 - layers.Dot(axes=1, normalize=True)(vects)
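The simplified `cosine_distance` still returns `1 - cosine_similarity`, bounded by [0, 2] as the comment states. A quick sanity check of that range; this is a standalone sketch that re-creates the same expression, not the project's module.

```python
import tensorflow as tf
from tensorflow.keras import layers

def cosine_distance(vects):
    # identical expression to the one in the diff above
    return 1 - layers.Dot(axes=1, normalize=True)(vects)

a = tf.constant([[1.0, 0.0]])
print(cosine_distance([a, tf.constant([[1.0, 0.0]])]).numpy())   # ~0: same direction
print(cosine_distance([a, tf.constant([[0.0, 1.0]])]).numpy())   # ~1: perpendicular
print(cosine_distance([a, tf.constant([[-1.0, 0.0]])]).numpy())  # ~2: opposite
```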
 import csv
 import time
-import _pickle as pickle
+import bz2
+import pickle
 import numpy as np
 import tensorflow as tf
 from tqdm import tqdm
@@ -15,12 +16,12 @@ from src.data import AsbDataset
 def _save_vectors_path(values, labels, path):
     data = [values, labels]
-    with open(path, 'wb') as outfile:
-        pickle.dump(data, outfile, -1)
+    with bz2.BZ2File(path, 'wb') as f:
+        pickle.dump(data, f, -1)
 
 def _load_vectors_path(path):
-    with open(path, 'rb') as infile:
+    with bz2.BZ2File(path, 'rb') as infile:
         result = pickle.load(infile)
     return result[0], result[1]
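This is the core of the commit: the embeddings are now pickled through a `bz2` stream instead of a plain file, so they are compressed transparently on write and decompressed on read. A minimal round-trip sketch of the same approach; the file name and array sizes below are illustrative.

```python
import bz2
import pickle
import numpy as np

values = np.random.rand(1000, 4096).astype('float32')       # fake embedding matrix
labels = np.random.randint(0, 10, size=1000).astype('uint8')  # fake class labels

# write: pickle into a bz2-compressed file object
with bz2.BZ2File('embeddings.pbz2', 'wb') as f:
    pickle.dump([values, labels], f, -1)  # -1 = highest pickle protocol

# read: the same pickle call works on the compressed stream
with bz2.BZ2File('embeddings.pbz2', 'rb') as f:
    restored_values, restored_labels = pickle.load(f)

assert np.array_equal(values, restored_values)
assert np.array_equal(labels, restored_labels)
```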
@@ -54,7 +55,7 @@ def calc_vectors(ds, model):
         ds_vectors.extend(predictions.numpy().tolist())
         ds_labels.extend(labels.numpy().tolist())
 
-    return np.array(ds_vectors), np.array(ds_labels)
+    return np.array(ds_vectors, dtype='float32'), np.array(ds_labels, dtype='uint8')
@@ -66,7 +67,7 @@ def calc_vectors_fn(ds, fn, *args):
         ds_vectors.append(vector)
         ds_labels.append(label)
 
-    return np.array(ds_vectors), np.array(ds_labels)
+    return np.array(ds_vectors, dtype='float32'), np.array(ds_labels, dtype='uint8')
 
 def evaluate_vectors(values, labels):
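Casting the returned arrays to `float32`/`uint8` also shrinks them before compression: compared with NumPy's default `float64`, the vector matrix alone is roughly halved. A small illustration with made-up sizes.

```python
import numpy as np

vectors = np.random.rand(10000, 4096)                     # float64 by default
print(vectors.nbytes // 2**20, "MiB")                     # ~312 MiB
print(vectors.astype('float32').nbytes // 2**20, "MiB")   # ~156 MiB
```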
@@ -132,7 +133,7 @@ def load_weights_of(model: tf.keras.Model, dataset: AsbDataset):
 def get_embeddings_of(model: tf.keras.Model, dataset: AsbDataset):
-    embedding_file = get_datadir(model.name + '_' + dataset.name + '.pkl')
+    embedding_file = get_datadir(model.name + '_' + dataset.name + '.pbz2')
 
     if Path(embedding_file).exists():
         return _load_vectors_path(embedding_file)
...