Commit dee42e94 authored by Oleh Astappiev

chore: apply code format

parent 2d3259f1
@@ -7,3 +7,4 @@ __pycache__
/datasets
/models
/logs
/logs_*
import sys
sys.path.append("..")
from src.data.embeddings import *
...
@@ -6,6 +6,7 @@ from tensorflow.keras import datasets
from tensorflow import data
import tensorflow as tf


def cifar10_complete():
    (train_images, train_labels), (test_images, test_labels) = datasets.cifar10.load_data()
    images = np.concatenate([train_images, test_images])
@@ -27,12 +28,13 @@ def shuffle_arrays(arrays, set_seed=-1):
    set_seed : Seed value if int >= 0, else seed is random.
    """
    assert all(len(arr) == len(arrays[0]) for arr in arrays)
    seed = np.random.randint(0, 2 ** (32 - 1) - 1) if set_seed < 0 else set_seed

    for arr in arrays:
        rstate = np.random.RandomState(seed)
        rstate.shuffle(arr)
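A minimal usage sketch of shuffle_arrays (toy arrays assumed, not part of the commit): re-seeding the RandomState with the same seed for every array applies the same permutation to each, so parallel arrays stay aligned after the in-place shuffle.

import numpy as np

images = np.arange(10).reshape(5, 2)  # row i is [2*i, 2*i + 1]
labels = np.arange(5)
shuffle_arrays([images, labels], set_seed=42)
assert (images[:, 0] == 2 * labels).all()  # rows and labels still line up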

def produce_tuples():
    (train_images, train_labels), (test_images, test_labels) = datasets.cifar10.load_data()
@@ -129,7 +131,8 @@ def load_tuples():

def prepare_dataset():
    (anchor_images, anchor_labels), (positive_images, positive_labels), (
        negative_images, negative_labels) = produce_tuples()

    anchor_ds = data.Dataset.from_tensor_slices(anchor_images)
    positive_ds = data.Dataset.from_tensor_slices(positive_images)
...
import sys
sys.path.append("..")
from src.data.embeddings import *
@@ -19,31 +20,33 @@ MODEL_URL = "https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet1k_s/feat
MODEL_INPUT_SIZE = [None, 384, 384, 3]
embedding_model = tf.keras.models.Sequential([
    hub.KerasLayer(MODEL_URL, trainable=False)  # EfficientNet V2 S backbone, frozen weights
])
embedding_model.build(MODEL_INPUT_SIZE)
embedding_model.summary()
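A quick smoke test (a sketch, not part of the commit): the frozen backbone should map a batch of 384x384 RGB images to 1280-dimensional feature vectors, matching the EMBEDDING_VECTOR_DIMENSION used further down.

dummy_batch = tf.zeros([2, 384, 384, 3])  # two blank images at the model's input size
print(embedding_model(dummy_batch).shape)  # expected: (2, 1280)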
# DATASET_NAME = 'cats_vs_dogs'
DATASET_NAME = 'cifar10'
# DATASET_NAME = 'cars196'

ds = tfds.load(DATASET_NAME, split='train')

# Resize images to the model's input size and normalize to [0.0, 1.0] as per the
# expected image input signature: https://www.tensorflow.org/hub/common_signatures/images#input
def resize_and_normalize(features):
    return {
        # 'id': features['id'],
        'label': features['label'],
        'image': tf.image.resize(tf.image.convert_image_dtype(features['image'], tf.float32), MODEL_INPUT_SIZE[1:3])
    }

ds = ds.map(resize_and_normalize, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)

# Add batch and prefetch to dataset to speed up processing
BATCH_SIZE = 256
batched_ds = ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
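The normalization in resize_and_normalize above relies on tf.image.convert_image_dtype scaling integer pixel values into [0.0, 1.0]; a toy check (made-up pixel values):

px = tf.constant([[[0, 128, 255]]], dtype=tf.uint8)  # one pixel, three channels
print(tf.image.convert_image_dtype(px, tf.float32))  # ~[0.0, 0.502, 1.0]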
# Dataset has keys "id" (that we ignore), "image" and "label".
@@ -54,24 +57,24 @@ batched_ds = ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
DST_FNAME = get_datadir('efficientnet_v2_imagenet1k_s.embeddings.pkl')
if Path(DST_FNAME).exists():
    # When you need to use the embeddings, upload the file (or store it on Drive and mount your drive folder in Colab), then run:
    df = pd.read_pickle(DST_FNAME)  # adapt the path as needed
    embeddings = np.array(df.embedding.values.tolist())
    labels = df.label.values
else:
    embeddings = []
    labels = []
    for features_batch in tqdm(batched_ds):
        embeddings.append(embedding_model(features_batch['image']).numpy())
        labels.append(features_batch['label'].numpy())
    embeddings = np.concatenate(embeddings)
    labels = np.concatenate(labels)
    # Store the precomputed values to disk
    df = pd.DataFrame({'embedding': embeddings.tolist(), 'label': labels})
    df.to_pickle(DST_FNAME)
    # Download the generated file to store the calculated embeddings.

NUM_CLASSES = np.unique(labels).shape[0]
@@ -81,28 +84,33 @@ embeddings_ds = tf.data.Dataset.zip((
    tf.data.Dataset.from_tensor_slices(labels)
)).cache().shuffle(1000).repeat()


@tf.function
def make_label_for_pair(embeddings, labels):
    # embedding_1, label_1 = tuple_1
    # embedding_2, label_2 = tuple_2
    return (embeddings[0, :], embeddings[1, :]), tf.cast(labels[0] == labels[1], tf.float32)


# because of shuffling, we can take two adjacent tuples as a randomly matched pair
train_ds = embeddings_ds.window(2, drop_remainder=True)
train_ds = train_ds.flat_map(lambda w1, w2: tf.data.Dataset.zip((w1.batch(2), w2.batch(
    2))))  # see https://stackoverflow.com/questions/55429307/how-to-use-windows-created-by-the-dataset-window-method-in-tensorflow-2-0
# generate the target label depending on whether the labels match or not
train_ds = train_ds.map(make_label_for_pair, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
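The window/flat_map idiom above is easiest to see on toy data (a sketch with made-up embeddings): window(2) groups consecutive (embedding, label) elements, flat_map re-batches each window into shape-(2, ...) tensors, and make_label_for_pair turns each pair into ((emb_a, emb_b), match) where match is 1.0 iff the two labels agree.

toy_ds = tf.data.Dataset.zip((
    tf.data.Dataset.from_tensor_slices(tf.random.uniform((6, 4))),  # 6 fake 4-d embeddings
    tf.data.Dataset.from_tensor_slices(tf.constant([0, 0, 1, 2, 1, 2]))
))
toy_pairs = toy_ds.window(2, drop_remainder=True)
toy_pairs = toy_pairs.flat_map(lambda w1, w2: tf.data.Dataset.zip((w1.batch(2), w2.batch(2))))
toy_pairs = toy_pairs.map(make_label_for_pair)
for (emb_a, emb_b), match in toy_pairs:
    print(emb_a.shape, emb_b.shape, match.numpy())  # (4,) (4,) -> 1.0, 0.0, 0.0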
# resample to the desired distribution
# train_ds = train_ds.rejection_resample(lambda embs, target: tf.cast(target, tf.int32), [0.5, 0.5], initial_dist=[0.9, 0.1])
# train_ds = train_ds.map(lambda _, vals: vals)  # discard the prepended "selected" class from the rejection resample, since we already have it available

## Model hyperparameters
EMBEDDING_VECTOR_DIMENSION = 1280
IMAGE_VECTOR_DIMENSIONS = 128
ACTIVATION_FN = 'tanh'  # same as in paper
MARGIN = 0.005

DST_MODEL_FNAME = get_modeldir(
    'seamese_cifar10_' + str(Path(Path(DST_FNAME).stem).stem) + '_' + str(IMAGE_VECTOR_DIMENSIONS) + '.tf')
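# note: the double .stem strips both suffixes of 'efficientnet_v2_imagenet1k_s.embeddings.pkl'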

## These functions are straight from the Keras tutorial linked above
@@ -173,6 +181,7 @@ def loss(margin=1):
    return contrastive_loss
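For reference, a minimal sketch of the contrastive loss that the loss(margin) factory returns (a paraphrase of the standard Hadsell-style formulation, assuming, as in make_label_for_pair above, that target 1 marks a matching pair and y_pred is the predicted distance between the two projections):

def contrastive_loss_sketch(y_true, y_pred, margin=1.0):
    # matching pairs (y_true == 1) are pulled together via squared distance;
    # non-matching pairs are pushed apart until they clear the margin
    square = tf.square(y_pred)
    margin_square = tf.square(tf.maximum(margin - y_pred, 0.0))
    return tf.reduce_mean(y_true * square + (1.0 - y_true) * margin_square)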

from tensorflow.keras import layers, Model

emb_input_1 = layers.Input(EMBEDDING_VECTOR_DIMENSION)
@@ -180,7 +189,7 @@ emb_input_2 = layers.Input(EMBEDDING_VECTOR_DIMENSION)
# projection model is the one to use for queries (put in a sequence after the embedding-generator model above)
projection_model = tf.keras.models.Sequential([
    layers.Dense(IMAGE_VECTOR_DIMENSIONS, activation=ACTIVATION_FN, input_shape=(EMBEDDING_VECTOR_DIMENSION,))
])

v1 = projection_model(emb_input_1)
@@ -201,18 +210,18 @@ siamese.compile(loss=loss(margin=MARGIN), optimizer="RMSprop")
siamese.summary()

callbacks = [
    tf.keras.callbacks.TensorBoard(log_dir='logs', profile_batch=5)
]

# TODO: Would be good to have a validation dataset too.
ds = train_ds.batch(TRAIN_BATCH_SIZE)  # .prefetch(tf.data.AUTOTUNE)
history = siamese.fit(
    ds,
    epochs=NUM_EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    callbacks=callbacks,
    class_weight={0: 1 / NUM_CLASSES, 1: (NUM_CLASSES - 1) / NUM_CLASSES}
)
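The class_weight setting compensates for pair imbalance: with NUM_CLASSES balanced classes, a random pair matches with probability 1/NUM_CLASSES, so these weights equalize the expected loss contribution of matched and unmatched pairs. A quick check of the arithmetic (CIFAR-10 assumed):

NUM_CLASSES = 10                   # CIFAR-10
p_match = 1 / NUM_CLASSES          # probability a random pair shares a label
w0, w1 = 1 / NUM_CLASSES, (NUM_CLASSES - 1) / NUM_CLASSES
assert (1 - p_match) * w0 == p_match * w1  # 0.09 == 0.09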
# Build full inference model (from image to image vector):
@@ -250,11 +259,12 @@ def write_embeddings_for_tensorboard(image_vectors: list, labels: list, root_dir
    embedding.tensor_path = 'values.tsv'
    projector.visualize_embeddings(root_dir, config)


inference_model = tf.keras.models.load_model(DST_MODEL_FNAME, compile=False)

# NUM_SAMPLES_TO_DISPLAY = 10000
NUM_SAMPLES_TO_DISPLAY = 3000
LOG_DIR = Path('../logs_efficientnet')
LOG_DIR.mkdir(exist_ok=True, parents=True)

val_ds = (tfds.load(DATASET_NAME, split='test')
@@ -268,11 +278,11 @@ val_ds = (tfds.load(DATASET_NAME, split='test')
image_vectors = []
labels = []
for feats_batch in tqdm(val_ds):
    ims = feats_batch['image']
    lbls = feats_batch['label'].numpy()
    embs = inference_model(ims).numpy()
    image_vectors.extend(embs.tolist())
    labels.extend(lbls.tolist())

write_embeddings_for_tensorboard(image_vectors, labels, LOG_DIR)
@@ -281,12 +291,12 @@ ds = embeddings_ds.take(NUM_SAMPLES_TO_DISPLAY).batch(BATCH_SIZE).prefetch(tf.da
_image_vectors = []
_labels = []
for feats_batch in tqdm(ds):
    ims, lbls = feats_batch
    ims = ims.numpy()
    lbls = lbls.numpy()
    embs = projection_model(ims).numpy()
    _image_vectors.extend(embs.tolist())
    _labels.extend(lbls.tolist())

write_embeddings_for_tensorboard(_image_vectors, _labels, LOG_DIR / 'train')
print('done')
import sys
sys.path.append("..")
import csv
@@ -10,6 +11,7 @@ from tensorflow.keras import models
cifar10_vds = cifar10_complete_resized()


def export_hsv(bin0=256, bin1=256, bin2=256):
    header = ['ID', 'Label', 'HSV vector']
    with open('../data/hsv_' + str(bin0) + '.csv', 'w', encoding='UTF8', newline='') as f:
@@ -61,6 +63,7 @@ def export_embeddings():
        value_str = ','.join(map(str, embeddings[i]))
        writer.writerow([i, label_str, value_str])


# HSV
# export_hsv(170, 171, 171)  # 512
# export_hsv(340, 342, 342)  # 1024
...
import sys
sys.path.append("..")
from src.utils.sift import *
from src.data.cifar10 import *
import tensorflow as tf

# Load dataset
cifar10_vds = cifar10_complete()


def print_resized(dataset):
    plt.figure(figsize=(20, 20))
    for i, (image, label) in enumerate(dataset.take(3)):
@@ -24,6 +25,7 @@ def print_resized(dataset):
        subplot_image(3, 3, i * 3 + 3, img_tf.numpy(), "TF image")
    plt.show()


print_resized(cifar10_vds)

# test HSV
...
@@ -4,10 +4,12 @@ from tensorflow.keras import layers, callbacks, datasets, Sequential
tensorboard_cb = callbacks.TensorBoard(get_logdir("alexnet/fit"))


class AlexNetModel(Sequential):
    def __init__(self):
        super(AlexNetModel, self).__init__([
            layers.Conv2D(filters=96, kernel_size=(11, 11), strides=(4, 4), activation='relu',
                          input_shape=target_shape + (3,)),
            layers.BatchNormalization(),
            layers.MaxPool2D(pool_size=(3, 3), strides=(2, 2)),
@@ -41,9 +43,11 @@ class AlexNetModel(Sequential):
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'],
                loss_weights=None, weighted_metrics=None, run_eagerly=None, steps_per_execution=None, **kwargs):
        super().compile(optimizer, loss, metrics, loss_weights, weighted_metrics, run_eagerly, steps_per_execution,
                        **kwargs)

    def fit(self, x=None, y=None, batch_size=None, epochs=50, verbose='auto', callbacks=[tensorboard_cb],
            validation_split=0.,
            validation_data=None, shuffle=True, class_weight=None, sample_weight=None, initial_epoch=0,
            steps_per_epoch=None, validation_steps=None, validation_batch_size=None, validation_freq=1,
            max_queue_size=10, workers=1, use_multiprocessing=False):
@@ -73,7 +77,11 @@ class AlexNetModel(Sequential):
    # plot_grid25(test_ds)
    # plot_grid25(validation_ds)

    train_ds = (train_ds.map(process_images_couple).shuffle(buffer_size=train_ds_size).batch(batch_size=32,
                                                                                             drop_remainder=True))
    test_ds = (test_ds.map(process_images_couple).shuffle(buffer_size=train_ds_size).batch(batch_size=32,
                                                                                           drop_remainder=True))
    validation_ds = (
        validation_ds.map(process_images_couple).shuffle(buffer_size=train_ds_size).batch(batch_size=32,
                                                                                          drop_remainder=True))

    return train_ds, test_ds, validation_ds
import sys
sys.path.append("..")
from utils.common import *
@@ -26,7 +27,7 @@ emb_input_2 = layers.Input(EMBEDDING_VECTOR_DIMENSION)
# projection model is the one to use for queries (put in a sequence after the embedding-generator model above)
projection_model = tf.keras.models.Sequential([
    layers.Dense(IMAGE_VECTOR_DIMENSIONS, activation='tanh', input_shape=(EMBEDDING_VECTOR_DIMENSION,))
])

v1 = projection_model(emb_input_1)
@@ -52,15 +53,18 @@ embeddings_ds = tf.data.Dataset.zip((
))
embeddings_ds = embeddings_ds.cache().shuffle(1000).repeat()


@tf.function
def make_label_for_pair(embeddings, labels):
    # embedding_1, label_1 = tuple_1
    # embedding_2, label_2 = tuple_2
    return (embeddings[0, :], embeddings[1, :]), tf.cast(labels[0] == labels[1], tf.float32)


# because of shuffling, we can take two adjacent tuples as a randomly matched pair
train_ds = embeddings_ds.window(2, drop_remainder=True)
train_ds = train_ds.flat_map(lambda w1, w2: tf.data.Dataset.zip((w1.batch(2), w2.batch(
    2))))  # see https://stackoverflow.com/questions/55429307/how-to-use-windows-created-by-the-dataset-window-method-in-tensorflow-2-0
# generate the target label depending on whether the labels match or not
train_ds = train_ds.map(make_label_for_pair, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
# resample to the desired distribution
@@ -80,4 +84,5 @@ embedding = alexnet(im_input)
image_vector = projection_model(embedding)

inference_model = Model(inputs=im_input, outputs=image_vector)
inference_model.save(get_modeldir('seamese_cifar10_' + str(IMAGE_VECTOR_DIMENSIONS) + '.tf'), save_format='tf',
                     include_optimizer=False)
@@ -10,21 +10,22 @@ from src.data.cifar10 import *
from src.data.embeddings import *
from tensorflow.keras import layers


def write_embeddings_for_tensorboard(image_vectors: list, labels: list, root_dir: Path):
    import csv
    from tensorboard.plugins import projector

    root_dir.mkdir(parents=True, exist_ok=True)
    with (root_dir / 'values.tsv').open('w') as fp:
        writer = csv.writer(fp, delimiter='\t')
        writer.writerows(image_vectors)
    with (root_dir / 'metadata.tsv').open('w') as fp:
        for lbl in labels:
            fp.write(f'{lbl}\n')

    image_vectors = np.asarray(image_vectors)
    embeddings = tf.Variable(image_vectors, name='embeddings')
    CHECKPOINT_FILE = str(root_dir / 'model.ckpt')
    ckpt = tf.train.Checkpoint(embeddings=embeddings)
    ckpt.save(CHECKPOINT_FILE)
@@ -35,19 +36,20 @@ def write_embeddings_for_tensorboard(image_vectors: list, labels: list , root_di
    embedding.tensor_path = 'values.tsv'
    projector.visualize_embeddings(root_dir, config)
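Once values.tsv, metadata.tsv, the checkpoint, and the projector config are written, the embeddings can be inspected by pointing TensorBoard at the directory passed as root_dir, e.g. tensorboard --logdir ../logs (path assumed from the LOG_DIR below).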

inference_model = tf.keras.models.load_model(get_modeldir('seamese_cifar10_512.tf'), compile=False)

NUM_SAMPLES_TO_DISPLAY = 10000
LOG_DIR = Path('../logs')