TensorFlow for Deep Learning Research
Notes on CS 20: TensorFlow for Deep Learning Research, taught by Chip Huyen at Stanford University.
Overview of TensorFlow
To put part of a graph on a specific CPU or GPU:
# Creates a graph.
with tf.device('/gpu:2'):
    a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name='a')
    b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name='b')
    c = tf.multiply(a, b)
# Creates a session with log_device_placement set to True.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
# Runs the op.
print(sess.run(c))
tf.Graph()
g = tf.Graph()
with g.as_default():
    x = tf.add(3, 5)
# pass the graph explicitly: a plain tf.Session() would use the default graph, which doesn't contain x
with tf.Session(graph=g) as sess:
    print(sess.run(x))
Multiple graphs
g1 = tf.get_default_graph()
g2 = tf.Graph()
# add ops to the default graph
with g1.as_default():
    a = tf.constant(3)
# add ops to the user-created graph
with g2.as_default():
    b = tf.constant(5)
Operations
Visualize with TensorBoard
import tensorflow as tf

a = tf.constant(2)
b = tf.constant(3)
x = tf.add(a, b)
writer = tf.summary.FileWriter('./graphs', tf.get_default_graph())
with tf.Session() as sess:
    # writer = tf.summary.FileWriter('./graphs', sess.graph)
    print(sess.run(x))
writer.close()  # close the writer when you're done using it
Run TensorBoard
$ python3 [yourprogram].py
$ tensorboard --logdir="./graphs" --port 6006
Variables
s = tf.get_variable("scalar", initializer=tf.constant(2))
m = tf.get_variable("matrix", initializer=tf.constant([[0, 1], [2, 3]]))
W = tf.get_variable("big_matrix", shape=(784, 10), initializer=tf.zeros_initializer())
Initialize variables
# The easiest way is initializing all variables at once:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

# Initialize only a subset of variables:
with tf.Session() as sess:
    sess.run(tf.variables_initializer([a, b]))

# Initialize a single variable:
W = tf.Variable(tf.zeros([784, 10]))
with tf.Session() as sess:
    sess.run(W.initializer)
Each session maintains its own copy of variables
W = tf.Variable(10)
sess1 = tf.Session()
sess2 = tf.Session()
sess1.run(W.initializer)
sess2.run(W.initializer)
print(sess1.run(W.assign_add(10))) # >> 20
print(sess2.run(W.assign_sub(2))) # >> 8
print(sess1.run(W.assign_add(100))) # >> 120
print(sess2.run(W.assign_sub(50))) # >> -42
sess1.close()
sess2.close()
Placeholders
# create a placeholder for a vector of 3 elements, type tf.float32
a = tf.placeholder(tf.float32, shape=[3])
b = tf.constant([5, 5, 5], tf.float32)
# use the placeholder as you would a constant or a variable
c = a + b # short for tf.add(a, b)
with tf.Session() as sess:
    print(sess.run(c, feed_dict={a: [1, 2, 3]}))
    # >> [6. 7. 8.]
Lazy loading (a VERY BAD example)
x = tf.Variable(10, name='x')
y = tf.Variable(20, name='y')
writer = tf.summary.FileWriter('./graphs/lazy_loading', tf.get_default_graph())
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(10):
        sess.run(tf.add(x, y))  # someone decides to be clever to save one line of code;
                                # this adds a new "Add" node to the graph on EVERY iteration
writer.close()
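For contrast, the normal-loading version calls tf.add once at graph-construction time, so the graph contains a single Add node no matter how many times the op runs:

x = tf.Variable(10, name='x')
y = tf.Variable(20, name='y')
z = tf.add(x, y)  # the op is created once, before the session starts
writer = tf.summary.FileWriter('./graphs/normal_loading', tf.get_default_graph())
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(10):
        sess.run(z)
writer.close()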
Linear and Logistic Regression
Implementing Huber loss
$L_{\delta}(y, f(x)) = \begin{cases} \frac{1}{2}(y-f(x))^2, & \text{for } |y-f(x)| \le \delta \\ \delta\,|y-f(x)| - \frac{1}{2}\delta^2, & \text{otherwise} \end{cases}$
def huber_loss(labels, predictions, delta=14.0):
    residual = tf.abs(labels - predictions)
    def f1(): return 0.5 * tf.square(residual)
    def f2(): return delta * residual - 0.5 * tf.square(delta)
    return tf.cond(residual < delta, f1, f2)
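Note that tf.cond expects a scalar predicate, so the version above only works when labels and predictions are scalars. For batched tensors, an element-wise variant with tf.where is a reasonable sketch:

def huber_loss_batched(labels, predictions, delta=14.0):
    residual = tf.abs(labels - predictions)
    small_res = 0.5 * tf.square(residual)
    large_res = delta * residual - 0.5 * tf.square(delta)
    return tf.where(residual < delta, small_res, large_res)  # element-wise select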
tf.data.Dataset
tf.data.Dataset.from_tensor_slices((features, labels))
dataset = tf.data.Dataset.from_tensor_slices((data[:,0], data[:,1]))
print(dataset.output_types) # >> (tf.float32, tf.float32)
print(dataset.output_shapes) # >> (TensorShape([]), TensorShape([]))
Create Dataset from files
tf.data.TextLineDataset(filenames)
tf.data.FixedLengthRecordDataset(filenames)
tf.data.TFRecordDataset(filenames)
tf.data.Iterator
iterator = dataset.make_one_shot_iterator()
X, Y = iterator.get_next()  # X is the birth rate, Y is the life expectancy
with tf.Session() as sess:
    print(sess.run([X, Y]))  # >> [1.822, 74.82825]
    print(sess.run([X, Y]))  # >> [3.869, 70.81949]
    print(sess.run([X, Y]))  # >> [3.911, 72.15066]
iterator = dataset.make_initializable_iterator()
...
for i in range(100):
    sess.run(iterator.initializer)  # re-initialize the iterator at the start of each epoch
    total_loss = 0
    try:
        while True:
            sess.run([optimizer])
    except tf.errors.OutOfRangeError:
        pass  # the dataset is exhausted for this epoch
Eager execution
Boilerplate
x = tf.placeholder(tf.float32, shape=[1, 1])
m = tf.matmul(x, x)
print(m)
# Tensor("MatMul:0", shape=(1, 1), dtype=float32)
with tf.Session() as sess:
    m_out = sess.run(m, feed_dict={x: [[2.]]})
    print(m_out)
    # [[4.]]
##########
# with eager execution enabled (call tf.enable_eager_execution() at program start):
x = [[2.]]  # No need for placeholders!
m = tf.matmul(x, x)
print(m)  # No sessions!
# tf.Tensor([[4.]], shape=(1, 1), dtype=float32)
Lazy Loading
x = tf.random_uniform([2, 2])
with tf.Session() as sess:
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            print(sess.run(x[i, j]))  # each sess.run re-evaluates x: four independent draws
##########
x = tf.random_uniform([2, 2])
for i in range(x.shape[0]):
    for j in range(x.shape[1]):
        print(x[i, j])  # eager: x was evaluated once, so the values are consistent
Tensors Act Like NumPy Arrays
import numpy as np

x = tf.constant([1.0, 2.0, 3.0])
# Tensors are backed by NumPy arrays
assert type(x.numpy()) == np.ndarray
squared = np.square(x)  # Tensors are compatible with NumPy functions
# Tensors are iterable!
for i in x:
    print(i)
Gradients
import tensorflow.contrib.eager as tfe  # needed for the tfe.* calls below

def square(x):
    return x ** 2

grad = tfe.gradients_function(square)
print(square(3.))  # tf.Tensor(9.0, shape=(), dtype=float32)
print(grad(3.))    # [tf.Tensor(6.0, shape=(), dtype=float32)]

x = tfe.Variable(2.0)
def loss(y):
    return (y - x ** 2) ** 2

grad = tfe.implicit_gradients(loss)
print(loss(7.))  # tf.Tensor(9.0, shape=(), dtype=float32)
print(grad(7.))  # [(<tf.Tensor: -24.0, shape=(), dtype=float32>, <tf.Variable 'Variable:0' shape=() dtype=float32, numpy=2.0>)]
Variable sharing and managing experiments
Word Embedding
- CBOW: use neighbors to predict center
- Skip-Gram: use center to predict neighbors
Structure TensorFlow model
class SkipGramModel:
    """ Build the graph for the word2vec model """
    def __init__(self, params):
        pass

    def _import_data(self):
        """ Step 1: import data """
        pass

    def _create_embedding(self):
        """ Step 2: define weights. In word2vec, it's actually the weights that we care about """
        pass

    def _create_loss(self):
        """ Step 3 + 4: define the inference + the loss function """
        pass

    def _create_optimizer(self):
        """ Step 5: define the optimizer """
        pass
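A minimal sketch of what the embedding and loss steps might contain for skip-gram with NCE loss; vocab_size, embed_size, num_sampled, and the center_words/target_words tensors are assumed to be set up elsewhere:

def _create_embedding(self):
    self.embed_matrix = tf.get_variable('embed_matrix',
                                        shape=[self.vocab_size, self.embed_size],
                                        initializer=tf.random_uniform_initializer())

def _create_loss(self):
    embed = tf.nn.embedding_lookup(self.embed_matrix, self.center_words)
    nce_weight = tf.get_variable('nce_weight',
                                 shape=[self.vocab_size, self.embed_size],
                                 initializer=tf.truncated_normal_initializer())
    nce_bias = tf.get_variable('nce_bias', initializer=tf.zeros([self.vocab_size]))
    self.loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                              biases=nce_bias,
                                              labels=self.target_words,
                                              inputs=embed,
                                              num_sampled=self.num_sampled,
                                              num_classes=self.vocab_size))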
Name scope (TensorFlow doesn’t know what nodes should be grouped together, unless you tell it to)
with tf.name_scope(name_of_that_scope):
    # declare op_1
    # declare op_2
    # ...
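For example, grouping the loss ops under one scope gives TensorBoard a single collapsible node (a small sketch; logits and labels are assumed to be defined):

with tf.name_scope('loss'):
    entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)
    loss = tf.reduce_mean(entropy, name='loss')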
Variable scope
tf.get_variable(<name>, <shape>, <initializer>)
- If a variable with <name> already exists in the current variable scope, reuse it
- If not, create it with <shape> using <initializer>
##########
def fully_connected(x, output_dim, scope):
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE) as scope:
        w = tf.get_variable("weights", [x.shape[1], output_dim], initializer=tf.random_normal_initializer())
        b = tf.get_variable("biases", [output_dim], initializer=tf.constant_initializer(0.0))
        return tf.matmul(x, w) + b

def two_hidden_layers(x):
    h1 = fully_connected(x, 50, 'h1')
    h2 = fully_connected(h1, 10, 'h2')
    return h2

with tf.variable_scope('two_layers') as scope:
    logits1 = two_hidden_layers(x1)
    logits2 = two_hidden_layers(x2)  # reuses the h1/h2 weights thanks to tf.AUTO_REUSE
tf.train.Saver
- Only saves variables, not the graph
- Checkpoints map variable names to tensors (so you can save just a chosen subset; sketch below)
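Since a checkpoint is just that name-to-tensor map, passing a dict or list to the constructor saves only those variables (v1/v2 here are illustrative):

v1 = tf.Variable(tf.zeros([10]), name='v1')
v2 = tf.Variable(tf.zeros([5]), name='v2')
saver = tf.train.Saver({'v1': v1, 'v2': v2})  # dict: name in checkpoint -> variable
saver = tf.train.Saver([v1, v2])              # list: uses the variables' own names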
tf.summary
tf.summary.scalar("loss", self.loss)
tf.summary.histogram("histogram loss", self.loss)
summary_op = tf.summary.merge_all()
import os

saver = tf.train.Saver()  # defaults to saving all variables
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
    writer = tf.summary.FileWriter('./graphs', sess.graph)
    for index in range(10000):
        ...
        loss_batch, _, summary = sess.run([loss, optimizer, summary_op])
        writer.add_summary(summary, global_step=index)
        if (index + 1) % 1000 == 0:
            saver.save(sess, 'checkpoints/skip-gram', index)
Control randomization
tf.reset_default_graph()
tf.set_random_seed(2)
c = tf.random_uniform([], -10, 10)
d = tf.random_uniform([], -10, 10)
with tf.Session() as sess:
    print(sess.run(c))  # >> -4.007516
    print(sess.run(d))  # >> -2.9833937
with tf.Session() as sess:
    print(sess.run(c))  # >> -4.007516 (graph-level seed: a fresh session replays the same sequence)
    print(sess.run(d))  # >> -2.9833937
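tf.set_random_seed sets the graph-level seed; you can instead pin the seed on an individual op, in which case each new session restarts that op's own stream:

c = tf.random_uniform([], -10, 10, seed=2)  # op-level seed
with tf.Session() as sess:
    print(sess.run(c))  # first draw from the op's stream
    print(sess.run(c))  # a different value: repeated runs advance the stream
with tf.Session() as sess:
    print(sess.run(c))  # same first value again: a new session restarts the stream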
Introduction to ConvNets && ConvNets in TensorFlow && Convolutional Neural Networks
Convolutional networks are tailor-made for computer vision tasks.
They exploit:
- Hierarchical nature of features
- Translation invariance of features
“Understanding” what a convnet learns is non-trivial, but some clever approaches exist.
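In TensorFlow, a conv layer boils down to tf.nn.conv2d plus a bias and a nonlinearity; a minimal sketch, assuming images is a [batch, height, width, 3] float tensor:

kernel = tf.get_variable('kernel', [5, 5, 3, 32],  # 5x5 filters, 3 input channels, 32 output channels
                         initializer=tf.truncated_normal_initializer())
biases = tf.get_variable('biases', [32], initializer=tf.constant_initializer(0.0))
conv = tf.nn.conv2d(images, kernel, strides=[1, 1, 1, 1], padding='SAME')
relu = tf.nn.relu(conv + biases)
pool = tf.nn.max_pool(relu, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')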
TFRecord:
- The recommended format for TensorFlow
- A binary file format (a serialized tf.train.Example protobuf object)
- Makes better use of the disk cache
- Faster to move around
- Can handle data of different types
Convert to TFRecord format
# Step 1: create a writer to write the tfrecord to a file
writer = tf.python_io.TFRecordWriter(out_file)
# Step 2: get the serialized shape and values of the image
shape, binary_image = get_image_binary(image_file)
# Step 3: create a tf.train.Features object
features = tf.train.Features(feature={'label': _int64_feature(label),
                                      'shape': _bytes_feature(shape),
                                      'image': _bytes_feature(binary_image)})
# Step 4: create a sample containing the features defined above
sample = tf.train.Example(features=features)
# Step 5: write the sample to the tfrecord file
writer.write(sample.SerializeToString())
writer.close()
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
Read TFRecord
def _parse_function(tfrecord_serialized):
    features = {'label': tf.FixedLenFeature([], tf.int64),
                'shape': tf.FixedLenFeature([], tf.string),
                'image': tf.FixedLenFeature([], tf.string)}
    parsed_features = tf.parse_single_example(tfrecord_serialized, features)
    return parsed_features['label'], parsed_features['shape'], parsed_features['image']

dataset = tf.data.TFRecordDataset(tfrecord_files)
dataset = dataset.map(_parse_function)
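The 'shape' and 'image' entries come back as raw byte strings; inside _parse_function they can be decoded before returning, mirroring how they were serialized above (a sketch, assuming int32 shape bytes and uint8 pixels):

shape = tf.decode_raw(parsed_features['shape'], tf.int32)
image = tf.decode_raw(parsed_features['image'], tf.uint8)
image = tf.reshape(image, shape)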
Variational Auto-Encoders
Recurrent Neural Networks
Construct Cells
cell = tf.nn.rnn_cell.GRUCell(hidden_size)
Stack Multiple Cells
layers = [tf.nn.rnn_cell.GRUCell(size) for size in hidden_sizes]
cells = tf.nn.rnn_cell.MultiRNNCell(layers)
output, out_state = tf.nn.dynamic_rnn(cells, seq, length, initial_state)
Dealing with variable sequence length
# Approach 1:
# Maintain a mask (True for real tokens, False for padded tokens)
full_loss = tf.nn.softmax_cross_entropy_with_logits(logits=preds, labels=labels)
loss = tf.reduce_mean(tf.boolean_mask(full_loss, mask))
# Approach 2:
# Let your model know the real sequence length so it only predicts labels for the real tokens
cells = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.GRUCell(hidden_size) for _ in range(num_layers)])
length = tf.reduce_sum(tf.reduce_max(tf.sign(seq), 2), 1)  # real length of each padded sequence
output, out_state = tf.nn.dynamic_rnn(cells, seq, length, initial_state)
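The mask in Approach 1 can be built from those real lengths; a sketch using tf.sequence_mask (length computed as above, cast to int):

mask = tf.sequence_mask(tf.cast(length, tf.int32), maxlen=tf.shape(seq)[1])  # True for real tokens, False for padding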
Clip gradients with tf.clip_by_global_norm
trainables = tf.trainable_variables()
gradients = tf.gradients(cost, trainables)
# take gradients of cost w.r.t. all trainable variables
clipped_gradients, _ = tf.clip_by_global_norm(gradients, max_grad_norm)
# clip the gradients by a pre-defined max norm
optimizer = tf.train.AdamOptimizer(learning_rate)
train_op = optimizer.apply_gradients(zip(clipped_gradients, trainables))
# apply the clipped (not the raw) gradients
Seq2seq with Attention
Beam search: on each step of the decoder, keep track of the k most probable partial translations
- k is the beam size (in practice around 5 to 10)
- Not guaranteed to find the optimal solution
- But much more efficient than exhaustive search (see the sketch below)
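A minimal pure-Python sketch of the loop, assuming a hypothetical step_log_probs(prefix) that returns (token, log-probability) pairs for the next step:

def beam_search(step_log_probs, k=5, max_len=20, eos='</s>'):
    beams = [([], 0.0)]  # (partial translation, cumulative log-prob)
    for _ in range(max_len):
        candidates = []
        for prefix, score in beams:
            if prefix and prefix[-1] == eos:
                candidates.append((prefix, score))  # finished hypothesis: carry over unchanged
                continue
            for token, lp in step_log_probs(prefix):  # hypothetical model call
                candidates.append((prefix + [token], score + lp))
        # keep only the k most probable partial translations
        beams = sorted(candidates, key=lambda c: c[1], reverse=True)[:k]
    return beams[0][0]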
BLEU (Bilingual Evaluation Understudy) compares the machine-written translation to one or several human-written translations and computes a similarity score based on:
- n-gram precision (usually up to 3 or 4-grams)
- Penalty for too-short system translations
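Concretely, the standard BLEU score combines the modified n-gram precisions $p_n$ with a brevity penalty:

$\text{BLEU} = \min\left(1,\, e^{1 - r/c}\right) \cdot \exp\left(\sum_{n=1}^{N} w_n \log p_n\right)$

where $c$ is the length of the system translation, $r$ the effective reference length, and typically $N = 4$ with uniform weights $w_n = 1/N$.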
BLEU is useful but imperfect:
- There are many valid ways to translate a sentence
- So a good translation can get a poor BLEU score because it has low n-gram overlap with the human translation
Attention provides a solution to the bottleneck problem of squeezing the entire source sentence into a single fixed-size encoder vector.
Core idea: on each step of the decoder, focus on a particular part of the source sequence (a sketch below).
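A minimal sketch of dot-product attention for one decoder step, assuming encoder_states has shape [batch, src_len, hidden] and decoder_state has shape [batch, hidden]:

# score each source position against the current decoder state
scores = tf.reduce_sum(encoder_states * tf.expand_dims(decoder_state, 1), axis=2)  # [batch, src_len]
attn_weights = tf.nn.softmax(scores)  # attention distribution over source positions
# context vector: weighted sum of encoder states
context = tf.reduce_sum(encoder_states * tf.expand_dims(attn_weights, 2), axis=1)  # [batch, hidden]
# context is then combined with decoder_state to predict the next token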