Chessboard Convolutional Neural Network classifier¶

Link to Github source code

In the previous notebook we did a 1-layer simple softmax regression classifier, which had ~99% accuracy since we were testing on a cordoned off portion of the entire dataset. This worked well for a majority of reddit posts, but whenever we had a screenshot of a board or piece set that was sufficiently different we'd end up mistaking pawns for bishops etc. We're aiming for some domain adaptation here, where our collected dataset consists of around 9000 tiles from several themes within lichess.org, chess.com and two fen diagram generator sites. But we'd like it to apply to chessboard screenshots of themes or sites we haven't trained for.

As a first step, we'll build a Convolutional Neural Network (CNN) and train it on the same dataset, taking advantage of the fact the spatial information within a tile can provide further insight.

# Init and helper functions
import tensorflow as tf
import numpy as np
import PIL
import urllib, cStringIO
import glob
from IPython.core.display import Markdown
from IPython.display import Image, display

import helper_functions as hf
import tensorflow_chessbot

np.set_printoptions(precision=2, suppress=True)

Let's load the tiles in for the training and test dataset, and then split them in a 90/10 ratio

# All tiles with pieces in random organizations
all_paths = np.array(glob.glob("tiles/train_tiles_C/*/*.png")) # TODO : (set labels correctly)

# Shuffle order of paths so when we split the train/test sets the order of files doesn't affect it
np.random.shuffle(all_paths)

ratio = 0.9 # training / testing ratio
divider = int(len(all_paths) * ratio)
train_paths = all_paths[:divider]
test_paths = all_paths[divider:]

# Training dataset
# Generated by programmatic screenshots of lichess.org/editor/<FEN-string>
print "Loading %d Training tiles" % train_paths.size
train_images, train_labels = hf.loadFENtiles(train_paths) # Load from generated set

# Test dataset, taken from screenshots of the starting position
print "Loading %d Training tiles" % test_paths.size
test_images, test_labels = hf.loadFENtiles(test_paths) # Load from generated set

train_dataset = hf.DataSet(train_images, train_labels, dtype=tf.float32)
test_dataset = hf.DataSet(test_images, test_labels, dtype=tf.float32)

Loading 8294 Training tiles
. . . . . . . . . Done
Loading 922 Training tiles
. Done

Looks good. Now that we've loaded the data, let's build up a deep CNN classifier based off of this beginner tutorial on tensorflow.

print "Setting up CNN..."
def weight_variable(shape, name=""):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial, name)

def bias_variable(shape, name=""):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial, name)

def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x, name=""):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
                        strides=[1, 2, 2, 1], padding='SAME', name=name)

x = tf.placeholder(tf.float32, [None, 32*32])

# First layer : 32 features
W_conv1 = weight_variable([5, 5, 1, 32], name='W1')
b_conv1 = bias_variable([32], name='B1')

x_image = tf.reshape(x, [-1,32,32,1])

h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1, name='Conv1')
h_pool1 = max_pool_2x2(h_conv1, name='Pool1')

# Second convolutional layer : 64 features
W_conv2 = weight_variable([5, 5, 32, 64], name='W2')
b_conv2 = bias_variable([64], name='B2')

h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2, name='Conv2')
h_pool2 = max_pool_2x2(h_conv2, name='Pool2')

# Densely connected layer : 1024 neurons, image size now 8x8
W_fc1 = weight_variable([8 * 8 * 64, 1024], name='W3')
b_fc1 = bias_variable([1024], name='B3')

h_pool2_flat = tf.reshape(h_pool2, [-1, 8*8*64], name='Pool3')
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1, 'MatMult3')

# Dropout
keep_prob = tf.placeholder("float", name='KeepProb')
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob, name='Drop4')

# Readout layer : softmax, 13 features
W_fc2 = weight_variable([1024, 13], name='W5')
b_fc2 = bias_variable([13], name='B5')

y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2, name='Ypredict')

# # Old single layer regression classifier
# W = tf.Variable(tf.zeros([32*32, 13]))
# b = tf.Variable(tf.zeros([13]))
# y = tf.nn.softmax(tf.matmul(x, W) + b)

# Ground truth labels if exist 
y_ = tf.placeholder(tf.float32, [None, 13], name='Ytruth')

cross_entropy = -tf.reduce_sum(y_*tf.log(y_conv), name='CrossEntropy')

# train_step = tf.train.GradientDescentOptimizer(0.001).minimize(cross_entropy)
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1), name='CorrectPrediction')
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name='Accuracy')

# Add ops to save and restore all the variables.
saver = tf.train.Saver()

# Start Interactive session for rest of notebook (else we'd want to close session)
sess = tf.InteractiveSession()

# Training vs loading existing model
do_training = False

# Number of steps
N = 10000

if do_training:
    #Initialize session
    sess.run(tf.initialize_all_variables())
    
    # Training
    print "Training for %d steps..." % N
    for i in range(N):
        # Get next batch for training
        batch_xs, batch_ys = train_dataset.next_batch(100)

        # Print out progress to screen
        if ((i+1) % 100) == 0:
            train_accuracy = accuracy.eval(feed_dict={
                x:batch_xs, y_: batch_ys, keep_prob: 1.0})
            print "\n\t%d/%d, training accuracy %g" % (i+1, N, train_accuracy),
        elif ((i+1) % 10) == 0:
            print '.',

        # Train model with batch
        sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys, keep_prob: 0.5})

    print "Finished training."

    # Save model checkpoint
    save_path = saver.save(sess, "saved_models/model_%d.ckpt" % N)
    print "Model saved in file: ", save_path

else:
    # Restore model from checkpoint
    model_name = "saved_models/model_%d.ckpt" % N
    print "Loading model '%s'" % model_name
    saver.restore(sess, model_name)
    print "Model restored."
    
# Testing
print "Accuracy: %g\n" % accuracy.eval(feed_dict={x: test_dataset.images,
                                                  y_: test_dataset.labels,
                                                  keep_prob: 1.0})

Setting up CNN...
Training for 10000 steps...
. . . . . . . . . 
	100/10000, training accuracy 0.97 . . . . . . . . . 
	200/10000, training accuracy 0.98 . . . . . . . . . 
	300/10000, training accuracy 0.99 . . . . . . . . . 
	400/10000, training accuracy 0.97 . . . . . . . . . 
	500/10000, training accuracy 1 . . . . . . . . . 
	600/10000, training accuracy 0.98 . . . . . . . . . 
	700/10000, training accuracy 0.99 . . . . . . . . . 
	800/10000, training accuracy 1 . . . . . . . . . 
	900/10000, training accuracy 0.99 . . . . . . . . . 
	1000/10000, training accuracy 0.99 . . . . . . . . . 
	1100/10000, training accuracy 1 . . . . . . . . . 
	1200/10000, training accuracy 0.99 . . . . . . . . . 
	1300/10000, training accuracy 1 . . . . . . . . . 
	1400/10000, training accuracy 1 . . . . . . . . . 
	1500/10000, training accuracy 1 . . . . . . . . . 
	1600/10000, training accuracy 1 . . . . . . . . . 
	1700/10000, training accuracy 1 . . . . . . . . . 
	1800/10000, training accuracy 1 . . . . . . . . . 
	1900/10000, training accuracy 1 . . . . . . . . . 
	2000/10000, training accuracy 1 . . . . . . . . . 
	2100/10000, training accuracy 1 . . . . . . . . . 
	2200/10000, training accuracy 1 . . . . . . . . . 
	2300/10000, training accuracy 1 . . . . . . . . . 
	2400/10000, training accuracy 1 . . . . . . . . . 
	2500/10000, training accuracy 1 . . . . . . . . . 
	2600/10000, training accuracy 1 . . . . . . . . . 
	2700/10000, training accuracy 1 . . . . . . . . . 
	2800/10000, training accuracy 1 . . . . . . . . . 
	2900/10000, training accuracy 1 . . . . . . . . . 
	3000/10000, training accuracy 1 . . . . . . . . . 
	3100/10000, training accuracy 1 . . . . . . . . . 
	3200/10000, training accuracy 1 . . . . . . . . . 
	3300/10000, training accuracy 1 . . . . . . . . . 
	3400/10000, training accuracy 1 . . . . . . . . . 
	3500/10000, training accuracy 1 . . . . . . . . . 
	3600/10000, training accuracy 1 . . . . . . . . . 
	3700/10000, training accuracy 1 . . . . . . . . . 
	3800/10000, training accuracy 1 . . . . . . . . . 
	3900/10000, training accuracy 1 . . . . . . . . . 
	4000/10000, training accuracy 1 . . . . . . . . . 
	4100/10000, training accuracy 1 . . . . . . . . . 
	4200/10000, training accuracy 1 . . . . . . . . . 
	4300/10000, training accuracy 1 . . . . . . . . . 
	4400/10000, training accuracy 1 . . . . . . . . . 
	4500/10000, training accuracy 1 . . . . . . . . . 
	4600/10000, training accuracy 1 . . . . . . . . . 
	4700/10000, training accuracy 1 . . . . . . . . . 
	4800/10000, training accuracy 1 . . . . . . . . . 
	4900/10000, training accuracy 1 . . . . . . . . . 
	5000/10000, training accuracy 1 . . . . . . . . . 
	5100/10000, training accuracy 1 . . . . . . . . . 
	5200/10000, training accuracy 1 . . . . . . . . . 
	5300/10000, training accuracy 1 . . . . . . . . . 
	5400/10000, training accuracy 1 . . . . . . . . . 
	5500/10000, training accuracy 1 . . . . . . . . . 
	5600/10000, training accuracy 1 . . . . . . . . . 
	5700/10000, training accuracy 1 . . . . . . . . . 
	5800/10000, training accuracy 1 . . . . . . . . . 
	5900/10000, training accuracy 1 . . . . . . . . . 
	6000/10000, training accuracy 1 . . . . . . . . . 
	6100/10000, training accuracy 1 . . . . . . . . . 
	6200/10000, training accuracy 1 . . . . . . . . . 
	6300/10000, training accuracy 1 . . . . . . . . . 
	6400/10000, training accuracy 1 . . . . . . . . . 
	6500/10000, training accuracy 1 . . . . . . . . . 
	6600/10000, training accuracy 1 . . . . . . . . . 
	6700/10000, training accuracy 1 . . . . . . . . . 
	6800/10000, training accuracy 1 . . . . . . . . . 
	6900/10000, training accuracy 1 . . . . . . . . . 
	7000/10000, training accuracy 1 . . . . . . . . . 
	7100/10000, training accuracy 1 . . . . . . . . . 
	7200/10000, training accuracy 1 . . . . . . . . . 
	7300/10000, training accuracy 1 . . . . . . . . . 
	7400/10000, training accuracy 1 . . . . . . . . . 
	7500/10000, training accuracy 1 . . . . . . . . . 
	7600/10000, training accuracy 1 . . . . . . . . . 
	7700/10000, training accuracy 1 . . . . . . . . . 
	7800/10000, training accuracy 1 . . . . . . . . . 
	7900/10000, training accuracy 1 . . . . . . . . . 
	8000/10000, training accuracy 1 . . . . . . . . . 
	8100/10000, training accuracy 1 . . . . . . . . . 
	8200/10000, training accuracy 1 . . . . . . . . . 
	8300/10000, training accuracy 1 . . . . . . . . . 
	8400/10000, training accuracy 1 . . . . . . . . . 
	8500/10000, training accuracy 1 . . . . . . . . . 
	8600/10000, training accuracy 1 . . . . . . . . . 
	8700/10000, training accuracy 1 . . . . . . . . . 
	8800/10000, training accuracy 1 . . . . . . . . . 
	8900/10000, training accuracy 1 . . . . . . . . . 
	9000/10000, training accuracy 1 . . . . . . . . . 
	9100/10000, training accuracy 1 . . . . . . . . . 
	9200/10000, training accuracy 1 . . . . . . . . . 
	9300/10000, training accuracy 1 . . . . . . . . . 
	9400/10000, training accuracy 1 . . . . . . . . . 
	9500/10000, training accuracy 1 . . . . . . . . . 
	9600/10000, training accuracy 1 . . . . . . . . . 
	9700/10000, training accuracy 1 . . . . . . . . . 
	9800/10000, training accuracy 1 . . . . . . . . . 
	9900/10000, training accuracy 1 . . . . . . . . . 
	10000/10000, training accuracy 1 Finished training.
Model saved in file:  saved_models/model_10000.ckpt
Accuracy: 1

Let's have a look at the failure cases to get a sense of any mistakes

mistakes = tf.where(~correct_prediction)
mistake_indices = sess.run(mistakes, feed_dict={x: test_dataset.images,
                                                y_: test_dataset.labels,
                                               keep_prob: 1.0}).flatten()

guess_prob, guessed = sess.run([y_conv, tf.argmax(y_conv,1)], feed_dict={x: test_dataset.images, keep_prob: 1.0})


if mistake_indices.size > 0:
    print "%d mistakes:" % mistake_indices.size
    for idx in np.random.choice(mistake_indices, 5, replace=False):
        a,b = test_dataset.labels[idx], guessed[idx]
        print "---"
        print "\t#%d | Actual: '%s', Guessed: '%s'" % (idx, hf.label2Name(a),hf.labelIndex2Name(b))
        print "Actual:",a
        print " Guess:",guess_prob[idx,:]
        hf.display_array(np.reshape(test_dataset.images[idx,:],[32,32]))
else:
    print "%d mistakes" % mistake_indices.size

0 mistakes

It looks like it's been learning that pieces have black borders, and since this pieceSet didn't, and it was a small part of the training set, it just fails and thinks we're looking at blank squares, more training data! From the label probabilities, it did a reasonable job of thinking the pieces were white, and their second best guesses tended to be close to the right answer, the blank spaces just won out.

Also, lets look at several random selections, including successes.

for idx in np.random.choice(test_dataset.num_examples,5,replace=False):
    a,b = test_dataset.labels[idx], guessed[idx]
    print "#%d | Actual: '%s', Guessed: '%s'" % (idx, hf.label2Name(a),hf.labelIndex2Name(b))
    hf.display_array(np.reshape(test_dataset.images[idx,:],[32,32]))

#619 | Actual: ' ', Guessed: ' '

#526 | Actual: 'b', Guessed: 'b'

#133 | Actual: 'P', Guessed: 'P'

#165 | Actual: 'N', Guessed: 'N'

#712 | Actual: 'R', Guessed: 'R'

Predict from image url¶

Let's wrap up predictions into a single function call from a URL, and test it on a few reddit posts.

def getPrediction(img):
    """Run trained neural network on tiles generated from image"""
    
    # Convert to grayscale numpy array
    img_arr = np.asarray(img.convert("L"), dtype=np.float32)
    
    # Use computer vision to get the tiles
    tiles = tensorflow_chessbot.getTiles(img_arr)
    if tiles is None or len(tiles) == 0:
        print "Couldn't parse chessboard"
        return None, 0.0
    
    # Reshape into Nx1024 rows of input data, format used by neural network
    validation_set = np.swapaxes(np.reshape(tiles, [32*32, 64]),0,1)

    # Run neural network on data
    guess_prob, guessed = sess.run([y_conv, tf.argmax(y_conv,1)], feed_dict={x: validation_set, keep_prob: 1.0})
    
    # Prediction bounds
    a = np.array(map(lambda x: x[0][x[1]], zip(guess_prob, guessed)))
    print "Certainty range [%g - %g], Avg: %g" % (a.min(), a.max(), a.mean())
    
    # Convert guess into FEN string
    # guessed is tiles A1-H8 rank-order, so to make a FEN we just need to flip the files from 1-8 to 8-1
    pieceNames = map(lambda k: '1' if k == 0 else hf.labelIndex2Name(k), guessed) # exchange ' ' for '1' for FEN
    fen = '/'.join([''.join(pieceNames[i*8:(i+1)*8]) for i in reversed(range(8))])
    return fen, a.prod()

def makePrediction(image_url):
    """Given image url to a chessboard image, display a visualization of FEN and link to a lichess analysis
       Return minimum certainty for prediction."""
    # Load image from url and display
    success = True
    try:
        img = PIL.Image.open(cStringIO.StringIO(urllib.urlopen(image_url).read()))
    except IOError, e:
        success = False
    if not success:
        try:
            img = PIL.Image.open(cStringIO.StringIO(urllib.urlopen(image_url+'.png').read()))
            success = True
        except IOError, e:
            success = False
    if not success:
        try:
            img = PIL.Image.open(cStringIO.StringIO(urllib.urlopen(image_url+'.jpg').read()))
            success = True
        except IOError, e:
            success = False
    if not success:
        try:
            img = PIL.Image.open(cStringIO.StringIO(urllib.urlopen(image_url+'.gif').read()))
            success = True
        except IOError, e:
            success = False

    if not success:
        print "Couldn't load image url: %s" % image_url
        return 0.0 # certainty
    
    print "Image on which to make prediction: %s" % image_url
    ratio = 250.0 / img.size[1]
    hf.display_image(img.resize([int(img.size[0] * ratio), 250], PIL.Image.ADAPTIVE))
    
    # Make prediction
    fen, certainty = getPrediction(img)
    if fen:
        display(Markdown("Prediction: [Lichess analysis](http://www.lichess.org/analysis/%s)" % hf.shortenFEN(fen)))
        display(Image(url='http://www.fen-to-image.com/image/30/%s' % fen))
        print "FEN: %s" % hf.shortenFEN(fen)
    return certainty

Make Predictions¶

All the boilerplate is done, the model is trained, it's time. I chose the first post I saw on reddit.com/chess with a chessboard (something our CV algorithm can do also): https://www.reddit.com/r/chess/comments/45inab/moderate_black_to_play_and_win/ with an image url of http://i.imgur.com/x6lLQQK.png

And awaayyy we gooo...

makePrediction('http://i.imgur.com/x6lLQQK.png')

Image on which to make prediction: http://i.imgur.com/x6lLQQK.png

Certainty range [0.999977 - 1], Avg: 0.999997

FEN: KQ3B2/P2bN1P1/2P3R1/b2P4/3p1P2/8/pp6/1kq1r3

0.99977851

Fantastic, a perfect match! It was able to handle the highlighting on the pawn movement from G2 to F3 also.

Now just for fun, let's try an image that is from a chessboard we've never seen before! Here's another on reddit: https://www.reddit.com/r/chess/comments/45c8ty/is_this_position_starting_move_36_a_win_for_white/

makePrediction('http://i.imgur.com/r2r43xA.png')

Image on which to make prediction: http://i.imgur.com/r2r43xA.png

Certainty range [0.645451 - 1], Avg: 0.980475

FEN: 8/4B3/bBK2Nr1/8/2b1B1B1/p2k4/1p3p2/8

0.22809464

Hah, it thought some of the pawns were bishops. But it predicted all the other pieces and empty squares correctly despite being a chessboard screenshot from a site we haven't collected data on! This is pretty great, let's look at a few more screenshots taken lichess. Here's https://www.reddit.com/r/chess/comments/44q2n6/tactic_from_a_game_i_just_played_white_to_move/

makePrediction('http://i.imgur.com/gSFbM1d.png')

Image on which to make prediction: http://i.imgur.com/gSFbM1d.png

Certainty range [0.999988 - 1], Avg: 0.999998

FEN: 6k1/5rp1/7p/1pp1Pr2/p2pKPR1/1P4R1/P4P2/8

0.99985528

Yep, it looks like it does well when the validation data is similar to what we trained for, who would have thought. When the validation images are based off of what the model trains, it'll do great, but if we use images from chess boards we haven't trained on, we'll see lots of mistakes. Mistakes are fun, lets see some.

makePrediction('http://imgur.com/oXpMSQI.png')

Image on which to make prediction: http://imgur.com/oXpMSQI.png

Certainty range [0.998874 - 1], Avg: 0.99992

FEN: 2kr3r/p1p2ppp/2pb4/3N1q2/4n1b1/4BN2/PPP1QPPP/2KR3R

0.99487221

makePrediction('http://imgur.com/qk5xa6q.png')

Image on which to make prediction: http://imgur.com/qk5xa6q.png

Certainty range [0.527064 - 1], Avg: 0.990506

FEN: 1KR2B1R/PP3PP1/2P3P1/2pn2pn/3Qp3/r3bB1p/pp2b3/1k2q1r1

0.45862412

makePrediction('http://imgur.com/u4zF5Hj.png')

Image on which to make prediction: http://imgur.com/u4zF5Hj.png

Certainty range [0.999545 - 1], Avg: 0.999977

FEN: 8/5p2/5k1P/2p4P/1p1p4/8/3K4/8

0.99854553

makePrediction('http://imgur.com/CW675pw.png')

Image on which to make prediction: http://imgur.com/CW675pw.png

Certainty range [0.929552 - 1], Avg: 0.998573

FEN: 3r1rk1/ppp1q1pp/1nn1pb2/5b2/2PP4/2N1BN2/PP1QB1PP/3R1RK1

0.91016573

makePrediction('https://i.ytimg.com/vi/pG1Uhw3pO8o/hqdefault.jpg')

Image on which to make prediction: https://i.ytimg.com/vi/pG1Uhw3pO8o/hqdefault.jpg

Certainty range [0.983912 - 1], Avg: 0.999481

FEN: r1bqnr2/pp1ppkbp/4N1p1/n3P3/8/2N1B3/PPP2PPP/R2QK2R

0.96714765

makePrediction('http://www.caissa.com/chess-openings/img/siciliandefense1.gif')

Image on which to make prediction: http://www.caissa.com/chess-openings/img/siciliandefense1.gif

Certainty range [0.554627 - 1], Avg: 0.970189

FEN: rnbqkbnr/pp1ppppp/8/2p5/4P3/8/PBPB1BPB/RNBQKBNR

0.082962133

makePrediction('http://www.jinchess.com/chessboard/?p=rnbqkbnrpPpppppp----------P----------------R----PP-PPPPPRNBQKBNR')

Image on which to make prediction: http://www.jinchess.com/chessboard/?p=rnbqkbnrpPpppppp----------P----------------R----PP-PPPPPRNBQKBNR

Certainty range [0.999996 - 1], Avg: 0.999998

FEN: rnbqkbnr/pPpppppp/8/2P5/8/3R4/PP1PPPPP/RNBQKBNR

0.99989653

Interesting, it doesn't look a CNN solved all of our problems, it comes back to getting better datasets. We need to find a way to programmatically collect more of the piece sets of chess.com, lichess.org and other sites to help round it out. The model is beginning to understand the concept of pieces, and did a valiant effort with boards outside of it's domain, with more data it should get to the point where it will be more useful than not on the chess subreddit.

Validating with last 100 reddit posts¶

Okay, I started a basic reddit bot that pulled the 100 most recent posts on the r/chess subreddit, and only chose those that potentially had a chessboard image and the words white or black in the title, signifying white or black to play. Let's test our predictions on the urls.

reddit_urls = [u'http://imgur.com/GRcKdds',
 u'http://imgur.com/I7cgJO0',
 u'http://imgur.com/albpHvw',
 u'http://imgur.com/337yNGL',
 u'http://i.imgur.com/WcKpzN2.jpg',
 u'http://i.imgur.com/PmALkwI.png',
 u'http://imgur.com/YPmOUCU',
 u'http://i.imgur.com/Xb01wTO.png',
 u'http://imgur.com/CzdxVkB',
 u'http://imgur.com/14PMpto',
 u'http://imgur.com/i5qKESq',
 u'http://imgur.com/95XC1J5',
 u'http://i.imgur.com/XBkHk26.png',
 u'http://imgur.com/4qL270K',
 u'http://i.imgur.com/FPnkfJO.png',
 u'http://imgur.com/ut6RKyl',
 u'http://imgur.com/qtXuMkR',
 u'http://i.imgur.com/yRBJHc7.png',
 u'http://imgur.com/b9zxOOd',
 u'http://imgur.com/SeJasRQ',
 u'http://i.imgur.com/FTjNkP5.png',
 u'https://i.imgur.com/M13bNGb.png',
 u'http://imgur.com/x0XzwJh',
 u'http://imgur.com/u7D5Fkc',
 u'http://imgur.com/BUqCNsI',
 u'http://i.imgur.com/ZGRgL16.jpg',
 u'http://imgur.com/63rBqFR',
 u'http://imgur.com/evDUNw8',
 u'http://imgur.com/Mz4ynW6',
 u'http://imgur.com/J0VzskZ',
 u'http://i.imgur.com/KMSYQKk.png',
 u'http://imgur.com/4oWNIa0',
 u'http://i.imgur.com/BuAs7zT.png',
 u'http://i.imgur.com/OsFNmIA.png',
 u'http://imgur.com/iTEr7aT',
 u'http://i.imgur.com/DxJLdC9.png',
 u'http://imgur.com/YI0xoaV',
 u'http://i.imgur.com/9WxZgtf.png',
 u'http://imgur.com/lJLsGU0',
 u'http://i.imgur.com/Shr4bwr.jpg',
 u'http://imgur.com/L25DgOj',
 u'http://imgur.com/fMIzftn',
 u'http://imgur.com/g7XiYrH',
 u'http://i.imgur.com/MLPHSKo.jpg',
 u'http://imgur.com/b5EMIDK',
 u'http://imgur.com/Ym0w7dw',
 u'http://m.imgur.com/a/A6nWF',
 u'http://imgur.com/lFgeyxi',
 u'http://imgur.com/h4cn4KE',
 u'http://imgur.com/b5XQ1uJ',
 u'http://imgur.com/gInXR9K',
 u'https://imgur.com/A3KmcDG',
 u'http://imgur.com/mTCtcel',
 u'http://imgur.com/o96Rtfn',
 u'http://imgur.com/yIKiRN7',
 u'http://imgur.com/g7IYvwI',
 u'http://i.imgur.com/EMHtHay.png',
 u'http://i.imgur.com/aL64q8w.png',
 u'http://imgur.com/FtcZA47',
 u'http://i.imgur.com/wrXjbe8.png',
 u'http://imgur.com/u4zF5Hj',
 u'http://i.imgur.com/gSFbM1d.png',
 u'http://i.imgur.com/TeHm97Z.jpg',
 u'http://imgur.com/dZDSzAa',
 u'http://i.imgur.com/taNJN7h.png',
 u'http://imgur.com/qk5xa6q',
 u'http://imgur.com/oXpMSQI',
 u'http://imgur.com/r2r43xA',
 u'http://i.imgur.com/x6lLQQK.png',
 u'http://imgur.com/bkn5nn4',
 u'http://i.imgur.com/HnWYt8A.png']

probs = np.zeros(len(reddit_urls))
for i, validate_url in enumerate(reddit_urls):
    print "---"
    print "#%d URL: %s" % (i, validate_url)
    probs[i] = makePrediction(validate_url)
    print

---
#0 URL: http://imgur.com/GRcKdds
Image on which to make prediction: http://imgur.com/GRcKdds

Certainty range [0.996066 - 1], Avg: 0.999937

FEN: 1r3b1r/p4k2/1p2pP1p/q4N2/2pP2Q1/4R1P1/P1P2PKP/R7

---
#1 URL: http://imgur.com/I7cgJO0
Image on which to make prediction: http://imgur.com/I7cgJO0

Certainty range [0.999996 - 1], Avg: 0.999998

FEN: r1bqr1k1/pp1nbpp1/2p4p/3p3n/3P1B2/2NBP3/PPQ1NPPP/R4RK1

---
#2 URL: http://imgur.com/albpHvw
Image on which to make prediction: http://imgur.com/albpHvw

Certainty range [0.999996 - 1], Avg: 0.999998

FEN: r1b2r2/pp3pk1/1qn1p1p1/2p4p/4BN1P/2PP2P1/PP1Q1P2/R3K2R

---
#3 URL: http://imgur.com/337yNGL
Image on which to make prediction: http://imgur.com/337yNGL

Certainty range [0.999968 - 1], Avg: 0.999994

FEN: r6b/3qp1Rp/p2p1k1B/n1pP1r2/4R1QP/5PP1/5P1K/q7

---
#4 URL: http://i.imgur.com/WcKpzN2.jpg
Image on which to make prediction: http://i.imgur.com/WcKpzN2.jpg

Certainty range [0.99998 - 1], Avg: 0.999996

%matplotlib inline
import matplotlib.pyplot as plt
for i in [0.999, 0.99, 0.98, 0.95, 0.9, 0.8, 0.7, 0.6, 0.5]:
    print "%d/%d with certainties under %g%%" % (np.sum(probs < i), probs.size, (i*100))

plt.bar(np.arange(len(probs)), probs*100)
plt.xlabel('Screenshot #')
plt.ylabel('Certainty Percent (%)')
plt.title('Certainty of each screenshot prediction')
plt.xlim(0,len(probs));

21/71 with certainties under 99.9%
16/71 with certainties under 99%
15/71 with certainties under 98%
12/71 with certainties under 95%
12/71 with certainties under 90%
12/71 with certainties under 80%
12/71 with certainties under 70%
12/71 with certainties under 60%
12/71 with certainties under 50%

A handful of failures, but we have greater than 98% success for 78% of screenshots, including several out-of-left-field screenshots from mobile chess apps. Certainty is defined as the product of all 64 tile probabilities together, which is a bit stricter than minimum certainty, but shows overall certainty for the board better.

Say two pieces had 90% probability of correctness (the rest are 100%), then the overall certainty for the board should be lower than 90%, 81% in that case.

Looking at the failure cases, it looks like the images from lichess or chess and other more common screenshots were good, the others had a couple to several wrong pieces within. On the whole it actually got most of them correct, and when it didn't the certainty dropped extremely quickly. The certainty range does a good job of being uncertain in the cases where it failed, and 98%+ certain for the success cases.

Time to make a reddit bot.