%%html
<script src="https://bits.csb.pitt.edu/preamble.js"></script>

%%html
<div id="nand" style="width: 500px"></div>
<script>
    // In-class poll: attach the asker widget to the #nand placeholder div above.
    // `chartmaker` is not defined in this cell; presumably it is provided by the
    // preamble.js script loaded in the first cell - TODO confirm.

    var divid = '#nand';
	jQuery(divid).asker({
	    id: divid,
	    question: "What are the corresponding outputs for x = [0,0],[0,1],[1,0], and [1,1]?",
		answers: ["0,0,0,0","0,1,1,0","0,0,0,1","0,1,1,1","1,1,1,0"],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
		charter: chartmaker})
    
// Hide every .jp-InputArea that has a descendant of class "o" whose text contains
// "html" (presumably the source of the %%html cells, so only the widget shows).
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();


</script>

# Step activation (perceptron): thin line through the thresholded values, with
# the two constant levels drawn as thick blue segments.
plt.plot(x, x > 0, clip_on=False, linewidth=1);
plt.hlines(y=0, xmin=-10, xmax=0, color='b', linewidth=3)
plt.hlines(y=1, xmin=0, xmax=10, color='b', linewidth=3);

# Sigmoid (logistic) activation.
plt.plot(x, 1 / (1 + np.exp(-x)), clip_on=False, linewidth=4);

# Sigmoid with the input scaled by 2 and by 0.5.
plt.plot(x, 1 / (1 + np.exp(-2 * x)), clip_on=False, linewidth=2);
plt.plot(x, 1 / (1 + np.exp(-0.5 * x)), clip_on=False, linewidth=2);

# tanh activation, with a dashed black reference line at y = 0.
plt.plot([-10, 10], [0, 0], 'k--')
plt.plot(x, np.tanh(x), clip_on=False, linewidth=4);

# ReLU activation: x where x > 0, otherwise 0.
plt.plot(x, x * (x > 0), linewidth=4, clip_on=False);

%%html
<div id="bpopt" style="width: 500px"></div>
<script>
// Add the asker default theme stylesheet to the page head.
$('head').append('<link rel="stylesheet" href="https://bits.csb.pitt.edu/asker.js/themes/asker.default.css" />');

    // In-class poll: attach the asker widget to the #bpopt placeholder div above.
    // `chartmaker` is not defined in this cell; presumably it comes from preamble.js - TODO confirm.
    var divid = '#bpopt';
	jQuery(divid).asker({
	    id: divid,
	    question: "Will SGD (eventually) converge to a global optimum?",
		answers: ["Yes","No","Depends"],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
		charter: chartmaker})
    
// Hide every .jp-InputArea that has a descendant of class "o" whose text contains
// "html" (presumably the source of the %%html cells, so only the widget shows).
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();


</script>

def softmax(z, axis=None):
    """Return the softmax of ``z``: exp(z) normalised so the result sums to 1.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps ``exp`` from overflowing to inf (and
    the ratio from becoming nan) when the inputs are large.

    Args:
        z: array-like of logits (lists are accepted).
        axis: axis along which to normalise. The default ``None`` normalises
            over all elements, matching the original single-argument behaviour.

    Returns:
        A float ndarray with the same shape as ``z``.
    """
    z = np.asarray(z)
    if z.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return z.astype(float)
    shifted = z - np.max(z, axis=axis, keepdims=True)
    weights = np.exp(shifted)  # computed once and reused for the normaliser
    return weights / np.sum(weights, axis=axis, keepdims=True)

# Two equal logits and one larger one: the equal logits share the same probability.
softmax([0,0,1])

array([0.21194156, 0.21194156, 0.57611688])

# Widely separated logits: almost all of the probability goes to the largest one.
softmax([10,0,-2])

array([9.99948459e-01, 4.53975898e-05, 6.14389567e-06])

%%html
<div id="moment" style="width: 500px"></div>
<script>
    // In-class poll: attach the asker widget to the #moment placeholder div above.
    // `chartmaker` is not defined in this cell; presumably it comes from preamble.js - TODO confirm.

    var divid = '#moment';
	jQuery(divid).asker({
	    id: divid,
	    question: "For what value of mu does the momentum update rule reduce to SGD?",
		answers: ["0","1","0.5","None of the above"],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
		charter: chartmaker})
    
// Hide every .jp-InputArea that has a descendant of class "o" whose text contains
// "html" (presumably the source of the %%html cells, so only the widget shows).
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();


</script>

from numpy import exp,arange
from pylab import meshgrid,cm,imshow,contour,clabel,colorbar,axis,title,show
import seaborn as sns
 
def plotf(f):
    """Draw ``f`` over the square [-2, 2] x [-2, 2] as a heat map with labelled contours.

    ``f`` is called once with two 2-D coordinate grids built by ``meshgrid``;
    whatever it returns is handed to ``imshow`` and ``contour``.
    """
    plt.figure(figsize=(5, 5))

    # Both axes are sampled on the same 0.1-spaced ticks starting at -2.
    ticks = arange(-2.0, 2.0, 0.1)
    grid_x, grid_y = meshgrid(ticks, ticks)
    values = f(grid_x, grid_y)

    bounds = [-2, 2, -2, 2]
    heatmap = imshow(values, origin='lower', extent=bounds)

    # Contour lines at 0, 0.5, 1 and 1.5, labelled with one decimal place.
    levels = arange(0, 2, 0.5)
    contours = contour(values, levels, linewidths=2, cmap=cm.Set2, extent=bounds)
    clabel(contours, inline=True, fmt='%1.1f', fontsize=10)

    colorbar(heatmap)

    
def plotpt(pt, grad):
    """Mark ``pt`` with a white dot and draw a white arrow from ``pt`` to ``pt + grad``.

    Draws on the current matplotlib axes, in data coordinates.
    """
    axes = plt.gca()
    plt.plot([pt[0]], [pt[1]], 'o', color='white')

    arrow_tip = np.array(pt) + np.array(grad)
    arrow_style = {'arrowstyle': '->', 'color': 'white'}
    axes.annotate(
        "",
        xy=arrow_tip,
        xytext=pt,
        xycoords='data',
        textcoords='data',
        annotation_clip=False,  # keep arrows that leave the axes area
        arrowprops=arrow_style,
    )

def f(x, y):
    """Example objective: x squared plus the absolute value of 0.1*y."""
    quadratic_part = x**2
    absolute_part = np.abs(0.1*y)
    return quadratic_part + absolute_part

def g(x, y):
    """Gradient of ``f`` at (x, y), returned as the array [df/dx, df/dy]."""
    d_dx = 2*x
    d_dy = np.sign(y)*0.1
    return np.array([d_dx, d_dy])

# Draw the surface, then one arrow at (-1, 1) pointing along the negative gradient.
plotf(f)
plotpt((-1,1),-g(-1,1));

def plot_grad_descent(eta, n_steps=10):
    """Plot the path of plain gradient descent on ``f`` starting from (-1, 1).

    Args:
        eta: step size (learning rate).
        n_steps: number of update steps to draw (default 10, as before).
    """
    pt = np.array([-1, 1])
    plotf(f)
    for _ in range(n_steps):
        step = -eta * g(*pt)  # evaluate the gradient once per iteration
        plotpt(pt, step)
        pt = pt + step
    # Raw string: a backslash followed by "e" is not a valid escape sequence and
    # triggers a SyntaxWarning on recent Python versions; the text is unchanged.
    plt.title(r"Gradient Descent ($\eta$=%.2f)" % eta)
plot_grad_descent(0.6)

def plot_cm(eta, mu, n_steps=10):
    """Plot the path of classical momentum on ``f`` starting from (-1, 1).

    Args:
        eta: step size (learning rate).
        mu: momentum coefficient applied to the previous velocity.
        n_steps: number of update steps to draw (default 10, as before).
    """
    v = np.zeros(2)
    pt = np.array([-1, 1])
    plotf(f)
    for _ in range(n_steps):
        # New velocity: decayed previous velocity minus a gradient step at pt.
        v = mu*v - eta*g(*pt)
        plotpt(pt, v)
        pt = pt + v
    # Raw string: backslash-e and backslash-m are not valid escape sequences and
    # trigger a SyntaxWarning on recent Python versions; the text is unchanged.
    plt.title(r"Classical Momentum ($\eta$=%.2f, $\mu$=%.2f)" % (eta, mu))
plot_cm(0.6,0.9)

def plot_nesterov(eta, mu, n_steps=10):
    """Plot the path of Nesterov momentum on ``f`` starting from (-1, 1).

    Args:
        eta: step size (learning rate).
        mu: momentum coefficient applied to the previous velocity.
        n_steps: number of update steps to draw (default 10, as before).
    """
    v = np.zeros(2)
    pt = np.array([-1, 1])
    plotf(f)
    for _ in range(n_steps):
        # The gradient is evaluated at the look-ahead point pt + mu*v, not at pt.
        v = mu*v - eta*g(*(pt + mu*v))
        plotpt(pt, v)
        pt = pt + v
    # Raw string: backslash-e and backslash-m are not valid escape sequences and
    # trigger a SyntaxWarning on recent Python versions; the text is unchanged.
    plt.title(r"Nesterov Momentum ($\eta$=%.2f, $\mu$=%.2f)" % (eta, mu))
plot_nesterov(0.6,.9)

# Nesterov momentum again with a slightly larger step size (eta=0.7).
plot_nesterov(0.7,0.9)

# The same three optimizers with a much smaller step size (eta=0.1).
plot_grad_descent(0.1)

plot_cm(0.1,0.9)

plot_nesterov(0.1,0.9)

%%html
<div id="bpcnt" style="width: 500px"></div>
<script>
    // In-class poll: attach the asker widget to the #bpcnt placeholder div above.
    // `chartmaker` is not defined in this cell; presumably it comes from preamble.js - TODO confirm.

    var divid = '#bpcnt';
	jQuery(divid).asker({
	    id: divid,
	    question: "A network has 10 input nodes, two hidden layers each with 10 neurons, and 10 output neurons.  How many parameters does training have to estimate?",
		answers: ["30","100","300","330","600"],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
		charter: chartmaker})
    
// Hide every .jp-InputArea that has a descendant of class "o" whose text contains
// "html" (presumably the source of the %%html cells, so only the widget shows).
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();


</script>

// Element-wise vector add kernel: y[i] = x[i] + y[i] for every i < n.
// Each thread handles at most one element.
__global__
void add(int n, float *x, float *y)
{
  // Global index of this thread, built from the x components of the block
  // index, block size and thread index.
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  // Threads whose index is n or larger do nothing.
  if ( i < n )
    y[i] = x[i] + y[i];
}

// Launch configuration: 256 threads per block, with the block count rounded up
// so that numBlocks * blockSize >= N. N, x and y are defined outside this snippet.
int blockSize = 256;
int numBlocks = (N + blockSize - 1) / blockSize;
add<<<numBlocks, blockSize>>>(N, x, y);

from numba import cuda
import numpy as np

# Numba CUDA version of the element-wise add kernel: y[i] += x[i] for every i < n.
@cuda.jit
def add(n, x, y):
    # Global thread index, built from the x components of the block index,
    # block size and thread index.
    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    # Threads whose index is n or larger do nothing.
    if i < n:
        y[i] += x[i]

# Problem size and two random input vectors created on the host.
n = 1_000_000
x = np.random.random(n)
y = np.random.random(n)

# 256 threads per block; round the block count up so every element is covered.
blockSize = 256
numBlocks = (n + blockSize - 1) // blockSize
add[numBlocks, blockSize](n, x, y)

/home/dkoes/.local/lib/python3.10/site-packages/numba/cuda/cudadrv/devicearray.py:886: NumbaPerformanceWarning: Host array used in CUDA kernel will incur copy overhead to/from device.
  warn(NumbaPerformanceWarning(msg))

# Copy both host arrays to the GPU so the timed launches can use the device copies.
xc = cuda.to_device(x)
yc = cuda.to_device(y)

%%timeit
add[numBlocks, blockSize](n,xc,yc)

47.5 µs ± 7.07 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

%%timeit x=np.random.random(n)
# CPU baseline: in-place NumPy add. The statement on the %%timeit line is the
# setup code, so creating the fresh x is not part of the timed statement.
x += y

826 µs ± 55.3 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

import numpy as np
import sklearn
from sklearn.model_selection import KFold

# Read the comma-separated table, skipping its single header row.
data = np.genfromtxt('bcsmall.csv', delimiter=',', skip_header=1)

# Column 1 holds the label; every column from index 2 onward is a feature.
Y = data[:, 1]
X = data[:, 2:]

# Peek at the feature matrix and the label vector.
# NOTE(review): this echoes the whole label vector; X.shape, Y.shape and Y[:10]
# would be a more compact check.
X,Y

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1.,
        1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 0., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
        1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1.,
        0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1.,
        1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1.]))

# List the distinct values that occur anywhere in the feature matrix.
np.unique(X)

array([0., 1.])

# Use the first fold of a shuffled 3-fold split as a train/test partition.
# The shuffle is seeded so Restart & Run All reproduces the same split, and
# next() takes the first fold without materialising the other two.
kf = KFold(n_splits=3, shuffle=True, random_state=0)
train, test = next(kf.split(data))
Xtrain, Ytrain = X[train], Y[train]
Xtest, Ytest = X[test], Y[test]

from sklearn.neural_network import MLPClassifier
# Fit a multilayer perceptron with scikit-learn's default hyperparameters.
mlp = MLPClassifier()
mlp.fit(Xtrain,Ytrain)

/home/dkoes/.local/lib/python3.10/site-packages/sklearn/neural_network/_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  warnings.warn(

MLPClassifier()

MLPClassifier()

# Show the hyperparameter values the model was constructed with.
mlp.get_params()

{'activation': 'relu',
 'alpha': 0.0001,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (100,),
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_fun': 15000,
 'max_iter': 200,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': None,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}

# Plot the training loss curve recorded by the fitted model.
plt.plot(mlp.loss_curve_);

from sklearn import metrics

# Score the positive-class probabilities (column 1) on the training rows and on
# the held-out rows.
train_auc = metrics.roc_auc_score(Ytrain, mlp.predict_proba(Xtrain)[:, 1])
test_auc = metrics.roc_auc_score(Ytest, mlp.predict_proba(Xtest)[:, 1])
print("Test on Train AUC", train_auc)
print("Held out Test AUC", test_auc)

Test on Train AUC 0.9999416501342047
Held out Test AUC 0.5201485461441213

import numpy as np
import sklearn
from sklearn.model_selection import KFold

# Read the whitespace-separated file; comments=None turns off comment stripping.
data = np.genfromtxt('compounds.fp', comments=None)

# Column 1 holds the label; every column from index 2 onward is a feature.
Y = data[:, 1]
X = data[:, 2:]

# Use the first fold of a shuffled 3-fold split as a train/test partition.
# The shuffle is seeded so Restart & Run All reproduces the same split, and
# next() takes the first fold without materialising the other two.
kf = KFold(n_splits=3, shuffle=True, random_state=0)
train, test = next(kf.split(data))
Xtrain, Ytrain = X[train], Y[train]
Xtest, Ytest = X[test], Y[test]

# Fit a fresh default multilayer perceptron on the second data set.
mlp = MLPClassifier()
mlp.fit(Xtrain,Ytrain)

MLPClassifier()

MLPClassifier()

# Plot the training loss curve recorded by the fitted model.
plt.plot(mlp.loss_curve_);

# Score the positive-class probabilities (column 1) on the training rows and on
# the held-out rows.
train_auc = metrics.roc_auc_score(Ytrain, mlp.predict_proba(Xtrain)[:, 1])
test_auc = metrics.roc_auc_score(Ytest, mlp.predict_proba(Xtest)[:, 1])
print("Test on Train AUC", train_auc)
print("Held out Test AUC", test_auc)

Test on Train AUC 1.0
Held out Test AUC 0.9644503696319758

# Train one MLP per initial learning rate and overlay the loss curves.
# The lines are left unlabelled; they are drawn in the order the rates are listed.
for lr in [0.0001, 0.001, 0.01, 0.1]:
    mlp = MLPClassifier(max_iter=200, learning_rate_init=lr)
    mlp.fit(Xtrain, Ytrain)
    plt.plot(mlp.loss_curve_)
plt.xlabel('Iteration')
plt.ylabel('Loss');

%%html
<div id="lrcolor" style="width: 500px"></div>
<script>
    // In-class poll: attach the asker widget to the #lrcolor placeholder div above.
    // `chartmaker` is not defined in this cell; presumably it comes from preamble.js - TODO confirm.

    var divid = '#lrcolor';
	jQuery(divid).asker({
	    id: divid,
	    question: "Which color is from the smallest learning rate?",
		answers: ["blue","orange","red","green"],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
		charter: chartmaker})
    
// Hide every .jp-InputArea that has a descendant of class "o" whose text contains
// "html" (presumably the source of the %%html cells, so only the widget shows).
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();


</script>

%%html
<div id="lrcolor2" style="width: 500px"></div>
<script>
    // In-class poll: attach the asker widget to the #lrcolor2 placeholder div above.
    // `chartmaker` is not defined in this cell; presumably it comes from preamble.js - TODO confirm.

    var divid = '#lrcolor2';
	jQuery(divid).asker({
	    id: divid,
	    question: "Which color is from the largest learning rate?",
		answers: ["blue","orange","red","green"],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
		charter: chartmaker})
    
// Hide every .jp-InputArea that has a descendant of class "o" whose text contains
// "html" (presumably the source of the %%html cells, so only the widget shows).
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();


</script>

# Colour key for the sweep: one horizontal line per learning rate, placed at
# log10(rate) and labelled with the rate, drawn in the same order as the sweep.
for lr in [0.0001, 0.001, 0.01, 0.1]:
    heights = np.log10([lr, lr])
    plt.plot([0, 1], heights, label=f'{lr}')
plt.legend();

Neural Networks¶

2/15/2024¶

Perceptron¶

Perceptron¶

Perceptron¶

Neurons¶

Activation Functions: Step (Perceptron)¶

Activation Functions: Sigmoid (Logistic)¶

Activation Functions: tanh¶

Activation Functions: ReLU¶

Networks¶

Networks¶

Neural Networks¶

Stochastic Gradient Descent¶

Loss Functions¶

Softmax¶

Momentum¶

Nesterov Momentum¶

Adam¶

Backpropagation¶

Regularization¶

Dropout Regularization¶

Data Augmentation¶

How to Scale It?¶

Fine Grained Parallelism: SIMD¶

Fine Grained Parallelism: GPUs¶

CUDA Programming Model¶

CUDA Programming Model¶

CUDA Programming Model¶

Python CUDA¶

Python CUDA¶

Python CUDA¶

CUDA Programming Model¶

Memory Hierarchy¶

Custom Hardware for Matrix Multiply¶

Coarse Grained Parallelism (Asynchronous SGD)¶

Synchronous SGD¶

PyTorch Distributed SGD¶

Example¶

Example 2¶

Effect of learning rate¶

Let's Play!¶