Additional material:
Python package for reading and analyzing molecular dynamics trajectories.
A simulation ("universe") is defined by its topology file (prmtop) and trajectory (dcd or nc)
import MDAnalysis
traj1 = MDAnalysis.Universe('wt/wt_amber.prmtop', 'wt/wt_amber_1.dcd')
traj2 = MDAnalysis.Universe('wt/wt_amber.prmtop', 'wt/wt_amber_2.dcd')
traj3 = MDAnalysis.Universe('wt/wt_amber.prmtop', 'wt/wt_amber_3.dcd')
traj = MDAnalysis.Universe('wt/wt_amber.prmtop', ['wt/wt_amber_1.dcd','wt/wt_amber_2.dcd','wt/wt_amber_3.dcd'])
I have downsampled the original simulation and aligned the backbone.
cpptraj
script:
parm wt_amber.prmtop
trajin wt_amber_1_md3.nc 1 last 10
autoimage
rms first @CA
trajout wt_amber_1.dcd dcd nobox
traj1
<Universe with 22827 atoms>
traj1.trajectory
<DCDReader wt/wt_amber_1.dcd with 1000 frames of 22827 atoms>
Root mean squared deviation (RMSD) $$\sqrt{\frac{\sum_i^n(x_i^a-x_i^b)^2+(y_i^a-y_i^b)^2+(z_i^a-z_i^b)^2}{n}}$$
from MDAnalysis.analysis.rms import rmsd #pull in rmsd function
prot = traj.select_atoms("protein") #only care about protein, not water/ions
refcoord = prot.positions #save initial coordinates
rmsds = [rmsd(refcoord,prot.positions) for ts in traj.trajectory] #coordinates implicitly update as you iterate
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
plt.plot(rmsds)
[<matplotlib.lines.Line2D at 0x7fa0c6c25880>]
What does this tell us?
traj.trajectory[-1] #set to last frame
lastcoord = prot.positions
lastrmsds = [rmsd(lastcoord,prot.positions) for ts in traj.trajectory]
plt.plot(rmsds,label='first')
plt.plot(lastrmsds,label='last')
plt.legend();
from MDAnalysis.analysis.align import AlignTraj
#align to loop
align = AlignTraj(traj, traj, select='backbone',filename='aligned.dcd')
align.run()
<MDAnalysis.analysis.align.AlignTraj at 0x7fa07f342c10>
traj = MDAnalysis.Universe('wt/wt_amber.prmtop','aligned.dcd')
prot = traj.select_atoms("protein") #only care about protein, not water/ions
refcoord = prot.positions #save initial coordinates
rmsds = [rmsd(refcoord,prot.positions) for ts in traj.trajectory]
traj.trajectory[-1] #set to last frame
lastcoord = prot.positions
lastrmsds = [rmsd(lastcoord,prot.positions) for ts in traj.trajectory]
plt.plot(rmsds,label='first')
plt.plot(lastrmsds,label='last')
plt.legend();
#loading all the coordinates into memory takes about 1GB
coords = [prot.positions for ts in traj.trajectory]
import multiprocessing
n = len(coords)
def row_rmsds(i):
return [rmsd(coords[i],coords[j]) for j in range(n)]
pool = multiprocessing.Pool()
rmat = pool.map(row_rmsds,range(n))
Making the above twice as fast is left as an exercise for the reader.
import seaborn as sns
rmat = np.array(rmat)
sns.heatmap(rmat,square=True,xticklabels=n,yticklabels=n,cmap='YlGnBu',cbar_kws={'label':'RMSD'},vmin=0);
What does this tell us?
from MDAnalysis.analysis import pca
pc = pca.PCA(traj, select='backbone').run()
plt.plot(pc.cumulated_variance)
[<matplotlib.lines.Line2D at 0x7fa0bbf6dc70>]
backbone = traj.select_atoms('backbone')
backbone
<AtomGroup with 472 atoms>
plt.plot(pc.cumulated_variance[:100])
[<matplotlib.lines.Line2D at 0x7fa0ca91f5b0>]
transformed = pc.transform(backbone, n_components=10)
traj.trajectory
<DCDReader aligned.dcd with 3000 frames of 22827 atoms>
import pandas as pd
df = pd.DataFrame(transformed,
columns=['PC{}'.format(i+1) for i in range(10)])
df['sim'] = df.index//1000
df['Time (ps)'] = df.index * u.trajectory.dt
df
PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | sim | Time (ps) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -2.901878 | -1.381989 | -15.505499 | 0.713734 | 1.151404 | 3.032116 | 0.519691 | 0.298446 | 2.686070 | -0.059783 | 0 | 0.000000 |
1 | -2.390118 | -4.101201 | -13.187061 | 3.305353 | 0.362939 | -1.099120 | 1.617562 | -2.974982 | 4.230857 | -4.545147 | 0 | 0.000049 |
2 | -3.225154 | -3.136984 | -12.841184 | 2.945615 | 0.070979 | -4.939210 | 1.841554 | -1.970189 | -1.335142 | -1.088082 | 0 | 0.000098 |
3 | -9.836594 | -0.372700 | -14.308532 | 6.051637 | 1.660262 | -1.978671 | 0.584544 | -1.687415 | -2.311352 | 0.821889 | 0 | 0.000147 |
4 | -12.443729 | -4.770219 | -11.416638 | 6.836731 | 0.046444 | -7.337171 | -1.570999 | -3.827739 | -2.669364 | 4.712605 | 0 | 0.000196 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2995 | 7.721263 | 5.326934 | -4.120338 | -9.001223 | 0.281569 | 0.095127 | -1.617441 | -8.948532 | -3.191316 | -1.071643 | 2 | 0.146420 |
2996 | 9.301868 | 4.756724 | -1.827657 | -5.620303 | 0.258789 | 0.510859 | -0.734844 | -7.921880 | -4.532978 | 2.652904 | 2 | 0.146469 |
2997 | 5.053161 | 7.434399 | -1.242725 | -2.823438 | -3.119645 | -1.771552 | -0.519824 | -4.979009 | -5.458433 | -1.898075 | 2 | 0.146518 |
2998 | 7.672216 | 10.353063 | 0.568196 | -5.735955 | -1.294599 | -0.749878 | 0.627956 | -7.772218 | -4.710042 | 2.179812 | 2 | 0.146567 |
2999 | 0.616314 | 3.146817 | 1.163139 | -2.191043 | -1.776522 | -4.887624 | -0.758012 | -6.467207 | -5.361442 | -2.063734 | 2 | 0.146616 |
3000 rows × 12 columns
g = sns.pairplot(df,vars=('PC1','PC2','PC3','PC4','PC5'),hue='sim')
How much does a residue deviate from its mean coordinates?
from MDAnalysis.analysis import align
from MDAnalysis.analysis.rms import RMSF
def get_rmsf(t,label=None):
sel = 'protein and not name H*'
prot = t.select_atoms(sel)
average_coordinates = t.trajectory.timeseries(asel=prot).mean(axis=1)
# make a reference structure (need to reshape into a 1-frame "trajectory")
reference = MDAnalysis.Merge(prot).load_new(average_coordinates[:, None, :], order="afc")
aligner = align.AlignTraj(t, reference, select=sel, in_memory=True).run()
rmsfer = RMSF(prot).run()
ret = pd.DataFrame(zip(prot.resnums,rmsfer.rmsf),columns=('resid','rmsf'))
if label != None:
ret['label'] = label
return ret
rmsf1 = get_rmsf(traj1,1)
rmsf2 = get_rmsf(traj2,2)
rmsf3 = get_rmsf(traj3,3)
rmsf = pd.concat([rmsf1,rmsf2,rmsf3])
/usr/local/lib/python3.8/dist-packages/MDAnalysis/core/topologyattrs.py:2011: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray np.array(sorted(unique_bonds)), 4)
import warnings; warnings.simplefilter('ignore')
fig, ax = plt.subplots(figsize=(14,3))
sns.swarmplot(data=rmsf,x='resid',y='rmsf',hue='label',dodge=True,size=1,ax=ax)
ax.set_xticklabels(ax.get_xticks(), rotation = 90);
d = rmsf.groupby(['resid','label']).mean().reset_index()
fig, ax = plt.subplots(figsize=(14,3))
sns.swarmplot(data=d,x='resid',y='rmsf',hue='label',ax=ax)
ax.set_xticklabels(ax.get_xticks(), rotation = 90);
Given multiple samples from distributions we can compare to the null hypothesis that the two samples have the same mean.
from scipy.stats import ttest_ind
ttest_ind([0.9,1,1.1],[1,1,1])
Ttest_indResult(statistic=0.0, pvalue=1.0)
ttest_ind([0.9,1,1.1],[2,1.5,3])
Ttest_indResult(statistic=-2.623360911485515, pvalue=0.058592469474123776)
Even with only three samples, we can compute statistical significance - but need another distribution to compare against (the mutation simulations).
A contact just means two atoms are within a specified distance of one another.
from MDAnalysis.analysis import distances
protein = t.select_atoms('protein')
self_distances = distances.self_distance_array(protein.positions)
self_distances.shape
(1675365,)
n = len(protein)
sq_dist_arr = np.zeros((n, n))
triu = np.triu_indices_from(sq_dist_arr, k=1)
sq_dist_arr[triu] = self_distances
sq_dist_arr.T[triu] = self_distances
plt.figure(figsize=(8,8))
sns.heatmap(sq_dist_arr,square=True,xticklabels=n,yticklabels=n,cmap='YlGnBu',cbar_kws={'label':'RMSD'},vmin=0);
Given a contact distance of 2A let's count the number of intra-molecular contacts each residue is making.
We are applying a hard cutoff. Soft cutoffs could also be implemented.
cutoff = 2.0
contacts = np.where(sq_dist_arr < cutoff, 1, 0)
df = pd.DataFrame(zip(protein.resids,contacts.sum(axis=0)),columns=('resid','contacts'))
rescontacts = df.groupby('resid').sum().reset_index()
rescontacts
resid | contacts | |
---|---|---|
0 | 1 | 83 |
1 | 2 | 42 |
2 | 3 | 53 |
3 | 4 | 54 |
4 | 5 | 52 |
... | ... | ... |
113 | 114 | 72 |
114 | 115 | 38 |
115 | 116 | 70 |
116 | 117 | 37 |
117 | 118 | 26 |
118 rows × 2 columns
def get_rescontacts(protein,cutoff=2):
'''Given an mdanalysis selection and contact distance cutoff,
count the number of internal contacts for each residue.'''
n = len(protein)
#calc distances
self_distances = distances.self_distance_array(protein.positions)
#make square array
sq_dist_arr = np.zeros((n, n))
triu = np.triu_indices_from(sq_dist_arr, k=1)
sq_dist_arr[triu] = self_distances
sq_dist_arr.T[triu] = self_distances
#binarize the matrix
contacts = np.where(sq_dist_arr < cutoff, 1, 0)
df = pd.DataFrame(zip(protein.resids,contacts.sum(axis=0)),columns=('resid','contacts'))
rescontacts = df.groupby('resid').sum().reset_index()
return rescontacts
Is this counting the number of atoms within cutoff of each residue?
traj1contacts = []
protein = traj1.select_atoms('protein')
for ts in traj1.trajectory:
rc = get_rescontacts(protein)
traj1contacts.append(rc.contacts.to_numpy())
traj2contacts = []
protein = traj2.select_atoms('protein')
for ts in traj2.trajectory:
rc = get_rescontacts(protein)
traj2contacts.append(rc.contacts.to_numpy())
traj1contacts = np.array(traj1contacts)
traj2contacts = np.array(traj2contacts)
Parallelizing the above is left as an exercise for the reader.
plt.figure(figsize=(14,3))
plt.plot(traj1contacts.mean(axis=0),'o',label='traj1')
plt.plot(traj2contacts.mean(axis=0),'o',label='traj2',alpha=.5)
plt.xlabel('res index')
plt.ylabel('mean res contacts');
traj1contacts[:,0][:10]
array([83, 84, 82, 84, 85, 84, 84, 83, 85, 85])
traj2contacts[:,0][:10]
array([83, 83, 85, 84, 82, 84, 82, 82, 83, 83])
plt.figure(figsize=(14,3))
plt.plot(traj1contacts.mean(axis=0)-traj1contacts[0,:],'o',label='traj1')
plt.plot(traj2contacts.mean(axis=0)-traj1contacts[0,:],'o',label='traj2',alpha=.5)
plt.xlabel('res index')
plt.ylabel('mean diff contacts');
We are comparing means - more informative would be to compare the underlying distributions.
def plt_resc_hist(i):
low = min(traj1contacts[:,i].min(),traj2contacts[:,i].min())
hi = max(traj1contacts[:,i].max(),traj2contacts[:,i].max())+1
plt.hist(traj1contacts[:,i],bins=range(low,hi),alpha=.5)
plt.hist(traj2contacts[:,i],bins=range(low,hi),alpha=.5);
plt_resc_hist(75)
plt_resc_hist(80)
from scipy.stats import ks_2samp
help(ks_2samp)
Help on function ks_2samp in module scipy.stats.stats: ks_2samp(data1, data2, alternative='two-sided', mode='auto') Compute the Kolmogorov-Smirnov statistic on 2 samples. This is a two-sided test for the null hypothesis that 2 independent samples are drawn from the same continuous distribution. The alternative hypothesis can be either 'two-sided' (default), 'less' or 'greater'. Parameters ---------- data1, data2 : sequence of 1-D ndarrays Two arrays of sample observations assumed to be drawn from a continuous distribution, sample sizes can be different. alternative : {'two-sided', 'less', 'greater'}, optional Defines the alternative hypothesis (see explanation above). Default is 'two-sided'. mode : {'auto', 'exact', 'asymp'}, optional Defines the method used for calculating the p-value. Default is 'auto'. - 'exact' : use approximation to exact distribution of test statistic - 'asymp' : use asymptotic distribution of test statistic - 'auto' : use 'exact' for small size arrays, 'asymp' for large. Returns ------- statistic : float KS statistic pvalue : float two-tailed p-value Notes ----- This tests whether 2 samples are drawn from the same distribution. Note that, like in the case of the one-sample K-S test, the distribution is assumed to be continuous. In the one-sided test, the alternative is that the empirical cumulative distribution function F(x) of the data1 variable is "less" or "greater" than the empirical cumulative distribution function G(x) of the data2 variable, ``F(x)<=G(x)``, resp. ``F(x)>=G(x)``. If the K-S statistic is small or the p-value is high, then we cannot reject the hypothesis that the distributions of the two samples are the same. If the mode is 'auto', the computation is exact if the sample sizes are less than 10000. For larger sizes, the computation uses the Kolmogorov-Smirnov distributions to compute an approximate value. We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk [1]_. References ---------- .. [1] Hodges, J.L. Jr., "The Significance Probability of the Smirnov Two-Sample Test," Arkiv fiur Matematik, 3, No. 43 (1958), 469-86. Examples -------- >>> from scipy import stats >>> np.random.seed(12345678) #fix random seed to get the same result >>> n1 = 200 # size of first sample >>> n2 = 300 # size of second sample For a different distribution, we can reject the null hypothesis since the pvalue is below 1%: >>> rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1) >>> rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5) >>> stats.ks_2samp(rvs1, rvs2) (0.20833333333333334, 5.129279597781977e-05) For a slightly different distribution, we cannot reject the null hypothesis at a 10% or lower alpha since the p-value at 0.144 is higher than 10% >>> rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0) >>> stats.ks_2samp(rvs1, rvs3) (0.10333333333333333, 0.14691437867433876) For an identical distribution, we cannot reject the null hypothesis since the p-value is high, 41%: >>> rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0) >>> stats.ks_2samp(rvs1, rvs4) (0.07999999999999996, 0.41126949729859719)
ks_2samp(traj1contacts[:,75],traj2contacts[:,75])
Ks_2sampResult(statistic=0.15, pvalue=3.143161402242108e-10)
ks_2samp(traj1contacts[:,80],traj2contacts[:,80])
Ks_2sampResult(statistic=0.015, pvalue=0.9998734489003387)
ks = []
for i in range(traj1contacts.shape[1]):
ks.append(ks_2samp(traj1contacts[:,i],traj2contacts[:,i]))
ks = np.array(ks)
plt.figure(figsize=(14,3))
plt.scatter(range(len(ks)),ks[:,0],c=ks[:,1])
cbar = plt.colorbar()
cbar.set_label("pvalue")
plt.xlabel('res index')
plt.ylabel('test statistic');
Water is important. Does the per-residue solvent accessibility change? Let's count contacts with the explicit water molecules in the simulation.
def get_res_waters(traj,cutoff=2):
'''Given an mdanalysis trajectory and contact distance cutoff,
count the number of water molecules within cutoff of each residue'''
protein = traj.select_atoms('protein')
resids = np.unique(protein.resids)
ret = []
for rid in resids:
wat = traj.select_atoms('resname WAT and around %f resid %d'%(cutoff,rid))
ret.append((rid,len(wat)))
return pd.DataFrame(ret,columns=('resid','waters'))
get_res_waters(traj1)
resid | waters | |
---|---|---|
0 | 1 | 0 |
1 | 2 | 3 |
2 | 3 | 4 |
3 | 4 | 2 |
4 | 5 | 2 |
... | ... | ... |
113 | 114 | 0 |
114 | 115 | 0 |
115 | 116 | 0 |
116 | 117 | 1 |
117 | 118 | 4 |
118 rows × 2 columns
def traj1_frame_waters(i):
traj1.trajectory[i]
rc = get_res_waters(traj1)
return rc.waters.to_numpy()
pool = multiprocessing.Pool()
traj1waters = pool.map(traj1_frame_waters,range(traj1.trajectory.n_frames))
def traj2_frame_waters(i):
traj2.trajectory[i]
rc = get_res_waters(traj2)
return rc.waters.to_numpy()
pool = multiprocessing.Pool()
traj2waters = pool.map(traj2_frame_waters,range(traj2.trajectory.n_frames))
traj1waters = np.array(traj1waters)
traj2waters = np.array(traj2waters)
We can do the same sorts of analyses as with contacts:
plt.figure(figsize=(14,3))
plt.plot(traj1waters.mean(axis=0),'o',label='traj1')
plt.plot(traj2waters.mean(axis=0),'o',label='traj2',alpha=.5)
plt.xlabel('res index')
plt.ylabel('mean res waters');
traj1.add_TopologyAttr('tempfactors')
protein = traj1.select_atoms('protein')
for residue, nwats in zip(protein.residues, traj1waters.mean(axis=0)):
residue.atoms.tempfactors = nwats
protein.write('wt_wat.pdb')
import py3Dmol
v = py3Dmol.view()
v.addModel(open('wt_wat.pdb').read())
v.setStyle({'cartoon': {'colorscheme': {'prop':'b','gradient':'sinebow'}},
'stick': {'colorscheme': {'prop':'b','gradient':'sinebow'}}})
v.zoomTo()
You appear to be running in JupyterLab (or JavaScript failed to load for some other reason). You need to install the 3dmol extension:
jupyter labextension install jupyterlab_3dmol
<py3Dmol.view at 0x7f9ebdbcfc70>