Additional material:
Python package for reading and analyzing molecular dynamics trajectories.
A simulation ("universe") is defined by its topology file (prmtop) and trajectory (dcd or nc)
import MDAnalysis
traj1 = MDAnalysis.Universe('wt/wt_amber.prmtop', 'wt/wt_amber_1.dcd')
traj2 = MDAnalysis.Universe('wt/wt_amber.prmtop', 'wt/wt_amber_2.dcd')
traj3 = MDAnalysis.Universe('wt/wt_amber.prmtop', 'wt/wt_amber_3.dcd')
traj = MDAnalysis.Universe('wt/wt_amber.prmtop', ['wt/wt_amber_1.dcd','wt/wt_amber_2.dcd','wt/wt_amber_3.dcd'])
I have downsampled the original simulation and aligned the backbone.
cpptraj
script:
parm wt_amber.prmtop
trajin wt_amber_1_md3.nc 1 last 10
autoimage
rms first @CA
trajout wt_amber_1.dcd dcd nobox
traj1
<Universe with 22827 atoms>
traj1.trajectory
<DCDReader wt/wt_amber_1.dcd with 1000 frames of 22827 atoms>
Root mean squared deviation (RMSD) $$\sqrt{\frac{\sum_i^n(x_i^a-x_i^b)^2+(y_i^a-y_i^b)^2+(z_i^a-z_i^b)^2}{n}}$$
from MDAnalysis.analysis.rms import rmsd #pull in rmsd function
prot = traj.select_atoms("protein") #only care about protein, not water/ions
refcoord = prot.positions #save initial coordinates
rmsds = [rmsd(refcoord,prot.positions) for ts in traj.trajectory] #coordinates implicitly update as you iterate
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
plt.plot(rmsds)
[<matplotlib.lines.Line2D at 0x7fa0c6c25880>]
What does this tell us?
traj.trajectory[-1] #set to last frame
lastcoord = prot.positions
lastrmsds = [rmsd(lastcoord,prot.positions) for ts in traj.trajectory]
plt.plot(rmsds,label='first')
plt.plot(lastrmsds,label='last')
plt.legend();
from MDAnalysis.analysis.align import AlignTraj
#align to loop
align = AlignTraj(traj, traj, select='backbone',filename='aligned.dcd')
align.run()
<MDAnalysis.analysis.align.AlignTraj at 0x7fa07f342c10>
traj = MDAnalysis.Universe('wt/wt_amber.prmtop','aligned.dcd')
prot = traj.select_atoms("protein") #only care about protein, not water/ions
refcoord = prot.positions #save initial coordinates
rmsds = [rmsd(refcoord,prot.positions) for ts in traj.trajectory]
traj.trajectory[-1] #set to last frame
lastcoord = prot.positions
lastrmsds = [rmsd(lastcoord,prot.positions) for ts in traj.trajectory]
plt.plot(rmsds,label='first')
plt.plot(lastrmsds,label='last')
plt.legend();
#loading all the coordinates into memory takes about 1GB
coords = [prot.positions for ts in traj.trajectory]
import multiprocessing
n = len(coords)
def row_rmsds(i):
return [rmsd(coords[i],coords[j]) for j in range(n)]
pool = multiprocessing.Pool()
rmat = pool.map(row_rmsds,range(n))
Making the above twice as fast is left as an exercise for the reader.
import seaborn as sns
rmat = np.array(rmat)
sns.heatmap(rmat,square=True,xticklabels=n,yticklabels=n,cmap='YlGnBu',cbar_kws={'label':'RMSD'},vmin=0);
What does this tell us?
from MDAnalysis.analysis import pca
pc = pca.PCA(traj, select='backbone').run()
plt.plot(pc.cumulated_variance)
[<matplotlib.lines.Line2D at 0x7fa0bbf6dc70>]
backbone = traj.select_atoms('backbone')
backbone
<AtomGroup with 472 atoms>
plt.plot(pc.cumulated_variance[:100])
[<matplotlib.lines.Line2D at 0x7fa0ca91f5b0>]
transformed = pc.transform(backbone, n_components=10)
traj.trajectory
<DCDReader aligned.dcd with 3000 frames of 22827 atoms>
import pandas as pd
df = pd.DataFrame(transformed,
columns=['PC{}'.format(i+1) for i in range(10)])
df['sim'] = df.index//1000
df['Time (ps)'] = df.index * u.trajectory.dt
df
PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | sim | Time (ps) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -2.901878 | -1.381989 | -15.505499 | 0.713734 | 1.151404 | 3.032116 | 0.519691 | 0.298446 | 2.686070 | -0.059783 | 0 | 0.000000 |
1 | -2.390118 | -4.101201 | -13.187061 | 3.305353 | 0.362939 | -1.099120 | 1.617562 | -2.974982 | 4.230857 | -4.545147 | 0 | 0.000049 |
2 | -3.225154 | -3.136984 | -12.841184 | 2.945615 | 0.070979 | -4.939210 | 1.841554 | -1.970189 | -1.335142 | -1.088082 | 0 | 0.000098 |
3 | -9.836594 | -0.372700 | -14.308532 | 6.051637 | 1.660262 | -1.978671 | 0.584544 | -1.687415 | -2.311352 | 0.821889 | 0 | 0.000147 |
4 | -12.443729 | -4.770219 | -11.416638 | 6.836731 | 0.046444 | -7.337171 | -1.570999 | -3.827739 | -2.669364 | 4.712605 | 0 | 0.000196 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2995 | 7.721263 | 5.326934 | -4.120338 | -9.001223 | 0.281569 | 0.095127 | -1.617441 | -8.948532 | -3.191316 | -1.071643 | 2 | 0.146420 |
2996 | 9.301868 | 4.756724 | -1.827657 | -5.620303 | 0.258789 | 0.510859 | -0.734844 | -7.921880 | -4.532978 | 2.652904 | 2 | 0.146469 |
2997 | 5.053161 | 7.434399 | -1.242725 | -2.823438 | -3.119645 | -1.771552 | -0.519824 | -4.979009 | -5.458433 | -1.898075 | 2 | 0.146518 |
2998 | 7.672216 | 10.353063 | 0.568196 | -5.735955 | -1.294599 | -0.749878 | 0.627956 | -7.772218 | -4.710042 | 2.179812 | 2 | 0.146567 |
2999 | 0.616314 | 3.146817 | 1.163139 | -2.191043 | -1.776522 | -4.887624 | -0.758012 | -6.467207 | -5.361442 | -2.063734 | 2 | 0.146616 |
3000 rows × 12 columns
g = sns.pairplot(df,vars=('PC1','PC2','PC3','PC4','PC5'),hue='sim')