基于机器学习的化合物活性预测模型
【摘要】
利用化合物的结构与活性数据,基于RDKit和Python3的机器学习活性预测模型小示例。
代码示例:
#导入必须的包#!/usr/bin/env python3from rdkit.Chem import Descriptorsfrom rdkit.Chem import AllChem as chfrom rdkit.Chem import Draw as d...
利用化合物的结构与活性数据,基于RDKit和Python3的机器学习活性预测模型小示例。
代码示例:
-
#导入必须的包
-
#!/usr/bin/env python3
-
from rdkit.Chem import Descriptors
-
from rdkit.Chem import AllChem as ch
-
from rdkit.Chem import Draw as d
-
from rdkit import DataStructs
-
import pandas as pd
-
from rdkit.Chem import rdMolDescriptors as rdescriptors
-
from matplotlib.mlab import PCA
-
import matplotlib.pyplot as plt
-
import csv
-
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
-
import sklearn
-
from rdkit.Chem import PandasTools, Descriptors, MolFromSmiles
-
from pandas import DataFrame
-
from sklearn.model_selection import train_test_split
-
from sklearn import svm
-
import numpy as np
-
from sklearn.metrics import mean_squared_error
-
import matplotlib.pyplot as plt
-
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
-
#载入数据
-
with open ('receptor-bioactivity.txt', 'r') as f:
-
content_raw = list((csv.reader(f, delimiter = '\t')))
-
len(content_raw)
-
#移除重复数据
-
content=[]
-
for i in range(0,len(content_raw)):
-
if i == 0:
-
chembl_id=content_raw[i][0]
-
content.append(content_raw[i])
-
elif content_raw[i][0]!=chembl_id:
-
chembl_id=content_raw[i][0]
-
content.append(content_raw[i])
-
#提取有用信息
-
names = []
-
smiles = []
-
activity = []
-
mols = []
-
for i in range(1,len(content)):
-
names.append(content[i][0])
-
smiles.append(content[i][5])
-
activity.append(content[i][7])
-
mols.append(ch.MolFromSmiles(content[i][5]))
-
#为深入分析创建数据集
-
def HBA(mol):
-
return rdescriptors.CalcNumLipinskiHBA(mol)
-
def HBD(mol):
-
return rdescriptors.CalcNumLipinskiHBD(mol)
-
def logP(mol):
-
return Descriptors.MolLogP(mol)
-
def TPSA(mol):
-
return Descriptors.TPSA(mol)
-
def num_rotatable_bonds(mol):
-
return Descriptors.NumRotatableBonds(mol)
-
def num_heavy_atoms(mol):
-
return mol.GetNumHeavyAtoms()
-
def MW(mol):
-
return Descriptors.MolWt(mol)
-
data=[]
-
for i,mol in enumerate(mols):
-
data.append([names[i], float(activity[i]), HBA(mol), HBD(mol), float(MW(mol)), logP(mol),float(TPSA(mol)),num_rotatable_bonds(mol),num_heavy_atoms(mol),smiles[i]])
-
dataframe=pd.DataFrame(data,columns=["CHEMBL_ID", "activity", "HBA", "HBD", "MW", "logP", "TPSA","rb",'smiles'])
-
dataframe.set_index("CHEMBL_ID",inplace=True)
PCA分析,数据降维也称主成分分析
-
#PCA分析
-
pca1=PCA(dataframe.drop(['smiles'],axis=1))
-
plt.rcParams["figure.figsize"] = [15, 15]
-
plt.style.use('ggplot')
-
fig = plt.figure()
-
ax = fig.add_subplot(111)
-
ax.set_title('Ghrelin Receptor Homo sapiens PCA')
-
ax.set_xlabel('PC1')
-
ax.set_ylabel('PC2')
-
X = [x[0] for x in pca1.Y]
-
Y = [y[1] for y in pca1.Y]
-
plt.scatter(X,Y)
-
plt.show()
-
#运用随机森林模型,并为其选择有用数据
-
model=dataframe.loc[:,["smiles", "activity"]]
-
desc_list = Descriptors.descList
-
model["pic50"] = model.activity.apply(lambda x : -1.0 * np.log10(x / 1.0e9))
-
for desc_name, function in desc_list:
-
values = []
-
for smiles in model["smiles"]:
-
mol = MolFromSmiles(smiles)
-
values.append(function(mol))
-
model[desc_name] = values
-
columns = [x[0] for x in desc_list[:30]]
-
#划分数据集,训练模型
-
train_data, test_data = train_test_split(model, test_size=0.3)
-
model2 = RandomForestRegressor(n_estimators=15)
-
model2.fit(train_data[columns], train_data["pic50"])
-
#测试模型,绘图
-
plt.rcParams["figure.figsize"] = [15, 15]
-
span = (1,12)
-
axes = plt.gca()
-
axes.set_xlim(span)
-
axes.set_ylim(span)
-
plt.plot((span[0],span[1]), (span[0],span[1]), linestyle='--')
-
plt.scatter(
-
test_data["pic50"]
-
, model2.predict(test_data[columns])
-
, c='blue'
-
, s=20
-
)
-
plt.show()
文章来源: drugai.blog.csdn.net,作者:DrugAI,版权归原作者所有,如需转载,请联系作者。
原文链接:drugai.blog.csdn.net/article/details/105683679
【版权声明】本文为华为云社区用户转载文章,如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱:
cloudbbs@huaweicloud.com
- 点赞
- 收藏
- 关注作者
评论(0)