Fetch the compound activity information from BindingDB, PubChem, and ChEMBL for the human protein targets that have UniProt accessions, and build a neural network on top of it.
The databases to scrape:
ChEMBL
ChEMBL is the most mature of the three for this kind of data and the easiest to download from, so it was scraped first, using its official API client: chembl_webresource_client.
The tutorial is at: https://github.com/chembl/chembl_webresource_client
A preprocessed table containing the UniProt and target IDs is assumed; path_1 is its path and out_path is the output directory (set both yourself). The code:
import pandas as pd
from chembl_webresource_client.new_client import new_client
import os
import time
from multiprocessing import Pool

def get_ac(t_id, out_path):
    # fetch all activity records associated with one target and dump them to CSV
    print("{} started".format(t_id))
    activity = new_client.activity
    activities = activity.filter(target_chembl_id=t_id)
    activities = [i for i in activities]
    time.sleep(1)
    df_data = pd.DataFrame(activities)
    df_data.to_csv(os.path.join(out_path, t_id + '.csv'))
    print("{} finished".format(t_id))

if __name__ == '__main__':
    out_path = "..."  # set this yourself
    path_1 = "..."    # preprocessed table with UniProt and ChEMBL target IDs
    data_existed = os.listdir(out_path)
    data_existed = [i.split('.')[0] for i in data_existed]
    target_ids = pd.read_csv(path_1)['ChEMBL ID']
    pool = Pool(processes=8)
    for t_id in target_ids:
        if t_id in data_existed:
            print('{} already exists in {}'.format(t_id, out_path))
            continue
        else:
            pool.apply_async(func=get_ac, args=(t_id, out_path))
    pool.close()
    pool.join()
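Incidentally, if only a few fields are needed, the client can filter and project on the server side, which cuts download time considerably. A minimal sketch (CHEMBL203 is just an example target; the standard_type value and the field list are my own choices, adjust as needed):

from chembl_webresource_client.new_client import new_client
import pandas as pd

activity = new_client.activity
# restrict to IC50 records for one target; .only() asks the web service
# to return just the listed fields instead of the full activity record
res = activity.filter(target_chembl_id='CHEMBL203', standard_type='IC50')
res = res.only(['molecule_chembl_id', 'canonical_smiles', 'standard_value', 'standard_units'])
df = pd.DataFrame(list(res))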
BindingDB
Next, BindingDB. The first choice was its web-service API, which turned out to be underwhelming: the first endpoint returns incomplete results, and rather than fight it, a second, complementary endpoint is used as a fallback:
import re
import requests
import pandas as pd
import os
import time
from multiprocessing import Pool

def getHTMLText(url):
    # plain GET with a timeout; return '' on any failure
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        return ""

def get_uni_info(uni, out_path):
    file = uni + '.csv'
    try:
        # first endpoint: getLigandsByUniprots
        url = r"http://www.bindingdb.org/axis2/services/BDBService/getLigandsByUniprots?uniprot={}&code=0&response=application/json".format(uni)
        HTML = getHTMLText(url)
        time.sleep(0.5)
        type_list = re.findall(r'<affinity_type>([\w\W]*?)</affinity_type>', HTML)
        smile_list = re.findall(r'<smile>([\w\W]*?)</smile>', HTML)
        affinity_list = [i.strip() for i in re.findall(r'<affinity>([\w\W]*?)</affinity>', HTML)]
        mol_id_list = re.findall(r'<monomerid>([\w\W]*?)</monomerid>', HTML)
        if type_list != []:
            uni_dict = {'UNIPROT': [uni] * len(type_list),
                        'MOL_ID': mol_id_list,
                        'SMILE': smile_list,
                        'TYPE': type_list,
                        'AFFINITY': affinity_list}
            pd.DataFrame(uni_dict).to_csv(os.path.join(out_path, file))
            print("Uniprot:{} download finished".format(uni))
            return
    except Exception:
        pass
    # the first endpoint failed or returned nothing; try the complementary endpoint
    try:
        url = r"http://www.bindingdb.org/axis2/services/BDBService/getLigandsByUniprot?uniprot={}".format(uni)
        HTML = getHTMLText(url)
        time.sleep(0.5)
        type_list = re.findall(r'<bdb:affinity_type>([\w\W]*?)</bdb:affinity_type>', HTML)
        smile_list = re.findall(r'<bdb:smiles>([\w\W]*?)</bdb:smiles>', HTML)
        affinity_list = [i.strip() for i in re.findall(r'<bdb:affinity>([\w\W]*?)</bdb:affinity>', HTML)]
        mol_id_list = re.findall(r'<bdb:monomerid>([\w\W]*?)</bdb:monomerid>', HTML)
        uni_dict = {'UNIPROT': [uni] * len(type_list),
                    'MOL_ID': mol_id_list,
                    'SMILE': smile_list,
                    'TYPE': type_list,
                    'AFFINITY': affinity_list}
        pd.DataFrame(uni_dict).to_csv(os.path.join(out_path, file))
        print("Uniprot:{} download finished".format(uni))
    except Exception:
        print("Uniprot:{} can not be found".format(uni))
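get_uni_info handles a single accession; a driver in the same style as the ChEMBL one can fan it out over the UniProt list (path_1 and the 'UniProt ID' column name below are placeholders for your own table):

if __name__ == '__main__':
    out_path = "..."  # set this yourself
    path_1 = "..."    # preprocessed table with the UniProt accessions
    uniprots = pd.read_csv(path_1)['UniProt ID']  # hypothetical column name
    pool = Pool(processes=8)
    for uni in uniprots:
        pool.apply_async(func=get_uni_info, args=(uni, out_path))
    pool.close()
    pool.join()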
The kicker: even with both endpoints combined, coverage is still incomplete. For some UniProt accessions the site genuinely has no data, but others do exist on the website and are simply missing from the API. With no other option, the result pages had to be scraped directly:
from lxml import etree

def GetInfoFromBindingDB(uniprot):
    """Scrape the BindingDB search-result page for one UniProt accession."""
    def _getinfo(TREE, XPATH):
        return TREE.xpath(XPATH)
    ori_url = 'https://www.bindingdb.org/bind/tabLuceneResult.jsp?thisInput={}&submit=Go'.format(uniprot)
    s = requests.Session()
    response = s.get(ori_url)
    html = response.text
    tree = etree.HTML(html)
    num = _getinfo(tree, XPATH='//*[@class="red"]/text()')[0]  # total number of compounds under this UniProt
    new_url = ''.join([response.url, '&Increment={}'.format(num)])  # request all rows at once, so no pagination is needed
    response = s.get(new_url, timeout=120)
    html = response.text
    tree = etree.HTML(html)
    single = _getinfo(tree, XPATH='//*[@class="single"]')[0::3]
    double = _getinfo(tree, XPATH='//*[@class="double"]')[0::3]
    smile = [j.xpath('td[2]//*[@style="display:none"]')[0].xpath('string(.)') for item in [single, double] for j in item]
    mid = [j.xpath('td[2]/a[1]/text()')[0] for item in [single, double] for j in item]
    ki = [j.xpath('*[@align="center"]')[0].xpath('string(.)') for item in [single, double] for j in item]
    IC50 = [j.xpath('*[@align="center"]')[2].xpath('string(.)') for item in [single, double] for j in item]
    kd = [j.xpath('*[@align="center"]')[3].xpath('string(.)') for item in [single, double] for j in item]
    EC50_IC50 = [j.xpath('*[@align="center"]')[4].xpath('string(.)') for item in [single, double] for j in item]
    out = pd.DataFrame({'UNIPROT': [uniprot] * len(smile) * 4,
                        'MOL_ID': mid * 4,
                        'SMILE': smile * 4,
                        'TYPE': ['Ki'] * len(ki) + ['IC50'] * len(IC50) + ['Kd'] * len(kd) + ['EC50/IC50'] * len(EC50_IC50),
                        'AFFINITY': [i for j in [ki, IC50, kd, EC50_IC50] for i in j]})
    return out
# Thanks to my senior kotori-Y for the guidance; his GitHub: https://github.com/kotori-y
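One possible way to drive the scraper, writing each accession to the same CSV schema as the API path (uniprots and out_path are placeholders):

for uni in uniprots:  # hypothetical list of accessions the API missed
    try:
        GetInfoFromBindingDB(uni).to_csv(os.path.join(out_path, uni + '.csv'))
        print("Uniprot:{} scraped".format(uni))
    except Exception as e:
        print("Uniprot:{} failed: {}".format(uni, e))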
PubChem
PubChem's pages are rendered with JavaScript, so Selenium is used to scrape them.
Along the way I looked up how to set the browser's download directory; see this post: https://www.cnblogs.com/royfans/p/9210463.html
file_path is the output directory.
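For reference, a minimal sketch of what that looks like (the prefs keys are standard Chrome preferences; swap the resulting driver in wherever webdriver.Chrome() appears below):

from selenium import webdriver

def make_driver(file_path):
    # route downloads into file_path and skip the save-as dialog
    options = webdriver.ChromeOptions()
    options.add_experimental_option('prefs', {
        'download.default_directory': file_path,
        'download.prompt_for_download': False,
    })
    return webdriver.Chrome(options=options)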
def getpubchem_info(uni, file_path):
    data_existed = os.listdir(file_path)
    # skip accessions whose bioactivity export has already been downloaded
    if uni not in re.findall(r"PROTACXN_([\w\W]*?)_pcget_protein_bioactivity", str(data_existed)):
        url = r"https://pubchem.ncbi.nlm.nih.gov/protein/{}#section=Tested-Compounds&fullscreen=true".format(uni)
        driver = webdriver.Chrome()
        driver.get(url)
        try:
            time.sleep(5)  # wait for the JS-rendered page to load
            driver.find_element_by_css_selector('#Download > span:nth-child(2)').click()
            driver.find_element_by_css_selector('a.button > span:nth-child(1)').click()
            print("uniprot: {} finished".format(uni))
            time.sleep(30)  # give the download time to complete
            driver.close()
        except Exception:
            print("uniprot: {} can not be found".format(uni))
            driver.close()
    else:
        print("uniprot: {} file already exists".format(uni))
Building the network and extracting information from it (using BindingDB as the example)
import os
import re
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

def get_uni_dict(path, dose):
    # build a list of {'uniprot': set(compound_ids)} dicts,
    # keeping records whose affinity passes the dose threshold (dose in uM, affinity values in nM)
    files = os.listdir(path)
    uni_dict = []
    for file in files:
        try:
            df = pd.read_csv(os.path.join(path, file))
            df['AFFINITY'] = pd.to_numeric(df['AFFINITY'], errors='coerce')  # affinities were scraped as text
            df = df.dropna()
            df = df[df['AFFINITY'] >= (dose * 1000)]
            uni_dict.append({np.array(df['UNIPROT'])[0]: set(df['MOL_ID'])})
            print('file:{} success'.format(file))
        except Exception:
            print('file:{} failed'.format(file))
    return uni_dict

def net_work_info():
    # build the target-compound graph and collect its statistics
    # keys: uniprots associated with at least one compound id
    # id_uni: list of dicts (key = compound id, value = list of uniprots)
    # id_uni_num: list of dicts (key = compound id, value = number of uniprots)
    keys = []
    values = []
    G = nx.Graph()
    id_uni = []
    id_uni_num = []
    num = []
    for diction in uni_dict:
        for (k, v) in diction.items():
            keys.append(k)
            G.add_nodes_from(v)
            G.add_node(k)  # add_node, not add_nodes_from: k is a single string
            for i in v:
                G.add_edge(k, i)
                values.append(i)
            print("{} network finished".format(k))
    relationship = list(nx.connected_components(G))
    values = list(set(values))
    for cb_id in values:
        neighbors = [n for n in G.neighbors(cb_id)]
        id_uni.append({cb_id: neighbors})
        id_uni_num.append({cb_id: len(neighbors)})
        num.append(len(neighbors))
    return keys, values, relationship, id_uni, id_uni_num, num

def num_info():
    # summarise the data distribution: how many compounds hit at least i targets
    num_list = []
    for i in range(1, 21):
        num_list.append({'more than {} targets'.format(i): len([n for n in num if n >= i])})
    all_uni = len(keys)
    all_mol = len(id_uni)
    return num_list, all_mol, all_uni

def distribution():
    # scatter plot of the distribution: compounds with at least i targets
    num_list = []
    labels_list = []
    for i in range(3, 15):
        num_list.append(len([n for n in num if n >= i]))
        labels_list.append(str(i))
    plt.scatter(labels_list, num_list)
    plt.show()

def save_file(path):
    os.chdir(path)
    max_len = max(num)
    for diction in id_uni:
        for (k, v) in diction.items():
            v.extend([None] * (max_len - len(v)))  # pad so every column has equal length
    for i in range(0, len(id_uni), 10000):
        diction = {}
        for dic in id_uni[i:i + 10000]:
            diction.update(dic)
        pd.DataFrame(diction).to_csv(os.path.join(path, '{}_{}.csv'.format(i, i + 10000)))
        print("{}-{} finished".format(i, i + 10000))

def append_uniall_info(in_path, out_path):
    # append the all-target information columns to every chunked CSV
    def iter_files(path):
        file_names = []
        for root, dirs, files in os.walk(path):
            for file in files:
                file_names.append(os.path.join(root, file))
        return file_names

    def get_info(path):
        dic, dic_num, smiles = {}, {}, {}
        for file in iter_files(path):
            df = pd.read_csv(file)
            df['ID'] = df['ID'].astype(str)
            dic.update(dict(zip(df['ID'], df['Uniprots'])))
            dic_num.update(dict(zip(df['ID'], df['Uniprots_num'])))
            smiles.update(dict(zip(df['ID'], df['Smiles'])))
        return dic, dic_num, smiles

    files = iter_files(out_path)
    dic, dic_num, smiles = get_info(in_path)
    fail = []
    for file in files:
        if re.findall(r'\d+_\d+\.csv', file):
            try:
                df = pd.read_csv(file)
                df.insert(2, 'Uniprots_all', df['ID'].apply(lambda x: dic[str(x)]))
                df.insert(4, 'Uniprots_num_all', df['ID'].apply(lambda x: dic_num[str(x)]))
                df.to_csv(file, index=False)  # file already carries its full path
                print(file + ' finished')
            except Exception as e:
                print(e)
                print(file)
                fail.append(file)
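These functions share state through module-level names (uni_dict, keys, num, id_uni, ...), so the call order matters. A possible end-to-end run (the paths and the 10 uM threshold are placeholders):

uni_dict = get_uni_dict(path="...", dose=10)  # e.g. a 10 uM threshold
keys, values, relationship, id_uni, id_uni_num, num = net_work_info()
num_list, all_mol, all_uni = num_info()
distribution()
save_file(path="...")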