Taxonomy
NCBI Taxonomy 信息
关联:NR、Swiss-Prot、GO、KEGG 已实现相互注释
用法
wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
tar -xzf taxdump.tar.gz
- 下载 taxdump.tar.gz
- 用法
- 从names.dmp中找到目标物种的taxid
- 从nodes.dmp中回溯其parent id直至顶端
- 利用names.dmp将parent id翻译为物种名
- 注意,NCBI中可能会有taxa名称相同、但taxid不同的情况,它们就是不同的taxa。
- 另,使用前,也可以删除一些无意义的字符,比如 ' " [] ()
Script1
将信息展开储存:选用names中储存学名的行,nodes的taxid、parent_id、tax_level列
import pandas as pd
import numpy as np
# ---- Script 1: configuration and lookup tables ----
# All paths are relative to the working directory.
output_file = 'taxid_anno.txt'
names_dir = 'names.dmp'
nodes_dir = 'nodes.dmp'

# One-letter prefix -> rank label, in the order the ranks are written out.
abbr_dict = dict(zip(
    ['d', 'p', 'c', 'o', 'f', 'g', 's'],
    ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
))

## Name classes found in names.dmp:
## {'genbank acronym', 'blast name', 'acronym', 'type material', 'common name', 'scientific name', 'synonym', 'genbank common name', 'authority', 'in-part', 'includes', 'equivalent name'}

# Only columns 0, 2 and 6 of the tab-split file are used (taxid, name text,
# name class); presumably the columns in between hold the '|' separators of
# the dump format -- TODO confirm against the taxdump readme.
names_df = pd.read_csv(
    names_dir,
    sep='\t',
    header=None,
    usecols=[0, 2, 6],        # skip parsing columns that are never used
    quoting=3,                # csv.QUOTE_NONE: names may contain literal " characters
    keep_default_na=False,    # keep names spelled like 'NA' / 'nan' as text, not NaN
)
names_df.columns = ['id', 'Name', 'Type']
# Keep only the rows whose name class mentions 'scientific'.
names_df = names_df[names_df['Type'].str.contains('scientific', regex=False)]
names_df.set_index('id', inplace=True)
id_name_dict = dict(zip(names_df.index.values, names_df['Name']))

# Columns 0, 2 and 4 of nodes.dmp: taxid, parent taxid, rank.
nodes_df = pd.read_csv(
    nodes_dir,
    sep='\t',
    header=None,
    usecols=[0, 2, 4],
    quoting=3,                # csv.QUOTE_NONE
    keep_default_na=False,
)
nodes_df.columns = ['id', 'parent_id', 'level']
nodes_df.set_index('id', inplace=True)
id_parent_dict = dict(zip(nodes_df.index.values, nodes_df['parent_id']))
id_level_dict = dict(zip(nodes_df.index.values, nodes_df['level']))
def retrieve_ids(taxid, fullid_lst, id_parent_dict):
    """Collect ``taxid`` and all of its ancestors, leaf first.

    The walk follows ``id_parent_dict`` until it reaches a node that is its
    own parent (the top of the tree).

    Args:
        taxid: id to start from.
        fullid_lst: list that the visited ids are appended to (mutated in
            place); pass ``[]`` for a fresh lineage.
        id_parent_dict: mapping of id -> parent id.

    Returns:
        ``fullid_lst`` itself, extended with the visited ids.

    Raises:
        KeyError: an id on the path is missing from ``id_parent_dict``.
        ValueError: the parent links form a cycle that never reaches a
            self-parented node.
    """
    # Iterative on purpose: the walk must not depend on the interpreter's
    # recursion limit, and a malformed (cyclic) table should fail clearly.
    visited = set()
    current = taxid
    while True:
        if current in visited:
            raise ValueError(
                'cycle in parent links while resolving taxid {}'.format(taxid)
            )
        visited.add(current)
        fullid_lst.append(current)
        parent = id_parent_dict[current]
        if parent == current:
            return fullid_lst
        current = parent
def retrieve_names(fullid_lst, id_name_dict, id_level_dict):
    """Map each rank found along a lineage to the name of the taxon at that rank.

    Args:
        fullid_lst: ids of one lineage, in the order they should be applied.
        id_name_dict: mapping of id -> name; ids without an entry are
            reported as 'Unknown'.
        id_level_dict: mapping of id -> rank label.

    Returns:
        dict of rank label -> name.

    Raises:
        KeyError: an id is missing from ``id_level_dict``.
    """
    # Ranks that occur more than once in a lineage (several 'no rank' nodes)
    # keep only the entry of the last id in the list -- e.g. 'root' when the
    # list ends at the top of the tree.
    return {
        id_level_dict[node_id]: id_name_dict.get(node_id, 'Unknown')
        for node_id in fullid_lst
    }
def abbr_names(level_name_dict, abbr_dict):
    """Render a rank -> name mapping as a ';'-separated lineage string.

    Args:
        level_name_dict: mapping of rank label -> taxon name.
        abbr_dict: mapping of prefix -> rank label; its iteration order is
            the order of the fields in the output.

    Returns:
        A string of ``<prefix>__<name>`` fields joined by ';'. Ranks that are
        absent from ``level_name_dict`` are written as ``<prefix>__No_Rank``.
    """
    fields = [
        prefix + '__' + level_name_dict.get(rank, 'No_Rank')
        for prefix, rank in abbr_dict.items()
    ]
    # The strip also removes any ';' at the very start or end of the result.
    return ';'.join(fields).strip(';')
# Write one "<taxid>\t<lineage string>" line for every id in nodes_df.
with open(output_file, 'w') as out_handle:
    for node_id in nodes_df.index.values:
        lineage_ids = retrieve_ids(node_id, [], id_parent_dict)
        rank_to_name = retrieve_names(lineage_ids, id_name_dict, id_level_dict)
        lineage_str = abbr_names(rank_to_name, abbr_dict)
        # print() supplies the tab separator and the trailing newline.
        print(node_id, lineage_str, sep='\t', file=out_handle)
Script2
本示例中,由于只关注界门纲目科属种(NCBI中的Domain其实是superkingdom),因此使用dict形式来返回结果,方便提取。若需要完整的NCBI taxonomy,改用list或者string返回即可。
另:uniqName_id_dict_eachlevel存储每个目标level中没有重复的taxa name,若名字重复,只能从names_df里找了,会慢一点。
import pandas as pd
import numpy as np
# ---- Script 2: configuration and lookup tables ----
# The two input files are produced beforehand on the shell:
#   grep 'scientific name' names.dmp > names.dmp.scientific
#   cut -f 1,3,5 nodes.dmp > nodes.dmp_simplified
names_dir = 'names.dmp.scientific'
nodes_dir = 'nodes.dmp_simplified'

# Ranks of interest and their one-letter prefixes, in output order.
target_level = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
target_level_abbrev = ['d', 'p', 'c', 'o', 'f', 'g', 's']
abbrev_dict = dict(zip(target_level, target_level_abbrev))

# Columns 0 and 2 of the tab-split names file: taxid and name text.
names_df = pd.read_csv(
    names_dir,
    sep='\t',
    header=None,
    usecols=[0, 2],           # skip parsing columns that are never used
    quoting=3,                # csv.QUOTE_NONE: names may contain literal " characters
    keep_default_na=False,    # keep names spelled like 'NA' / 'nan' as text, not NaN
)
names_df.columns = ['id', 'Name']
names_df.set_index('id', inplace=True)
id_names_dict = dict(zip(names_df.index.values, names_df['Name']))  # id -> name

# The simplified nodes file has exactly three columns: id, parent id, rank.
nodes_df = pd.read_csv(
    nodes_dir,
    sep='\t',
    header=None,
    quoting=3,                # csv.QUOTE_NONE
    keep_default_na=False,
)
nodes_df.columns = ['id', 'parent_id', 'level']
nodes_df.set_index('id', inplace=True)
id_parentid_dict = dict(zip(nodes_df.index.values, nodes_df['parent_id'].values))  # id -> parent id
id_level_dict = dict(zip(nodes_df.index.values, nodes_df['level'].values))  # id -> rank
# For every target rank, build a name -> id lookup that contains only the
# names occurring exactly once at that rank. Names shared by several taxa of
# the same rank are left out on purpose and must be resolved via names_df.
uniqName_id_dict_eachlevel = {}
for temp_lvl in target_level:
    at_level = (nodes_df['level'] == temp_lvl).values
    level_names = names_df.loc[nodes_df.index.values[at_level], 'Name']
    # Drop every name that appears more than once, and any missing name.
    is_singleton = level_names.notna().values & ~level_names.duplicated(keep=False).values
    singletons = level_names[is_singleton]
    uniqName_id_dict_eachlevel[temp_lvl] = dict(zip(singletons.values, singletons.index))
def find_tax(taxid,tax_dict,id_names_dict = id_names_dict, id_parentid_dict = id_parentid_dict,id_level_dict=id_level_dict):
    """Fill ``tax_dict`` with rank -> name for ``taxid`` and all its ancestors.

    The walk goes from ``taxid`` upwards until it reaches a node that is its
    own parent. Because the result is keyed by rank, ranks that occur several
    times in one lineage (for instance multiple 'no rank' layers) keep only
    the entry closest to the top.

    Args:
        taxid: id to start from.
        tax_dict: dict that is filled in place; pass ``{}`` for a fresh result.
        id_names_dict: mapping of id -> name.
        id_parentid_dict: mapping of id -> parent id.
        id_level_dict: mapping of id -> rank label.

    Returns:
        ``tax_dict`` itself.

    Raises:
        KeyError: an id on the path is missing from one of the mappings.
        ValueError: the parent links form a cycle that never reaches a
            self-parented node.
    """
    # Iterative so that caller-supplied mappings are used for the whole walk
    # (not just the first node) and the depth is not tied to the recursion limit.
    visited = set()
    current = taxid
    while True:
        if current in visited:
            raise ValueError(
                'cycle in parent links while resolving taxid {}'.format(taxid)
            )
        visited.add(current)
        # Resolve all three lookups before touching tax_dict, so a missing id
        # does not leave a half-written entry for that node.
        name = id_names_dict[current]
        level = id_level_dict[current]
        parent = id_parentid_dict[current]
        tax_dict[level] = name
        if parent == current:
            return tax_dict
        current = parent
def tax_dict2str(tax_dict,target_level = target_level):
    """Render a rank -> name mapping as a ';'-separated lineage string.

    Args:
        tax_dict: mapping of rank label -> taxon name.
        target_level: rank labels to emit, in output order.

    Returns:
        A string of ``<prefix>__<name>`` fields joined by ';', with prefixes
        taken from the module-level ``abbrev_dict``. A rank that is missing
        from ``tax_dict``, or whose value is not a string, is written as
        ``<prefix>__norank``.

    Raises:
        KeyError: a rank in ``target_level`` has no prefix in ``abbrev_dict``.
    """
    fields = []
    for lvl in target_level:
        name = tax_dict.get(lvl)
        # Explicit fallback instead of a bare ``except``: only the two cases
        # that used to end up as 'norank' are handled; anything else
        # (including KeyboardInterrupt) now propagates.
        if not isinstance(name, str):
            name = 'norank'
        fields.append(abbrev_dict[lvl] + '__' + name)
    # strip(';') is kept so the output matches earlier runs even when a name
    # itself ends with ';'.
    return ';'.join(fields).strip(';')
################### Example 1 #####
# Resolve a species name to its taxid, then to its full lineage string.
# The lookup table only holds names that are unique among species-rank taxa,
# so a duplicated or unknown name raises KeyError here.
e1_name = 'Dickeya phage phiDP10.3'
e1_taxid = uniqName_id_dict_eachlevel['species'][e1_name]
e1_taxdict = find_tax(taxid=e1_taxid, tax_dict=dict())
e1_taxfull = tax_dict2str(tax_dict=e1_taxdict)