NCBI如何批量下载数据

发布时间：2022-02-23 10:38:27 作者：小新
来源：亿速云阅读：664

这篇文章给大家分享的是有关NCBI如何批量下载数据的内容。小编觉得挺实用的，因此分享给大家做个参考，一起跟随小编过来看看吧。

NCBI批量搜索、下载序列

脚本代码：

from Bio import Entrez
import os,sys
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import SeqIO
import sys, os, argparse, os.path,re,math,time
'''
database:
['pubmed', 'protein', 'nucleotide', 'nuccore', 'nucgss', 'nucest',
'structure', 'genome', 'books', 'cancerchromosomes', 'cdd', 'gap',
'domains', 'gene', 'genomeprj', 'gensat', 'geo', 'gds', 'homologene',
'journals', 'mesh', 'ncbisearch', 'nlmcatalog', 'omia', 'omim', 'pmc',
'popset', 'probe', 'proteinclusters', 'pcassay', 'pccompound',
'pcsubstance', 'snp', 'taxonomy', 'toolkit', 'unigene', 'unists']
'''
parser = argparse.ArgumentParser(description='This script is used to fasta from ncbi ')
parser.add_argument('-t','--term',help='input search  term : https://www.ncbi.nlm.nih.gov/books/NBK3837/#_EntrezHelp_Entrez_Searching_Options_',required=True)
parser.add_argument('-d','--database',help='Please input database to search nucleotide or protein  default nucleotide',default = 'nucleotide',required=False)
parser.add_argument('-r','--rettype',help='return type fasta or gb default gb',default = "gb",required=False)
parser.add_argument('-o','--out_dir',help='Please input  out_put directory path',default = os.getcwd(),required=False)
parser.add_argument('-n','--name',default ='seq',required=False,help='Please specify the output, seq')
args = parser.parse_args()
dout=''

if os.path.exists(args.out_dir):
    dout=os.path.abspath(args.out_dir)
else:
    os.mkdir(args.out_dir)
    dout=os.path.abspath(args.out_dir)
output_handle = open(dout+'/'+args.name+'.%s'%args.rettype, "w")
Entrez.email = "huangls@biomics.com.cn"     # Always tell NCBI who you are
#handle = Entrez.efetch(db="nucleotide", id="EU490707", rettype="gb", retmode="text")
#print(handle.read())
handle = Entrez.esearch(db=args.database, term=args.term, idtype="acc")
record = Entrez.read(handle)

for i in record['IdList']:
    print i+'\n'
    handle = Entrez.efetch(db=args.database, id=i, rettype=args.rettype, retmode="text")
    #print(handle.read())
    record = SeqIO.read(handle, args.rettype)
    SeqIO.write(record, output_handle, args.rettype)

output_handle.close()

帮助文档：

 1python /share/work/huangls/piplines/01.script/search_NCBI.py -h
 2usage: search_NCBI.py [-h] -t TERM [-d DATABASE] [-r RETTYPE] [-o OUT_DIR]
 3                      [-n NAME]
 4This script is used to fasta from ncbi
 5optional arguments:
 6  -h, --help            show this help message and exit
 7  -t TERM, --term TERM  input search term : https://www.ncbi.nlm.nih.gov/books
 8                        /NBK3837/#_EntrezHelp_Entrez_Searching_Options_
 9  -d DATABASE, --database DATABASE
10                        Please input database to search nucleotide or protein
11                        default nucleotide
12  -r RETTYPE, --rettype RETTYPE
13                        return type fasta or gb default gb
14  -o OUT_DIR, --out_dir OUT_DIR
15                        Please input out_put directory path
16  -n NAME, --name NAME  Please specify the output, seq

使用说明：

先来看一个示例：

python search_NCBI.py -t "Polygonatum[Organism] AND chloroplast AND PsaA" -d protein -r fasta -n psaA

该命令是从NCBI的蛋白质数据库下载所有黄精属中叶绿体上的PsaA基因的蛋白序列，输出格式为fasta。

-t：后面跟的是搜索条件，用双引号引起来。我们可以用布尔运算符和索引构建器更精确查找内容。先来介绍下布尔运算符，布尔运算符提供了一种生成精确查询的方法，可以产生定义良好的结果集。布尔运算符主要有3个，分别是AND、OR和NOT。它们的工作原理如下：

NCBI如何批量下载数据

AND运算符是必须大写的，而OR和NOT不是必须的，但是建议三种运算符都用大写。

布尔运算符的运算顺序都是从左往右，例如：

promoters OR response elements NOT human AND mammals

表示查询除人类外的哺乳类动物中的promoters或response elements。而使用括号可以改变运算顺序，例如：

promoters OR response elements NOT （human OR mouse）AND mammals

表示查询除人类和老鼠外的哺乳类动物中的promoters或response elements。

"[ ]"里的内容是索引构建器，可以解释前面搜索词的类型，如示例中的[Organism]表示前面的Polygonatum是一个有机体。下面是一些其它示例：

NCBI如何批量下载数据

此外，还能进行范围的搜索，例如序列长度和发表日期。

NCBI如何批量下载数据

-d：后面跟搜索数据库，nucleotide 或 protein，默认 nucleotide。

-r：后面跟输出格式，fasta 或 gb（genbank），默认 gb。

-o：后面跟输出目录。

-n：后面跟输出文件名前缀。

从genbank提取序列

再给大家安利一个python程序，该程序可以根据提供的基因名列表，从genbank文件中提取基因组序列、有关基因的cds和蛋白序列、基因的位置信息，分别存放在 *.gb.genome.fa 、 *.gb.cds.fa 、 *.gb.pep.fa 、 *.gb.cds_location.txt 文件中。

脚本代码：

import sys, os, argparse, os.path ,glob
from Bio import SeqIO

parser = argparse.ArgumentParser(description='This script was used to get fa from genbank file; *.faa =pep file; *.ffn=cds file; *fna=genome fa file')
parser.add_argument('-i','--id',help='Please input gene list file',required=True)
parser.add_argument('-m','--in_dir',help='Please input  in_put directory path;default cwd',default = os.getcwd(),required=False)
parser.add_argument('-o','--out_dir',help='Please input  out_put directory path;default cwd',default = os.getcwd(),required=False)
args = parser.parse_args()
dout=''
din=''
if os.path.exists(args.in_dir):
    din=os.path.abspath(args.in_dir)
if os.path.exists(args.out_dir):
    dout=os.path.abspath(args.out_dir)
else:
    os.mkdir(args.out_dir)
    dout=os.path.abspath(args.out_dir)
args.id=os.path.abspath(args.id)
gene = {}
input = open(args.id, "r")
for line in input :
    line = line.strip()
    gene[line] = line
genbank=glob.glob(din+"/*gb")
for gdkfile in genbank :
    name = os.path.basename(gdkfile)
    input_handle  = open(gdkfile, "r")
    pep_file = dout+'/'+name+".pep.fa"
    genePEP = open(pep_file, "w")
    cds_file = dout+'/'+name+".cds.fa"
    geneCDS = open(cds_file, "w")
    gene_file = dout+'/'+name+".genome.fa"
    gene_handle = open(gene_file, "w")
    cds_locat_file = dout+'/'+name+".cds_location.txt"
    cds_locat_handle = open(cds_locat_file, "w")
    for seq_record in SeqIO.parse(input_handle, "genbank") :
        print "Dealing with GenBank record %s" % seq_record.id
        gene_handle.write(">%s %s\n%s\n" % (
            seq_record.id,
            seq_record.description,
            seq_record.seq))
        for seq_feature in seq_record.features :
            geneSeq = seq_feature.extract(seq_record.seq)
            if seq_feature.type=="CDS" :
                assert len(seq_feature.qualifiers['translation'])==1
                if gene.has_key(seq_feature.qualifiers['gene'][0]):
                    genePEP.write(">%s\n%s\n" % (
                        seq_feature.qualifiers['gene'][0],
                        #seq_record.name,
                        seq_feature.qualifiers['translation'][0]))
                    geneCDS.write(">%s\n%s\n" % (
                        seq_feature.qualifiers['gene'][0],
                        #seq_record.name,
                        geneSeq ))
                    cds_locat_handle.write(">%s location %s\n" % (
                        seq_feature.qualifiers['gene'][0],
                        seq_feature.location ))
    input_handle.close()
    genePEP.close()
    geneCDS.close()

帮助文档：

python /share/work/wangq/script/genbank/genbank.py 
usage: get_data_NCBI.py -i IDLIST -o OUT_DIR -m IN_DIR
optional arguments:
  -i IDLIST, --idlist IDLIST
                        Please gene name list file
  -m IN_DIR, --in_dir IN_DIR
                        Please input complete in_put directory path
  -o OUT_DIR, --out_dir OUT_DIR
                        Please input complete out_put directory path
例：python /share/work/wangq/script/genbank/genbank.py -i id.txt -m /share/nas1/wangq/work/NCBI_download -o /share/nas1/wangq/work/NCBI_download

注意：-m 后输入的是一目录，该目录下可以有多个 genbank 文件，程序会批量读取。-i 后跟需提取的基因名称列表，格式如下：

rpl2
psbA
ndhD
ndhF

genbank转gff3

最后一个脚本 bp_genbank2gff3.pl，此脚本可以根据 genbank 文件生成 gff3 文件，由Bioperl提供，安装并配置过 Bioperl 就可以直接使用。用法也很简单，bp_genbank2gff3.pl 后跟 genbank 文件就可以啦！

bp_genbank2gff3.pl  filename(s)

感谢各位的阅读！关于“NCBI如何批量下载数据”这篇文章就分享到这里了，希望以上内容可以对大家有一定的帮助，让大家可以学到更多知识，如果觉得文章不错，可以把它分享出去让更多的人看到吧！

NCBI如何批量下载数据

相关阅读