|||
解释:
此脚本的目的是寻找一些Cas9靶位点,他们要满足如下条件:
(1)只存在于某一条染色体上(修改后可变化),并且频数大于某一值。
(2)至少有一段靶位点在指定的区域。
返回的结果就是这些靶位点的序列,以及它对应的位置。
这个脚本是用作用cas9标记某一染色体的特定位置。
##############################################
import re
import random
import sys
#sys.argv[1] is "7qA1.txt" fasta file of chr7 qA1 seq
#sys.argv[2] is output filename
#
#read chr7
def readChrs(file):
Fchr = open(file,"r")
h = Fchr.readline()
chrom = h.strip()[1:]
l = Fchr.readline()
chrom = ""
print("Reading "+file+" .....")
while l != "":
chrom = chrom+l.strip()
l = Fchr.readline()
chrom = chrom.upper()
print("Read "+file+" finished")
return chrom
chr7 = readChrs("chr7.fa")
files = ["chr1.fa","chr2.fa","chr3.fa","chr4.fa","chr5.fa","chr6.fa",
"chr8.fa","chr9.fa","chr10.fa","chr11.fa","chr12.fa","chr13.fa",
"chr14.fa","chr15.fa","chr16.fa","chr17.fa","chr18.fa","chr19.fa","chrX.fa"]
nonchr7 =""
for F in files:
myS = readChrs(F)
nonchr7 = nonchr7+myS
#read chr7part
Part = open(sys.argv[1],"r")
h = Part.readline()
Pchr7 = ""
print("Reading YY .....")
l = Part.readline()
while l != "":
Pchr7 = Pchr7+l.strip()
l = Part.readline()
Pchr7 = Pchr7.upper()
print("Read YY finished")
#pattern
Pat1 = re.compile("[ACTG]{20}[ACTG]GG")
Pat2 = re.compile("CC[ACTG]{21}")
#serchRepeat
P1M = [(m.start(0),m.end(0)) for m in re.finditer(Pat1,Pchr7)]
P2M = [(m.start(0),m.end(0)) for m in re.finditer(Pat2,Pchr7)]
def myPattern(seq, strand):
myP = ""
if strand == "+":
myP = seq+"[ACTG]"+"GG"
else:
myP = "CC"+"[ACTG]"+seq
return myP
def searchPat(Pat,mySeq):
myL = [(m.start(0),m.end(0)) for m in re.finditer(Pat,mySeq)]
return myL
print("Length of P1M is "+str(len(P1M)))
myList = []
myMatchList = []
i = 1
for myM in P1M:
i = i+1
seq = Pchr7[(myM[0]):(myM[1]-3)]
seqs = seq+"+"
if seqs not in myList:
myList.append(seqs)
myP1 = myPattern(seq,"+")
myP2 = myPattern(seq,"-")
K1 = searchPat(myP1,chr7)
K2 = searchPat(myP2,chr7)
K = K1+K2
if len(K) > 6:
aK1 = re.search(myP1,nonchr7)
aK2 = re.search(myP2,nonchr7)
if aK1 == None and aK2 == None:
print(str(i)+"\t"+seqs)
l = []
for kl in K:
l.append(str(kl[0])+":"+str(kl[1]))
print l
myMatchList.append(seqs+"\t"+",".join(l))
print("Length of P1M is "+str(len(P2M)))
i = 1
for myM in P2M:
i = i+1
seq = Pchr7[(myM[0]+3):(myM[1])]
seqs = seq+"-"
if seqs not in myList:
myList.append(seqs)
myP1 = myPattern(seq,"-")
myP2 = myPattern(seq,"+")
K1 = searchPat(myP1,chr7)
K2 = searchPat(myP2,chr7)
K = K1+K2
if len(K) > 6:
aK1 = re.search(myP1,nonchr7)
aK2 = re.search(myP2,nonchr7)
if aK1 == None and aK2 == None:
print(str(i)+"\t"+seqs)
l = []
for kl in K:
l.append(str(kl[0])+":"+str(kl[1]))
print l
myMatchList.append(seqs+"\t"+",".join(l))
Fout = open(sys.argv[2],"w")
for l in myMatchList:
Fout.write(l)
##############################################
结果大体如下(从满足条件的靶位点中找到最佳的,设计gRNA):
1084 TCCTTAAAACTGCCTCCCTC+ ['3050687:3050710', '20084612:20084635', '20754617:20754640', '20971101:20971124', '21232498:21232521', '21597539:21597562', '21723276:21723299', '22080314:22080337', '22743730:22743753', '22745128:22745151', '22946475:22946498']
7435 TGCCTATAGTAGTAGCACAC+ ['3344756:3344779', '3344840:3344863', '3344868:3344891', '3344924:3344947', '3344952:3344975', '3344980:3345003', '3345319:3345342']
7445 TGCCTATAGTAGCAGCACAC+
Archiver|手机版|科学网 ( 京ICP备07017567号-12 )
GMT+8, 2024-12-4 20:51
Powered by ScienceNet.cn
Copyright © 2007- 中国科学报社