||
多次在我们的各种单细胞交流群看到了小伙伴们提问:审稿人希望大家把单细胞数据挖掘代码上传到github。其实这是一个很好的趋势,促进单细胞数据挖掘的可复现,虽然说对初学者要求可能会有点高。但是高标准严要求是能促进领域的进步和正规性。
而且不得不说,github在生物信息学领域的重要性,之前我们介绍过代码海洋,详见:《代码海洋-你想模仿的这里都有啊》,比如《单细胞天地》公众号分享的文献:单细胞转录组揭示肺腺癌特有的肿瘤微环境,就是配全部的代码。也有专门的github收集整理的更加齐全,而且还分门别类整理好了,详见:https://github.com/genecell/single-cell-papers-with-code
最近就看到了很简单的数据挖掘《Cellular hierarchy framework based on single-cell/multi-patient sample sequencing reveals metabolic biomarker PYGL as a therapeutic target for HNSCC》,可以作为一个很好的案例。大家以后可以参考这个:https://link.springer.com/article/10.1186/s13046-023-02734-w
这个数据挖掘作者把代码简单的整理到了,https://github.com/Mcdull8/METArisk/blob/main/METArisk_code.R
很简单的数据挖掘
这个文章的单细胞数据挖掘部分其实就是降维聚类分群后,有一个简单的生物学功能通路富集而已。
这样的单细胞转录组数据分析的标准降维聚类分群,并且进行生物学注释后的结果。可以参考前面的例子:人人都能学会的单细胞聚类分群注释 ,我们演示了第一层次的分群。如果你对单细胞数据分析还没有基础认知,可以看基础10讲:
这里我们简单的看看作者的代码里面的生物学亚群命名即可,可以看到其实是需要一个外部文件:./annotation/annotation_Mali.xlsx
# Cell Classification----
DefaultAssay(sce) <- "RNA"
all.markers <- FindAllMarkers(sce,
only.pos = TRUE,
min.pct = 0.1,
logfc.threshold = 0.25)
significant.markers <- all.markers[all.markers$p_val_adj < 0.05, ]
Idents(sce) <- sce$RNA_snn_res.1
annotation_curated_main <- read_excel("./annotation/annotation_main.xlsx")
new_ids_main <- annotation_curated_main$Main_cell_type
names(new_ids_main) <- levels(sce)
sce <- RenameIdents(sce, new_ids_main)
sce <- subset(sce,idents = "Other",invert = TRUE)
levels(sce) <- c("Malignant",
"Fibroblast",
"Endothelial",
"T",
"B_Plasma",
"Monocytic",
"Mast")
sce@meta.data$Main_cell_type <- Idents(sce)
这个其实跟我比较类似了,我喜欢肉眼看自己收集整理好的基因列表去人工给名字,比如前面我们系统性梳理了各种器官的上皮细胞的细分亚群,以及其对应的标记基因列表:
因为作者给出来了代码,所以我们可以很清晰的看到了作者单细胞数据挖掘里面的对肿瘤单细胞转录组数据处理的一些瑕疵,比如前面的亚群命名后,这里作者默认全部的上皮细胞都是恶性细胞,其实是有待商榷。
另外就是, 作者里面的真的每个个体的恶性肿瘤细胞,然后继续整合后降维聚类分群,这个整合步骤意思有问题的。
# Malignant Subset & METArisk----
Mali=sce[,sce$Main_cell_type %in% c('Malignant')]
Malilist <- SplitObject(Mali, split.by = "orig.ident")
Malilist
......
......
Idents(Mali) <- Mali$RNA_snn_res.0.8
annotation_curated_main <- read_excel("./annotation/annotation_Mali.xlsx")
new_ids_main <- annotation_curated_main$Cluster
names(new_ids_main) <- levels(Mali)
Mali <- RenameIdents(Mali, new_ids_main)
levels(Mali) <- c(paste0(rep("PrimaryC",5),1:5),
paste0(rep("MixedC",2),1:2),
paste0(rep("MetastaticC",2),1:2))
Mali@meta.data$Cluster <- Idents(Mali)
这个时候作者很有意思,前面的恶性肿瘤细胞按照病人样品整合了,所以接下来降维聚类分群后就需要结合前面的样品分组熟悉来做Pseudobulk,这个是亮点。
# Pseudobulk
Cell <- rep("Malignant",nrow(Mali@meta.data))
Mali$Cell <- Cell
DE <- run_de(Mali,replicate_col = "orig.ident", cell_type_col = "Cell", label_col = "Group",
de_family = "pseudobulk",
de_method = "edgeR")
logFC_cutoff <- 0.25
DE$change <- as.factor(ifelse(DE$p_val < 0.05 & abs(DE$avg_logFC) > logFC_cutoff,
ifelse(DE$avg_logFC > logFC_cutoff ,'UP','DOWN'),'NOT'))
DE$symbol <- DE$gene
s2e <- bitr(DE$symbol,
fromType = "SYMBOL",
toType = "ENTREZID",
OrgDb = org.Hs.eg.db)
DE <- inner_join(DE,s2e,by=c("symbol"="SYMBOL"))
# Functional Analysis
gene_up <- DE[DE$change %in% "UP","symbol"]
gene_down <- DE[DE$change %in% "DOWN","symbol"]
gene_up <- as.character(na.omit(AnnotationDbi::select(org.Hs.eg.db,keys = gene_up,columns = 'ENTREZID',keytype = 'SYMBOL')[,2]))
gene_down <- as.character(na.omit(AnnotationDbi::select(org.Hs.eg.db,keys = gene_down,columns = 'ENTREZID',keytype = 'SYMBOL')[,2]))
run_kegg <- function(gene_up,gene_down,geneList=F,pro='test'){
gene_up=unique(gene_up)
gene_down=unique(gene_down)
kk.up <- enrichKEGG(gene = gene_up,
organism = 'hsa',
#universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff =0.9)
kk=kk.up
kk=DOSE::setReadable(kk, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
write.csv(kk@result,paste0(pro,'_kk.up.csv'))
kk.down <- enrichKEGG(gene = gene_down,
organism = 'hsa',
#universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff =0.9)
head(kk.down)[,1:6]
kk=kk.down
dotplot(kk)
kk=DOSE::setReadable(kk, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
write.csv(kk@result,paste0(pro,'_kk.down.csv'))
}
run_kegg(gene_up,gene_down,pro=paste0("./Functional_Analysis/KEGG"))
Keggall <- read.gmt("./kegghsa.gmt") # download from GSEA database
Keggmeta <- filter(Keggall,Keggall$term %in% c("hsa01100_Metabolic_pathways")|str_detect(Keggall$term,"metabolism$"))
head(KEGG)
head(Keggmeta)
gene <- unique(as.vector(DE$symbol))
gene <- bitr(gene, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
gene_df <- data.frame(logFC= DE$avg_log2FC,
SYMBOL = DE$gene)
gene_df <- merge(gene_df,gene,by="SYMBOL")
geneList <- gene_df$logFC
names(geneList) <- gene_df$SYMBOL
geneList <- sort(geneList, decreasing = TRUE)
gsea <- GSEA(geneList,TERM2GENE = META_ACTIVATE)
这个都是常规代码了:
# AUCell
geneSets <- lapply(unique(META_ACTIVATE$term), function(x){Hallmarker$gene[META_ACTIVATE$term == x]})
names(geneSets) <- unique(META_ACTIVATE$term)
names(geneSets) <- "META_ACTIVATE"
exprMatrix <- as.matrix(Mali@assays$RNA@data)
cells_rankings <- AUCell_buildRankings(exprMatrix,splitByBlocks=TRUE)
cells_AUC <- AUCell_calcAUC(geneSets, cells_rankings, aucMaxRank=nrow(cells_rankings)*0.1)
cells_assignment <- AUCell_exploreThresholds(cells_AUC, plotHist=T, assign=TRUE)
warningMsg <- sapply(cells_assignment, function(x) x$aucThr$comment)
warningMsg[which(warningMsg =="")]
sapply(cells_assignment, function(x) x$aucThr$selected)
selectedThresholds <- getThresholdSelected(cells_assignment)
cellsAssigned <- lapply(cells_assignment, function(x) x$assignment)
assignmentTable <- reshape2::melt(cellsAssigned, value.name="cell")
colnames(assignmentTable)[2] <- "geneSet"
head(assignmentTable)
assignmentMat <- table(assignmentTable[,"geneSet"], assignmentTable[,"cell"])
selectedThresholds <- getThresholdSelected(cells_assignment)
dat <- data.frame(Mali@meta.data, Mali@reductions$umap@cell.embeddings)
umap <- data.frame(Mali@reductions$umap@cell.embeddings)
umap <- as.matrix(umap)
plot(umap, cex=.3)
assignmentTable[1:4,]
for(geneSetName in names(selectedThresholds)){
colorPal_Neg <- grDevices::colorRampPalette(c("black","blue", "skyblue"))(5)
colorPal_Pos <- grDevices::colorRampPalette(c("pink", "magenta", "red"))(5)
passThreshold <- getAUC(cells_AUC)[geneSetName,] > selectedThresholds[geneSetName]
if(sum(passThreshold) >0 ) {
aucSplit <- split(getAUC(cells_AUC)[geneSetName,], passThreshold)
cellColor <- c(setNames(colorPal_Neg[cut(aucSplit[[1]], breaks=5)], names(aucSplit[[1]])),
setNames(colorPal_Pos[cut(aucSplit[[2]], breaks=5)], names(aucSplit[[2]])))
plot(umap, main=geneSetName,
sub="Pink/red cells pass the threshold",
col=cellColor[rownames(umap)], pch=16)
}
}
par(mfrow=c(1,1))
AUCell_plotTSNE(tSNE=umap, exprMat=exprMatrix,
cellsAUC=cells_AUC[c(1),], thresholds=selectedThresholds[c(1)])
geneSet <- "META_ACTIVATE"
aucs <- as.numeric(getAUC(cells_AUC)[geneSet, ])
Mali$AUC <- aucs
par(mfrow=c(1,1))
umap <- data.frame(Mali@meta.data, Mali@reductions$umap@cell.embeddings)
后面还有Cibersortx以及预后模型构建,药物预测后差异情况展现,这个数据挖掘作者把代码简单的整理到了,https://github.com/Mcdull8/METArisk/blob/main/METArisk_code.R
借花献佛给大家。
Git和GitHub在生物信息学中有许多优点,包括但不限于以下几点:
总的来说,Git和GitHub是生物信息学中不可或缺的工具,它们可以帮助你更有效地管理和共享你的工作。
GitHub在生物信息学领域中的重要性主要体现在以下几个方面:
Archiver|手机版|科学网 ( 京ICP备07017567号-12 )
GMT+8, 2024-12-27 08:04
Powered by ScienceNet.cn
Copyright © 2007- 中国科学报社