-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLigPred_lvl5.Rmd
613 lines (524 loc) · 30.6 KB
/
LigPred_lvl5.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
---
title: "Differentiating ligand-perturbed transcriptomes <br> by training on all cell types"
---
```{r setup, echo=FALSE, message=FALSE, warning=FALSE}
library(cmapR)
library(ranger)
library(colorspace)
library(kableExtra)
```
```{r load_data_lvl5, message=FALSE, warning=FALSE, include=FALSE}
if (exists("lvl5_data")) {
} else if (file.exists("~/Dropbox/GDB_archive/CMapCorr_files/lvl5_inputs.RData")) {
load("~/Dropbox/GDB_archive/CMapCorr_files/lvl5_inputs.RData")
} else {
source("lvl5_inputs.R")
}
```
Since the replicate-aggregated Z-scores incorporate both ligand-treated and untreated transcriptomes into a single difference measure, testing for the ability to distinguish between treatment and control is not possible in this data. Instead, the random forest model will be trained to distinguish changes in transcriptome caused by different ligand treatments across a mix of all cell lines. This is done by training the model on a random sample of all 15 ligand treatments across all 14 cell types, with ligand labels provided. The distribution of training and test data is shown in the table below.
[Random forest model source code](https://github.com/BaderLab/Brendan_CCInxPred/blob/master/200427_lvl5_mixall.R)
```{r RF_unbalanced,echo=FALSE,message=FALSE,warning=FALSE,fig.height=7,fig.width=7,fig.show='hold'}
if (file.exists("~/Dropbox/GDB_archive/CMapCorr_files/200427_lvl5_mixall.RData")) {
temp <- load("~/Dropbox/GDB_archive/CMapCorr_files/200427_lvl5_mixall.RData")
} else {
source("200427_lvl5_mixall.R")
}
tempdf <- as.data.frame(
rbind(training=table(lvl5_data@cdesc[trainIDs,"pert_iname"]),
testing=table(lvl5_data@cdesc[testIDs,"pert_iname"]),
total=table(lvl5_data@cdesc[c(trainIDs,testIDs),"pert_iname"]))
)
kable(tempdf,format="html") %>%
kable_styling()
temp_confusion <- table(true=lvl5_data@cdesc[testIDs,"pert_iname"],
predicted=rfresults$predictions)
temp_acc <- sweep(temp_confusion,1,rowSums(temp_confusion),"/")
par(mar=c(1,5,5,1),mgp=2:0,las=2)
image(z=t(temp_acc * 100)[,seq(nrow(temp_acc),1)],
x=1:nrow(temp_acc),y=1:ncol(temp_acc),
col=sequential_hcl(100,palette="inferno",rev=T),breaks=0:100,
xaxt="n",yaxt="n",xlab=NA,ylab=NA)
mtext(rev(colnames(temp_acc)),side=2,at=1:nrow(temp_acc),adj=1.1)
mtext("Truth",side=2,line=3.5,at=((nrow(temp_acc)-1)/2)+1,las=0,font=2,cex=1.5)
mtext(rownames(temp_acc),side=3,at=1:nrow(temp_acc),adj=-0.1)
mtext("Prediction",side=3,line=3.5,at=((nrow(temp_acc)-1)/2)+1,las=0,font=2,cex=1.5)
text(x=seq(1,nrow(temp_acc)),y=seq(nrow(temp_acc),1),
labels=round(diag(temp_acc),2),font=2,col="dodgerblue")
acc_unbalanced <- diag(temp_acc)
rm(list=temp)
rm(list=grep("^temp",ls(),value=T))
```
**Average accuracy was `r paste0(round(mean(acc_unbalanced) * 100,2),"%")`.** This is perhaps unexpectedly bad, given that the model is seeing all the data, not trying to extrapolate to withheld cell lines.
Clearly the fact that EGF was tested more than the other ligands caused an imbalance in the training data, leading to a bias towards classifying samples as EGF-treated.
# Balanced training data
To address the balancing issue, the training set was adjusted to ensure equal sampling of all ligands (leaving the remainder as the *unbalanced* test set). The distribution of training and test data is shown in the table below.
[Random forest model source code](https://github.com/BaderLab/Brendan_CCInxPred/blob/master/200427_lvl5_mixall.R)
```{r RF_balanced,echo=FALSE,message=FALSE,warning=FALSE,fig.height=7,fig.width=9,fig.show='hold'}
if (file.exists("~/Dropbox/GDB_archive/CMapCorr_files/200427_lvl5_mixall_balanced.RData")) {
temp <- load("~/Dropbox/GDB_archive/CMapCorr_files/200427_lvl5_mixall_balanced.RData")
} else {
source("200427_lvl5_mixall.R")
}
tempdf <- as.data.frame(
rbind(training=table(lvl5_data@cdesc[trainIDs,"pert_iname"]),
testing=table(lvl5_data@cdesc[testIDs,"pert_iname"]),
total=table(lvl5_data@cdesc[c(trainIDs,testIDs),"pert_iname"]))
)
kable(tempdf,format="html") %>%
kable_styling()
temp_confusion <- table(true=lvl5_data@cdesc[testIDs,"pert_iname"],
predicted=rfresults$predictions)
temp_acc <- sweep(temp_confusion,1,rowSums(temp_confusion),"/")
layout(rbind(1:2),widths=c(7,2))
par(mar=c(1,5,5,1),mgp=2:0,las=2)
image(z=t(temp_acc * 100)[,seq(nrow(temp_acc),1)],
x=1:nrow(temp_acc),y=1:ncol(temp_acc),
col=sequential_hcl(100,palette="inferno",rev=T),breaks=0:100,
xaxt="n",yaxt="n",xlab=NA,ylab=NA)
mtext(rev(colnames(temp_acc)),side=2,at=1:nrow(temp_acc),adj=1.1)
mtext("Truth",side=2,line=3.5,at=((nrow(temp_acc)-1)/2)+1,las=0,font=2,cex=1.5)
mtext(rownames(temp_acc),side=3,at=1:nrow(temp_acc),adj=-0.1)
mtext("Prediction",side=3,line=3.5,at=((nrow(temp_acc)-1)/2)+1,las=0,font=2,cex=1.5)
text(x=seq(1,nrow(temp_acc)),y=seq(nrow(temp_acc),1),
labels=round(diag(temp_acc),2),font=2,col="dodgerblue")
acc_balanced <- diag(temp_acc)
par(mar=c(1,1,5,3),mgp=2:0,las=0,bty="n")
boxplot(list(Unbalanced=acc_unbalanced,
Balanced=acc_balanced),
ylim=c(0,1),names=NA,xaxt="n",yaxs="i",yaxt="n")
axis(side=4); mtext("Accuracy",side=4,line=2,at=0.5)
mtext(c("Unbalanced","Balanced"),side=3,at=c(1,2),las=2,font=2)
#rm(list=temp)
rm(list=grep("^temp",ls(),value=T))
```
**Average accuracy was `r paste0(round(mean(acc_balanced) * 100,2),"%")`.** This is still worse than one might expect. Given that this is serving as the positive control for the use of the level 5 data, I'd like to explore why its working well for a few ligands, and poorly for the rest.
# Data saturation
[Random forest model source code](https://github.com/BaderLab/Brendan_CCInxPred/blob/master/200427_lvl5_mixall.R)
```{r RF_saturated,echo=FALSE,message=FALSE,warning=FALSE,fig.height=3,fig.width=7,fig.show='hold'}
if (file.exists("~/Dropbox/GDB_archive/CMapCorr_files/200427_lvl5_mixall_balanced_saturated.RData")) {
temp <- load("~/Dropbox/GDB_archive/CMapCorr_files/200427_lvl5_mixall_balanced_saturated.RData")
} else {
source("200427_lvl5_mixall.R")
}
temp_confusion <- list()
temp_acc <- list()
for (N in seq_along(rfresults)) {
temp_confusion[[N]] <- table(true=lvl5_data@cdesc[testIDs[[N]],"pert_iname"],
predicted=rfresults[[N]]$predictions)
temp_acc[[N]] <- sweep(temp_confusion[[N]],1,rowSums(temp_confusion[[N]]),"/")
}
acc_saturating <- sapply(temp_acc,diag)
par(mar=c(3,3,1,1),mgp=2:0)
boxplot(acc_saturating,pch=".",
ylab="Accuracy per ligand",
xlab="Number of training samples per ligand")
points(1:ncol(acc_saturating),colMeans(acc_saturating),
type="l",lwd=2,col="red")
abline(h=max(colMeans(acc_saturating)),lty=2,col="red")
legend("topleft",legend="Mean accuracy",
bty="n",lwd=2,col="red",cex=0.8)
rm(list=temp)
rm(list=grep("^temp",ls(),value=T))
```
Training the random forest model was repeated with increasing data, from one sample per ligand (randomly sampled from the 14 cell types) to 98 samples per ligand (all but one sample for most ligands, except EGF which has 195 samples).
Improvements in accuracy from increased training data level off after half the data is used, as was done in the previous models.
# Ligand-specific differences in accuracy
Below are UMAP projections of all cells used in these models, coloured by cell type with each ligand treatment highlighted. The ligands are ordered by the accuracy of their prediction, and plots are outlined per the colour scheme above.
```{r UMAP_ct,echo=FALSE,message=FALSE,warning=FALSE,fig.height=9,fig.width=9,fig.show='hold'}
if (exists("lvl5_umap_lig16")) {
temp <- NULL
} else if (file.exists("~/Dropbox/GDB_archive/CMapCorr_files/200701_lvl5_umap_lig16.RData")) {
load("~/Dropbox/GDB_archive/CMapCorr_files/200701_lvl5_umap_lig16.RData")
} else {
source("200701_lvl5_umap.R")
}
temp_allcol <- as.factor(lvl5_data@cdesc[rownames(lvl5_umap_lig16$layout),"cell_id"])
temp_boxcol <- cut(acc_balanced * 100, breaks=0:100,labels=F)
names(temp_boxcol) <- names(acc_balanced)
par(mfrow=c(4,4),mar=c(1,1,1,0.5),mgp=c(0,0,0))
for (L in names(sort(acc_balanced,decreasing=T))) {
plot(lvl5_umap_lig16$layout,pch=".",cex=2,
xaxt="n",yaxt="n",xlab="UMAP1",ylab="UMAP2",
main=paste0(L," (",round(acc_balanced[L] * 100,1),"%)"),
col=qualitative_hcl(length(levels(temp_allcol)),
palette="dark3",alpha=.7)[temp_allcol])
temp_col <- temp_allcol[lvl5_data@cdesc[rownames(lvl5_umap_lig16$layout),"pert_iname"] == L]
points(lvl5_umap_lig16$layout[lvl5_data@cdesc[rownames(lvl5_umap_lig16$layout),"pert_iname"] == L,],
pch=19,col=qualitative_hcl(length(levels(temp_col)),
palette="dark3",alpha=1)[temp_col])
box(col=sequential_hcl(100,palette="inferno",rev=T)[temp_boxcol[L]])
}
rm(list=c("L",grep("^temp",ls(),value=T)))
```
It is clear from this view that the model learns the transcriptional changes caused by the ligand well when those changes are consistent across a majority of cell lines, as with TNF and IFNG.
Are there specific genes marking these changes? Here are the mean and standard deviations of z-scores per gene for each ligand treatment across all cell lines. *Note that this only included measured "landmark" genes.* Genes with mean Z-score magnitudes larger than the standard deviation of their Z-scores are highlighted.
```{r lig_zscores,echo=FALSE,message=FALSE,warning=FALSE,fig.height=5,fig.width=7,fig.show='hold'}
temp_mean <- sapply(names(sort(acc_balanced,decreasing=T)),function(L)
rowMeans(lvl5_data@mat[,lvl5_data@cdesc$pert_iname == L]),
simplify=T)
temp_sd <- sapply(names(sort(acc_balanced,decreasing=T)),function(L)
apply(lvl5_data@mat[,lvl5_data@cdesc$pert_iname == L],1,sd),
simplify=T)
temp_meansd <- sapply(colnames(temp_mean),function(X) abs(temp_mean[,X]) > temp_sd[,X])
temp_col <- matrix(cut(abs(temp_meansd),breaks=100,labels=F),
nrow=nrow(temp_meansd),ncol=ncol(temp_meansd),
dimnames=list(rownames(temp_meansd),
colnames(temp_meansd)))
temp_ct <- as.factor(sapply(colnames(temp_mean),rep,times=nrow(temp_mean)))
par(mar=c(3,3,2,1),mgp=2:0)
plot(NA,NA,xlim=range(temp_mean),ylim=range(temp_sd),
xlab="Mean Z-score",ylab="SD of Z-scores",
main="Gene expression Z-scores per ligand treatment across all cell types")
points(as.vector(temp_mean)[!as.vector(temp_meansd)],
as.vector(temp_sd)[!as.vector(temp_meansd)],
pch=".",cex=2,
col=qualitative_hcl(ncol(temp_mean),
palette="dark3",
alpha=0.5)[temp_ct[!as.vector(temp_meansd)]])
points(as.vector(temp_mean)[as.vector(temp_meansd)],
as.vector(temp_sd)[as.vector(temp_meansd)],
pch=20,col=qualitative_hcl(ncol(temp_mean),
palette="dark3")[temp_ct[as.vector(temp_meansd)]])
legend("bottomright",bty="n",pch=20,
title="Mean / SD > 1",
legend=unique(temp_ct[as.vector(temp_meansd)]),
col=qualitative_hcl(ncol(temp_mean),
palette="dark3")[unique(temp_ct[as.vector(temp_meansd)])])
text(scClustViz::spreadLabels2(
as.vector(temp_mean)[as.vector(temp_meansd)],
as.vector(temp_sd)[as.vector(temp_meansd)],
lvl5_data@rdesc[rep(rownames(temp_sd),ncol(temp_sd))[as.vector(temp_meansd)],"pr_gene_symbol"],
str.cex=0.8),cex=0.8,
labels=lvl5_data@rdesc[rep(rownames(temp_sd),ncol(temp_sd))[as.vector(temp_meansd)],"pr_gene_symbol"],
col=qualitative_hcl(ncol(temp_sd),palette="dark3")[temp_ct[as.vector(temp_meansd)]])
# par(mfrow=c(5,3),mar=c(3,3,1,1),mgp=2:0)
# for (L in colnames(temp_mean)) {
# plot(temp_mean[,L],temp_sd[,L],pch=20,
# xlab="mean",ylab="sd",main=L,
# col=sequential_hcl(100,palette="inferno",rev=T)[temp_col[,L]])
# temp_top <- rownames(temp_col)[which.max(temp_col[,L])]
# text(temp_mean[temp_top,L],temp_sd[temp_top,L],
# labels=lvl5_data@rdesc[temp_top,"pr_gene_symbol"],
# col="dodgerblue",pos=2)
# }
rm(list=grep("^temp",ls(),value=T))
```
Given that nearly all highlighted genes are from the two most predictable ligand treatments, it seems that the model performs best when the ligand-induced changes in gene expression are common across all cell lines.
The following figure shows classification accuracy per cell (top row, darker indicates higher frequency of correct classification) and Z-scores of a selection of genes that strongly respond to the indicated ligand (from the above figure).
```{r RF_leave1out,echo=FALSE,message=FALSE,warning=FALSE}
if (file.exists("~/Dropbox/GDB_archive/CMapCorr_files/200524_lvl5_mixall_balanced_leave1out_results.RData")) {
temp <- load("~/Dropbox/GDB_archive/CMapCorr_files/200524_lvl5_mixall_balanced_leave1out_results.RData")
} else {
source("200524_lvl5_mixall_leave1out.R")
}
test_results <- data.frame(
IDs=as.vector(testIDs),
Labels=as.vector(t(sapply(rfresults,function(X) X$predictions))),
stringsAsFactors=F
)
test_hits <- vector("list",sum(lvl5_data@cdesc$pert_iname %in% lig16))
names(test_hits) <- rownames(lvl5_data@cdesc)[lvl5_data@cdesc$pert_iname %in% lig16]
for (X in 1:nrow(test_results)) {
test_hits[[test_results[X,"IDs"]]] <- append(test_hits[[test_results[X,"IDs"]]],
test_results[X,"Labels"])
}
acc_leave1out <- sapply(names(test_hits),function(X)
sum(test_hits[[X]] == lvl5_data@cdesc[X,"pert_iname"]) /
length(test_hits[[X]]))
acc_leave1out_totals <- sapply(lig16,function(X) {
Y <- rownames(lvl5_data@cdesc)[lvl5_data@cdesc$pert_iname == X]
return(sum(unlist(test_hits[Y]) == X) /
length(unlist(test_hits[Y])))
})
acc_leave1out_ct <- sapply(lig16,function(X) {
sapply(ct14,function(Y) {
Z <- rownames(lvl5_data@cdesc)[lvl5_data@cdesc$pert_iname == X &
lvl5_data@cdesc$cell_id == Y]
return(sum(unlist(test_hits[Z]) == X) /
length(unlist(test_hits[Z])))
})
})
test_counts <- sapply(lig16,function(X) {
sapply(ct14,function(Y) {
sum(lvl5_data@cdesc$pert_iname == X &
lvl5_data@cdesc$cell_id == Y)
})
})
test_predictions <- sapply(lig16,function(X) {
sapply(ct14,function(Y) {
temp <- rownames(lvl5_data@cdesc)[lvl5_data@cdesc$pert_iname == X &
lvl5_data@cdesc$cell_id == Y]
return(table(factor(unlist(test_hits[temp]),levels=lig16)) /
length(unlist(test_hits[temp])))
})
},simplify=F)
rm(X)
```
```{r UMAP_marker_gene,echo=FALSE,message=FALSE,warning=FALSE,fig.height=8.5,fig.width=9,fig.show='hold'}
if (exists("lvl5_umap_lig16")) {
temp <- NULL
} else if (file.exists("~/Dropbox/GDB_archive/CMapCorr_files/200701_lvl5_umap_lig16.RData")) {
load("~/Dropbox/GDB_archive/CMapCorr_files/200701_lvl5_umap_lig16.RData")
} else {
source("200701_lvl5_umap.R")
}
temp_allcol <- cut(acc_leave1out * 100,breaks=0:100,labels=F,include.lowest=T)
names(temp_allcol) <- names(acc_leave1out)
temp_allcol <- temp_allcol[rownames(lvl5_umap_lig16$layout)]
layout(rbind(1:4,matrix(5:12,ncol=4),matrix(13:20,ncol=4)),
heights=c(3,3,1,3,1))
par(mar=c(1.5,1,1,0.5),mgp=c(0,0,0))
for (L in c("TNF","IFNG","BTC","TGFA")) {
plot(NA,NA,xlim=range(lvl5_umap_lig16$layout[,1]),ylim=range(lvl5_umap_lig16$layout[,2]),
xaxt="n",yaxt="n",xlab="UMAP1",ylab="UMAP2",
main=paste0(L," (",round(acc_leave1out_totals[L] * 100,1),"%)"))
points(lvl5_umap_lig16$layout[lvl5_data@cdesc[rownames(lvl5_umap_lig16$layout),"pert_iname"] != L,],
pch=".",cex=2,col=sequential_hcl(100,palette="inferno",rev=T,alpha=0.7)[temp_allcol])
temp_col <- temp_allcol[lvl5_data@cdesc[names(temp_allcol),"pert_iname"] == L]
points(lvl5_umap_lig16$layout[lvl5_data@cdesc[rownames(lvl5_umap_lig16$layout),"pert_iname"] == L,],
pch=19,col=sequential_hcl(100,palette="inferno",rev=T,alpha=0.7)[temp_col])
}
temp_genes <- c("NFKBIA","STAT1","DUSP4","IER3","RELB","PSMB8","ICAM1")
temp_lig <- list("TNF","IFNG","BTC","TGFA","TNF","IFNG",c("TNF","IFNG"))
par(mar=c(1,1,1,0.5),mgp=c(0,0,0))
temp_par <- par(c("mar","mgp","bty","las"))
for (N in seq_along(temp_genes)) {
temp_z <- lvl5_data@mat[
rownames(lvl5_data@rdesc)[lvl5_data@rdesc$pr_gene_symbol == temp_genes[N]],
rownames(lvl5_umap_lig16$layout)
]
temp_down <- cut(c(0,max(abs(range(temp_z))) * -1,temp_z[temp_z <= 0]),
50,labels=F)[c(-1,-2)]
temp_up <- cut(c(0,max(abs(range(temp_z))),temp_z[temp_z > 0]),
50,labels=F)[c(-1,-2)]
temp_col <- rep(NA, length(temp_z))
temp_col[temp_z <= 0] <- temp_down
temp_col[temp_z > 0] <- temp_up + 50
temp_big <- rownames(lvl5_umap_lig16$layout) %in%
rownames(lvl5_data@cdesc)[lvl5_data@cdesc$pert_iname %in% temp_lig[[N]]]
par(temp_par)
plot(NA,NA,xlim=range(lvl5_umap_lig16$layout[,1]),ylim=range(lvl5_umap_lig16$layout[,2]),
xaxt="n",yaxt="n",xlab="UMAP1",ylab="UMAP2",main=temp_genes[N])
points(lvl5_umap_lig16$layout[!temp_big,],pch=20,
col=colorspace::diverge_hcl(100, palette = "Purple-Green",alpha=.8)[temp_col])
points(lvl5_umap_lig16$layout[temp_big,],pch=19,
col=colorspace::diverge_hcl(100, palette = "Purple-Green",alpha=.8)[temp_col][temp_big])
temp_bp <- list(
Rest=temp_z[!names(temp_z) %in%
rownames(lvl5_data@cdesc)[lvl5_data@cdesc$pert_iname %in% temp_lig[[N]]]]
)
temp_bp <- append(temp_bp,
sapply(temp_lig[[N]],function(X)
temp_z[names(temp_z) %in%
rownames(lvl5_data@cdesc)[lvl5_data@cdesc$pert_iname %in% X]],
simplify=F))
names(temp_bp)[names(temp_bp) == ""] <- temp_lig[[N]]
par(mar=c(3.5,3,0,0.5),mgp=2:0,bty="n",las=1)
boxplot(temp_bp,horizontal=T,yaxt="n",xaxs="i",xaxt="n")
mtext(names(temp_bp),side=2,at=seq_along(temp_bp),cex=0.8)
temp_leg <- round(seq(min(temp_col),max(temp_col),length.out=1000) * 10)
segments(xpd=NA,lwd=1,y0=rep(par("usr")[3],1000),
y1=rep(par("usr")[3] - (0.11 * (par("usr")[4] - par("usr")[3])),1000),
x0=seq(min(unlist(temp_bp)),max(unlist(temp_bp)),length.out=1000),
x1=seq(min(unlist(temp_bp)),max(unlist(temp_bp)),length.out=1000),
col=colorspace::diverge_hcl(1000, palette = "Purple-Green")[temp_leg])
axis(1)
mtext("Z-score",side=1,cex=0.8,line=2)
}
rm(list=c("N","L",grep("^temp",ls(),value=T)))
```
The model seems to do well when there are strong marker genes to aid in the classification, but a single weak marker gene like *DUSP4* that has occasional high differential expression out of class is not sufficient to accurately classify ligand-treated cells in multiple cell types.
# Cell-type biases in accuracy
Are there differences in the model's ability to classify ligand treatments between cell types? In order to ask this, the model was repeatedly trained on a leave-one-out basis per ligand treatment such that each sample in a ligand treatment was tested for model classification accuracy at least 10 times.
This data was used to colour the UMAP projections in the top row of the previous figure, and results are summarized in the matrix below, with darker colour representing higher rates of correct classification (ranging from 0% to 100% accuracy) across >10 tests for each sample. The number of samples of each ligand tested in a cell type is indicated in blue.
[Random forest model source code](https://github.com/BaderLab/Brendan_CCInxPred/blob/master/200524_lvl5_mixall_leave1out.R)
```{r leave1out_acc_ct,echo=FALSE,message=FALSE,warning=FALSE,fig.height=5,fig.width=7,fig.show='hold'}
# see {RF_leave1out} for input code for this figure.
par(mar=c(2,13,5,1),mgp=2:0,las=2)
image(z=t(acc_leave1out_ct * 100)[,seq(nrow(acc_leave1out_ct),1)],
x=1:ncol(acc_leave1out_ct),y=1:nrow(acc_leave1out_ct),
col=sequential_hcl(100,palette="inferno",rev=T),breaks=0:100,
xaxt="n",yaxt="n",xlab=NA,ylab=NA)
mtext(colnames(acc_leave1out_ct),side=3,at=1:ncol(acc_leave1out_ct),adj=0,line=0.1)
mtext("Ligands",side=3,line=3.5,at=((ncol(acc_leave1out_ct)-1)/2)+1,las=0,font=2,cex=1.5)
mtext(rev(rownames(acc_leave1out_ct)),
side=2,at=1:nrow(acc_leave1out_ct),adj=1,line=0.1)
mtext("Cell Types",side=3,line=0.1,at=0.2,adj=1,las=0,font=2,cex=1.5)
text(as.vector(sapply(1:ncol(test_counts),function(X) rep(X,nrow(test_counts)))),
as.vector(sapply(1:ncol(test_counts),function(X) nrow(test_counts):1)),
labels=as.vector(test_counts),cex=0.8,col="dodgerblue")
mtext(paste("Sample","number",sep="\n"),at=ncol(acc_leave1out_ct) + 0.5,
side=1,line=1,adj=1,las=0,col="dodgerblue")
segments(x0=seq(2.5,11.5,length.out=1000),
x1=seq(2.5,11.5,length.out=1000),
y0=rep(-0.1,1000),y1=rep(0.3,1000),
xpd=NA,col=sequential_hcl(1000,palette="inferno",rev=T))
mtext(c("0%","100%"),las=0,side=1,at=c(2.5,11.5),adj=c(1.1,-0.1))
mtext("Accuracy",las=0,side=1,line=0.9,at=7,adj=0.5)
```
The ligands were tested in multiple contexts (time / dose) in the breast tumour lines, so they have more training data available per cell line. This may improve classification tasks by allowing the model to learn cell-type specific gene expression changes. However, some ligands (eg. FGF1, EGF) proved challenging to predict even in breast lines with increased sampling.
## Cell type misclassification biases
This figure shows the distribution of predicted ligands from the experiment above, in order to determine whether ligands were misclassified in a consistent manner.
```{r leave1out_class_ct,echo=FALSE,message=FALSE,warning=FALSE,fig.height=5,fig.width=9,fig.show='hold'}
# sapply(colnames(test_predictions[["IL17A"]]),function(X)
# sort(test_predictions[["IL17A"]][,X],decreasing=T),simplify=F)
temp_col <- unlist(lapply(qualitative_hcl(ceiling(length(lig16) / 2),palette="dark3"),
function(X) rep(X,2)))[1:length(lig16)]
names(temp_col) <- lig16
layout(rbind(c(17,1:16)),widths=c(2,rep(1,16)))
par(mar=c(0,0.1,5,0.1),mgp=2:0,las=2)
for (LIG in lig16) {
temp <- barplot(test_predictions[[LIG]][,rev(seq_along(ct14))],axes=F,space=0,horiz=T,
las=3,col=temp_col,density=c(NA,40),angle=45,las=2,axisnames=F)
mtext(LIG,side=3,line=0,font=2)#,col=temp_col[LIG])
rect(0.25,14.1,0.75,14.5,xpd=NA,col=temp_col[LIG],border="black",
density=switch(as.character(which(lig16 == LIG) %% 2),'1'=NA,'0'=40))
temp2 <- par("usr")
}
par(mar=c(0,0,5,0))
plot(NA,NA,xlim=0:1,ylim=temp2[3:4],yaxs="i",bty="n",xaxt="n",yaxt="n")
mtext(rev(ct14),side=4,line=-0.5,at=temp,las=2,adj=1,font=2)
```
The exemplar of consistent misclassification is IL17A in VCAP prostate tumour line, where in repeated leave-one-out tests its transcriptional changes were predicted to be due to TNF treatment 100% of the time. By inspecting the most differentially expressed genes in these transcriptomes, we may be able to explain why. In the case of IL17A-treated VCAP cells, ICAM1 and CCL2 are the most positively differentially expressed genes, as seen in the IL17A heatmap below. These genes are also consistently strong markers of TNF treatment in all cell lines, as seen in the volcano plot earlier.
## Ligand response genes per cell type
```{r lig_genes_per_ct,echo=FALSE,message=FALSE,warning=FALSE}
if (!file.exists("docs/output_figs/TNF.png")) {
for (LIG in lig16) {
temp_lig <- rownames(lvl5_data@cdesc)[lvl5_data@cdesc$pert_iname == LIG &
lvl5_data@cdesc$cell_id %in% ct14]
temp_ct <- factor(lvl5_data@cdesc[temp_lig,"cell_id"],levels=ct14)
levels(temp_ct) <- names(ct14)
tempZ <- apply(lvl5_data@mat[,temp_lig],1,function(X) lm(X ~ 0 + temp_ct))
tempZM <- t(sapply(tempZ,function(X) summary(X)$coefficients[,1]))
tempZT <- t(sapply(tempZ,function(X) summary(X)$coefficients[,3]))
colnames(tempZM) <- colnames(tempZT) <- sub("temp_ct","",colnames(tempZT))
tempZTtophits <- apply(tempZT,2,function(X) names(X)[order(abs(X),decreasing=T)][1:10])
tempZTtop <- tempZT[unique(as.vector(tempZTtophits)),]
tempZTtop <- tempZTtop[hclust(dist(tempZTtop),"complete")$order,
hclust(dist(t(tempZTtop)),"complete")$order]
temp_col <- as.vector(tempZTtop)
temp_down <- cut(c(0,max(abs(range(temp_col))) * -1,temp_col[temp_col <= 0]),
50,labels=F)[c(-1,-2)]
temp_up <- cut(c(0,max(abs(range(temp_col))),temp_col[temp_col > 0]),
50,labels=F)[c(-1,-2)]
temp_heat <- rep(NA,length(temp_col))
temp_heat[temp_col <= 0] <- temp_down
temp_heat[temp_col > 0] <- temp_up + 50
temp_heat <- matrix(temp_heat,ncol=14)
colnames(temp_heat) <- colnames(tempZTtop)
rownames(temp_heat) <- lvl5_data@rdesc[rownames(tempZTtop),"pr_gene_symbol"]
temp_heat <- t(temp_heat)
png(paste0("docs/output_figs/",LIG,".png"),width=10,height=5,units="in",res=300)
par(mar=c(0.5,13.5,4.5,0.5),mgp=2:0,las=2)
image(z=t(temp_heat)[,seq(nrow(temp_heat),1)],
x=1:ncol(temp_heat),y=1:nrow(temp_heat),
col=colorspace::diverge_hcl(100, palette = "Purple-Green"),
breaks=0:100,xaxt="n",yaxt="n",xlab=NA,ylab=NA)
mtext(colnames(temp_heat),side=3,at=1:ncol(temp_heat),adj=0,line=0.1,cex=0.5)
rect(rep(par("usr")[1] - 1.5,nrow(temp_heat)),
seq(par("usr")[4] - 1,par("usr")[1],-1),
rep(par("usr")[1] - 0.5,nrow(temp_heat)),
seq(par("usr")[4],par("usr")[1] + 1,-1),
xpd=NA,border=NA,
col=sequential_hcl(100,palette="inferno",rev=T)[
cut(c(0,1,acc_leave1out_ct[rownames(temp_heat),LIG]),
breaks=100,labels=F)[-1:-2]
])
mtext(rev(paste0(rownames(temp_heat)," (",test_counts[rownames(temp_heat),LIG],")")),
side=2,at=1:nrow(temp_heat),adj=1,line=1)
mtext(paste(paste(LIG,"Z-score"),"T-statistics",sep="\n"),
side=3,line=1.5,at=par("usr")[1] - par("cxy")[1] * 3,adj=1,las=0,font=2,cex=1.5)
segments(x0=seq(par("usr")[1] - par("cxy")[1] * 3 - strwidth(paste(LIG,"Z-score"),cex=1.5,font=2),
par("usr")[1] - par("cxy")[1] * 3,
length.out=500),
x1=seq(par("usr")[1] - par("cxy")[1] * 3 - strwidth(paste(LIG,"Z-score"),cex=1.5,font=2),
par("usr")[1] - par("cxy")[1] * 3,
length.out=500),
y0=rep(par("usr")[4] + par("cxy")[2] * 1.2,500),
y1=rep(par("usr")[4] + par("cxy")[2] * 1.5,500),
xpd=NA,lwd=2,col=colorspace::diverge_hcl(500, palette = "Purple-Green"))
text(c(par("usr")[1] - par("cxy")[1] * 3,
par("usr")[1] - par("cxy")[1] * 3 - strwidth(paste(LIG,"Z-score"),cex=1.5,font=2),
par("usr")[1] - par("cxy")[1] * 3 - strwidth(paste(LIG,"Z-score"),cex=1.5,font=2) * 0.5),
rep(par("usr")[4] + par("cxy")[2] * 1.2,3),
labels=c(round(max(abs(tempZTtop)),1),
round(max(abs(tempZTtop)),1) * -1,
0),
pos=1,xpd=NA,cex=0.8)
dev.off()
png(paste0("docs/output_figs/",LIG,"_sm.png"),width=220,height=110,units="px")
par(mar=c(.2,.5,.2,.2),mgp=2:0,las=2)
image(z=t(temp_heat)[,seq(nrow(temp_heat),1)],
x=1:ncol(temp_heat),y=1:nrow(temp_heat),
col=colorspace::diverge_hcl(100, palette = "Purple-Green"),
breaks=0:100,xaxt="n",yaxt="n",xlab=NA,ylab=NA)
rect(rep(par("usr")[1] - 1.5,nrow(temp_heat)),
seq(par("usr")[4] - 1,par("usr")[1],-1),
rep(par("usr")[1] - 0.5,nrow(temp_heat)),
seq(par("usr")[4],par("usr")[1] + 1,-1),
xpd=NA,border=NA,
col=sequential_hcl(100,palette="inferno",rev=T)[
cut(c(0,1,acc_leave1out_ct[rownames(temp_heat),LIG]),
breaks=100,labels=F)[-1:-2]
])
text(par("usr")[1] + (par("usr")[2] - par("usr")[1]) * 0.5,
par("usr")[3] + (par("usr")[4] - par("usr")[3]) * 0.5,
labels=LIG,font=2,cex=3.5)
dev.off()
}
}
```
To determine which genes had Z-scores (treated vs. control) with consistently high magnitudes within a cell type, one-way analyses of variance were performed with cell types as the independent variable and each gene's Z-scores as the dependent variable. The T-statistic from each coefficient was used to summarize the Z-score distributions for each gene. The ten T-statistics with the highest magnitude per cell type are shown in the heatmaps below, with cell type prediction accuracies from above included as coloured bars next to each row. **Click below for full-size image**.
```{r lig_genes_per_ct_out,echo=F,results="asis"}
temp <- matrix(lig16,nrow=4,byrow=T)
for (L in 1:nrow(temp)) {
cat("<p>")
for (LIG in temp[L,]) {
cat(paste0('<a href="output_figs/',LIG,'.png">',
'<img src="output_figs/',LIG,'_sm.png"/>',
'</a>'))
}
cat("</p>")
cat("\n")
}
```
# Metadata features
Rather than let the model implicitly learn metadata characteristics such as cell line, treatment duration, and dosage, perhaps it would improve accuracy if these were explicitly provided as features.
```{r RF_saturated_metadata,echo=FALSE,message=FALSE,warning=FALSE,fig.height=3,fig.width=9,fig.show='hold'}
if (file.exists("~/Dropbox/GDB_archive/CMapCorr_files/200703_lvl5_mixall_balanced_saturated_metadata.RData")) {
temp <- load("~/Dropbox/GDB_archive/CMapCorr_files/200703_lvl5_mixall_balanced_saturated_metadata.RData")
} else {
source("200703_lvl5_mixall_metadata.R")
}
temp_confusion <- list()
temp_acc <- list()
for (N in seq_along(rfresults)) {
temp_confusion[[N]] <- table(true=lvl5_data@cdesc[testIDs[[N]],"pert_iname"],
predicted=rfresults[[N]]$predictions)
temp_acc[[N]] <- sweep(temp_confusion[[N]],1,rowSums(temp_confusion[[N]]),"/")
}
acc_saturating_metadata <- sapply(temp_acc,diag)
layout(cbind(1,2),widths=c(7,2))
par(mar=c(3,3,1,1),mgp=2:0)
boxplot(acc_saturating_metadata,pch=".",
ylab="Accuracy per ligand",
xlab="Number of training samples per ligand")
points(1:ncol(acc_saturating_metadata),colMeans(acc_saturating_metadata),
type="l",lwd=2,col="red")
abline(h=max(colMeans(acc_saturating_metadata)),lty=2,col="red")
legend("topleft",legend="Mean accuracy",
bty="n",lwd=2,col="red",cex=0.8)
boxplot(list("Expr"=colMeans(acc_saturating),
"+ MD"=colMeans(acc_saturating_metadata)),
ylab="Mean accuracy",pch=".",ylim=0:1)
text(1.5,max(c(colMeans(acc_saturating),colMeans(acc_saturating_metadata))),
labels=paste("p =",
round(wilcox.test(colMeans(acc_saturating),
colMeans(acc_saturating_metadata))$p.value,
2)),
pos=3,col="red")
rm(list=temp)
rm(list=grep("^temp",ls(),value=T))
```
Since the random forest model accuracy is not improved by the addition of metadata features, gene expression differences must capture enough of the same information as cell line, treatment duration, and dosage.
****