Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit dffccb7

Browse files
committedApr 28, 2022
updated plots
1 parent ecb1a89 commit dffccb7

File tree

1 file changed

+1396
-197
lines changed

1 file changed

+1396
-197
lines changed
 

‎plots.ipynb

+1,396-197
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,24 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 1,
5+
"execution_count": 44,
66
"id": "addeddb1",
77
"metadata": {
88
"ExecuteTime": {
9-
"end_time": "2021-11-09T14:14:19.548112Z",
10-
"start_time": "2021-11-09T14:14:18.859022Z"
9+
"end_time": "2022-04-22T13:36:26.454251Z",
10+
"start_time": "2022-04-22T13:36:26.361572Z"
1111
}
1212
},
13-
"outputs": [],
13+
"outputs": [
14+
{
15+
"name": "stdout",
16+
"output_type": "stream",
17+
"text": [
18+
"The autoreload extension is already loaded. To reload it, use:\n",
19+
" %reload_ext autoreload\n"
20+
]
21+
}
22+
],
1423
"source": [
1524
"import os\n",
1625
"import numpy as np\n",
@@ -54,6 +63,27 @@
5463
"%autoreload 2"
5564
]
5665
},
66+
{
67+
"cell_type": "code",
68+
"execution_count": 45,
69+
"id": "1159b6aa",
70+
"metadata": {
71+
"ExecuteTime": {
72+
"end_time": "2022-04-22T13:36:29.398161Z",
73+
"start_time": "2022-04-22T13:36:29.374391Z"
74+
}
75+
},
76+
"outputs": [],
77+
"source": [
78+
"def to_seconds(s):\n",
79+
" parts = s.split(':')\n",
80+
" assert len(parts) <= 3\n",
81+
" time = 0.\n",
82+
" for p in parts:\n",
83+
" time = time * 60 + float(p)\n",
84+
" return time"
85+
]
86+
},
5787
{
5888
"cell_type": "markdown",
5989
"id": "aa49ed37",
@@ -74,12 +104,12 @@
74104
},
75105
{
76106
"cell_type": "code",
77-
"execution_count": 2,
107+
"execution_count": 46,
78108
"id": "9f285dd6",
79109
"metadata": {
80110
"ExecuteTime": {
81-
"end_time": "2021-11-09T14:14:19.565071Z",
82-
"start_time": "2021-11-09T14:14:19.550186Z"
111+
"end_time": "2022-04-22T13:36:30.200207Z",
112+
"start_time": "2022-04-22T13:36:30.177042Z"
83113
}
84114
},
85115
"outputs": [],
@@ -101,12 +131,12 @@
101131
},
102132
{
103133
"cell_type": "code",
104-
"execution_count": 3,
134+
"execution_count": 12,
105135
"id": "aa0d8c90",
106136
"metadata": {
107137
"ExecuteTime": {
108-
"end_time": "2021-11-09T14:14:19.623819Z",
109-
"start_time": "2021-11-09T14:14:19.568014Z"
138+
"end_time": "2022-04-01T22:36:17.657643Z",
139+
"start_time": "2022-04-01T22:36:17.546620Z"
110140
},
111141
"scrolled": false
112142
},
@@ -135,12 +165,12 @@
135165
},
136166
{
137167
"cell_type": "code",
138-
"execution_count": 4,
168+
"execution_count": 13,
139169
"id": "ae45466d",
140170
"metadata": {
141171
"ExecuteTime": {
142-
"end_time": "2021-11-09T14:14:19.643938Z",
143-
"start_time": "2021-11-09T14:14:19.625845Z"
172+
"end_time": "2022-04-01T22:36:18.374915Z",
173+
"start_time": "2022-04-01T22:36:18.336702Z"
144174
}
145175
},
146176
"outputs": [],
@@ -164,12 +194,12 @@
164194
},
165195
{
166196
"cell_type": "code",
167-
"execution_count": 5,
197+
"execution_count": 14,
168198
"id": "c82fff4b",
169199
"metadata": {
170200
"ExecuteTime": {
171-
"end_time": "2021-11-09T14:14:19.661601Z",
172-
"start_time": "2021-11-09T14:14:19.645460Z"
201+
"end_time": "2022-04-01T22:36:18.922091Z",
202+
"start_time": "2022-04-01T22:36:18.883641Z"
173203
}
174204
},
175205
"outputs": [],
@@ -193,12 +223,12 @@
193223
},
194224
{
195225
"cell_type": "code",
196-
"execution_count": 6,
226+
"execution_count": 15,
197227
"id": "f5cd92dc",
198228
"metadata": {
199229
"ExecuteTime": {
200-
"end_time": "2021-11-09T14:14:19.678527Z",
201-
"start_time": "2021-11-09T14:14:19.663047Z"
230+
"end_time": "2022-04-01T22:36:19.450515Z",
231+
"start_time": "2022-04-01T22:36:19.414664Z"
202232
}
203233
},
204234
"outputs": [],
@@ -225,12 +255,104 @@
225255
},
226256
{
227257
"cell_type": "code",
228-
"execution_count": 7,
258+
"execution_count": 16,
259+
"id": "831d9053",
260+
"metadata": {
261+
"ExecuteTime": {
262+
"end_time": "2022-04-01T22:36:20.161981Z",
263+
"start_time": "2022-04-01T22:36:20.125833Z"
264+
}
265+
},
266+
"outputs": [],
267+
"source": [
268+
"with open('mtg_size_graph.pkl', 'rb') as f:\n",
269+
" mtg_size_graph = pickle.load(f)\n",
270+
"\n",
271+
"# mtg_size_graph = []\n",
272+
"\n",
273+
"# DIR = !echo $HOME\n",
274+
"# DIR = f'{DIR[0]}/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/mtg'\n",
275+
"\n",
276+
"# for x in gzip:\n",
277+
"# ID = x.split('/')[-1][:-len('_no_header.fasta.gz')]\n",
278+
"# mtg_size_graph.append(get_size(f'{DIR}/{ID[:6]}/{ID}/graph_small.dbg'))\n",
279+
"\n",
280+
"# mtg_size_graph = np.array(mtg_size_graph)\n",
281+
"\n",
282+
"# with open('mtg_size_graph.pkl', 'wb') as f:\n",
283+
"# pickle.dump(mtg_size_graph, f)"
284+
]
285+
},
286+
{
287+
"cell_type": "code",
288+
"execution_count": 17,
289+
"id": "a1a2acbd",
290+
"metadata": {
291+
"ExecuteTime": {
292+
"end_time": "2022-04-01T22:36:20.778732Z",
293+
"start_time": "2022-04-01T22:36:20.741770Z"
294+
}
295+
},
296+
"outputs": [],
297+
"source": [
298+
"with open('mtg_size_anno.pkl', 'rb') as f:\n",
299+
" mtg_size_anno = pickle.load(f)\n",
300+
"\n",
301+
"# mtg_size_anno = []\n",
302+
"\n",
303+
"# DIR = !echo $HOME\n",
304+
"# DIR = f'{DIR[0]}/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/mtg'\n",
305+
"\n",
306+
"# for x in gzip:\n",
307+
"# ID = x.split('/')[-1][:-len('_no_header.fasta.gz')]\n",
308+
"# mtg_size_anno.append(get_size(f'{DIR}/{ID[:6]}/{ID}/annotation.row_diff_coord.annodbg'))\n",
309+
"\n",
310+
"# mtg_size_anno = np.array(mtg_size_anno)\n",
311+
"\n",
312+
"# with open('mtg_size_anno.pkl', 'wb') as f:\n",
313+
"# pickle.dump(mtg_size_anno, f)"
314+
]
315+
},
316+
{
317+
"cell_type": "code",
318+
"execution_count": 18,
319+
"id": "a90f1ce3",
320+
"metadata": {
321+
"ExecuteTime": {
322+
"end_time": "2022-04-01T22:36:21.526690Z",
323+
"start_time": "2022-04-01T22:36:21.491789Z"
324+
}
325+
},
326+
"outputs": [],
327+
"source": [
328+
"with open('mtg2_size.pkl', 'rb') as f:\n",
329+
" mtg2_size = pickle.load(f)\n",
330+
"\n",
331+
"# mtg2_size = []\n",
332+
"\n",
333+
"# DIR = !echo $HOME\n",
334+
"# DIR = f'{DIR[0]}/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/mtg_fork_opt'\n",
335+
"\n",
336+
"# for x in gzip:\n",
337+
"# ID = x.split('/')[-1][:-len('_no_header.fasta.gz')]\n",
338+
"# graph_size = get_size(f'{DIR}/{ID[:6]}/{ID}/graph_small.dbg')\n",
339+
"# anno_size = get_size(f'{DIR}/{ID[:6]}/{ID}/annotation.row_diff_coord.annodbg')\n",
340+
"# mtg2_size.append(graph_size + anno_size)\n",
341+
"\n",
342+
"# mtg2_size = np.array(mtg2_size)\n",
343+
"\n",
344+
"# with open('mtg2_size.pkl', 'wb') as f:\n",
345+
"# pickle.dump(mtg2_size, f)"
346+
]
347+
},
348+
{
349+
"cell_type": "code",
350+
"execution_count": 19,
229351
"id": "48eee7cf",
230352
"metadata": {
231353
"ExecuteTime": {
232-
"end_time": "2021-11-09T14:14:19.695733Z",
233-
"start_time": "2021-11-09T14:14:19.680784Z"
354+
"end_time": "2022-04-01T22:36:22.121466Z",
355+
"start_time": "2022-04-01T22:36:22.083854Z"
234356
}
235357
},
236358
"outputs": [],
@@ -257,12 +379,12 @@
257379
},
258380
{
259381
"cell_type": "code",
260-
"execution_count": 8,
382+
"execution_count": 20,
261383
"id": "5238f351",
262384
"metadata": {
263385
"ExecuteTime": {
264-
"end_time": "2021-11-09T14:14:19.712975Z",
265-
"start_time": "2021-11-09T14:14:19.697596Z"
386+
"end_time": "2022-04-01T22:36:22.611761Z",
387+
"start_time": "2022-04-01T22:36:22.572974Z"
266388
}
267389
},
268390
"outputs": [],
@@ -289,12 +411,12 @@
289411
},
290412
{
291413
"cell_type": "code",
292-
"execution_count": 9,
414+
"execution_count": 21,
293415
"id": "9261d1df",
294416
"metadata": {
295417
"ExecuteTime": {
296-
"end_time": "2021-11-09T14:14:19.729536Z",
297-
"start_time": "2021-11-09T14:14:19.714366Z"
418+
"end_time": "2022-04-01T22:36:23.855207Z",
419+
"start_time": "2022-04-01T22:36:23.816565Z"
298420
}
299421
},
300422
"outputs": [],
@@ -321,12 +443,12 @@
321443
},
322444
{
323445
"cell_type": "code",
324-
"execution_count": 10,
446+
"execution_count": 22,
325447
"id": "07b89508",
326448
"metadata": {
327449
"ExecuteTime": {
328-
"end_time": "2021-11-09T14:14:19.767464Z",
329-
"start_time": "2021-11-09T14:14:19.730872Z"
450+
"end_time": "2022-04-01T22:36:24.483454Z",
451+
"start_time": "2022-04-01T22:36:24.426837Z"
330452
}
331453
},
332454
"outputs": [
@@ -336,7 +458,7 @@
336458
"149.967152078"
337459
]
338460
},
339-
"execution_count": 10,
461+
"execution_count": 22,
340462
"metadata": {},
341463
"output_type": "execute_result"
342464
}
@@ -347,12 +469,12 @@
347469
},
348470
{
349471
"cell_type": "code",
350-
"execution_count": 11,
472+
"execution_count": 23,
351473
"id": "5c1470a5",
352474
"metadata": {
353475
"ExecuteTime": {
354-
"end_time": "2021-11-09T14:14:19.865413Z",
355-
"start_time": "2021-11-09T14:14:19.768843Z"
476+
"end_time": "2022-04-01T22:36:25.084414Z",
477+
"start_time": "2022-04-01T22:36:25.031836Z"
356478
},
357479
"scrolled": true
358480
},
@@ -363,7 +485,7 @@
363485
"875.484502657"
364486
]
365487
},
366-
"execution_count": 11,
488+
"execution_count": 23,
367489
"metadata": {},
368490
"output_type": "execute_result"
369491
}
@@ -374,12 +496,12 @@
374496
},
375497
{
376498
"cell_type": "code",
377-
"execution_count": 12,
499+
"execution_count": 24,
378500
"id": "22ce3594",
379501
"metadata": {
380502
"ExecuteTime": {
381-
"end_time": "2021-11-09T14:14:20.152458Z",
382-
"start_time": "2021-11-09T14:14:19.869394Z"
503+
"end_time": "2022-04-01T22:36:29.136227Z",
504+
"start_time": "2022-04-01T22:36:28.682322Z"
383505
},
384506
"scrolled": true
385507
},
@@ -392,7 +514,8 @@
392514
"Pufferfish compression: 0.3\n",
393515
"BLAST compression: 3.1\n",
394516
"gzip -9 compression: 5.8\n",
395-
"mtg compression: 6.2\n",
517+
"mtg (single succ.) compression: 6.2\n",
518+
"mtg (multiple succ.) compression: 7.1\n",
396519
"Spring compression: 18.0\n"
397520
]
398521
}
@@ -402,18 +525,19 @@
402525
"print(f\"Pufferfish compression: {sum(num_bp) / sum(pfish_size):.1f}\")\n",
403526
"print(f\"BLAST compression: {sum(num_bp) / sum(blast_size):.1f}\")\n",
404527
"print(f\"gzip -9 compression: {sum(num_bp) / sum(gzip_size):.1f}\")\n",
405-
"print(f\"mtg compression: {sum(num_bp) / sum(mtg_size):.1f}\")\n",
528+
"print(f\"mtg (single succ.) compression: {sum(num_bp) / sum(mtg_size):.1f}\")\n",
529+
"print(f\"mtg (multiple succ.) compression: {sum(num_bp[mtg2_size > 0]) / sum(mtg2_size[mtg2_size > 0]):.1f}\")\n",
406530
"print(f\"Spring compression: {sum(num_bp) / sum(spring_size):.1f}\")"
407531
]
408532
},
409533
{
410534
"cell_type": "code",
411-
"execution_count": 13,
535+
"execution_count": 25,
412536
"id": "c129057a",
413537
"metadata": {
414538
"ExecuteTime": {
415-
"end_time": "2021-11-09T14:14:20.435271Z",
416-
"start_time": "2021-11-09T14:14:20.154822Z"
539+
"end_time": "2022-04-01T22:36:30.071763Z",
540+
"start_time": "2022-04-01T22:36:29.713198Z"
417541
},
418542
"scrolled": true
419543
},
@@ -426,7 +550,8 @@
426550
"Pufferfish bits/bp: 25.113\n",
427551
"BLAST bits/bp: 2.580\n",
428552
"gzip -9 bits/bp: 1.370\n",
429-
"mtg bits/bp: 1.300\n",
553+
"mtg (single succ.) bits/bp: 1.300\n",
554+
"mtg (multiple succ.) bits/bp: 1.130\n",
430555
"Spring bits/bp: 0.445\n"
431556
]
432557
}
@@ -436,18 +561,19 @@
436561
"print(f\"Pufferfish bits/bp: {sum(pfish_size) * 8 / sum(num_bp):.3f}\")\n",
437562
"print(f\"BLAST bits/bp: {sum(blast_size) * 8 / sum(num_bp):.3f}\")\n",
438563
"print(f\"gzip -9 bits/bp: {sum(gzip_size) * 8 / sum(num_bp):.3f}\")\n",
439-
"print(f\"mtg bits/bp: {sum(mtg_size) * 8 / sum(num_bp):.3f}\")\n",
564+
"print(f\"mtg (single succ.) bits/bp: {sum(mtg_size) * 8 / sum(num_bp):.3f}\")\n",
565+
"print(f\"mtg (multiple succ.) bits/bp: {sum(mtg2_size[mtg2_size > 0]) * 8 / sum(num_bp[mtg2_size > 0]):.3f}\")\n",
440566
"print(f\"Spring bits/bp: {sum(spring_size) * 8 / sum(num_bp):.3f}\")"
441567
]
442568
},
443569
{
444570
"cell_type": "code",
445-
"execution_count": 14,
571+
"execution_count": 26,
446572
"id": "6807effa",
447573
"metadata": {
448574
"ExecuteTime": {
449-
"end_time": "2021-11-09T14:14:20.458287Z",
450-
"start_time": "2021-11-09T14:14:20.439209Z"
575+
"end_time": "2022-04-01T22:36:31.610803Z",
576+
"start_time": "2022-04-01T22:36:31.582740Z"
451577
}
452578
},
453579
"outputs": [
@@ -457,7 +583,7 @@
457583
"152884"
458584
]
459585
},
460-
"execution_count": 14,
586+
"execution_count": 26,
461587
"metadata": {},
462588
"output_type": "execute_result"
463589
}
@@ -485,12 +611,12 @@
485611
},
486612
{
487613
"cell_type": "code",
488-
"execution_count": 15,
614+
"execution_count": 27,
489615
"id": "03641724",
490616
"metadata": {
491617
"ExecuteTime": {
492-
"end_time": "2021-11-09T14:14:22.748943Z",
493-
"start_time": "2021-11-09T14:14:20.459756Z"
618+
"end_time": "2022-04-01T22:36:38.926734Z",
619+
"start_time": "2022-04-01T22:36:35.666505Z"
494620
}
495621
},
496622
"outputs": [
@@ -536,12 +662,12 @@
536662
},
537663
{
538664
"cell_type": "code",
539-
"execution_count": 16,
665+
"execution_count": 28,
540666
"id": "255041f2",
541667
"metadata": {
542668
"ExecuteTime": {
543-
"end_time": "2021-11-09T14:14:23.190372Z",
544-
"start_time": "2021-11-09T14:14:22.752090Z"
669+
"end_time": "2022-04-01T22:36:38.951695Z",
670+
"start_time": "2022-04-01T22:36:38.929720Z"
545671
}
546672
},
547673
"outputs": [
@@ -559,12 +685,12 @@
559685
},
560686
{
561687
"cell_type": "code",
562-
"execution_count": 17,
688+
"execution_count": 29,
563689
"id": "ff17867c",
564690
"metadata": {
565691
"ExecuteTime": {
566-
"end_time": "2021-11-09T14:14:23.501441Z",
567-
"start_time": "2021-11-09T14:14:23.194795Z"
692+
"end_time": "2022-04-01T22:36:39.247516Z",
693+
"start_time": "2022-04-01T22:36:38.956001Z"
568694
}
569695
},
570696
"outputs": [
@@ -574,7 +700,7 @@
574700
"152418"
575701
]
576702
},
577-
"execution_count": 17,
703+
"execution_count": 29,
578704
"metadata": {},
579705
"output_type": "execute_result"
580706
}
@@ -585,12 +711,12 @@
585711
},
586712
{
587713
"cell_type": "code",
588-
"execution_count": 18,
714+
"execution_count": 30,
589715
"id": "c8d921fe",
590716
"metadata": {
591717
"ExecuteTime": {
592-
"end_time": "2021-11-09T14:14:23.569444Z",
593-
"start_time": "2021-11-09T14:14:23.505170Z"
718+
"end_time": "2022-04-01T22:36:39.320067Z",
719+
"start_time": "2022-04-01T22:36:39.251631Z"
594720
}
595721
},
596722
"outputs": [
@@ -600,7 +726,7 @@
600726
"38.4434618997873"
601727
]
602728
},
603-
"execution_count": 18,
729+
"execution_count": 30,
604730
"metadata": {},
605731
"output_type": "execute_result"
606732
}
@@ -619,12 +745,12 @@
619745
},
620746
{
621747
"cell_type": "code",
622-
"execution_count": 19,
748+
"execution_count": 31,
623749
"id": "822619af",
624750
"metadata": {
625751
"ExecuteTime": {
626-
"end_time": "2021-11-09T14:14:23.696575Z",
627-
"start_time": "2021-11-09T14:14:23.570820Z"
752+
"end_time": "2022-04-01T22:36:39.457479Z",
753+
"start_time": "2022-04-01T22:36:39.326006Z"
628754
}
629755
},
630756
"outputs": [
@@ -1107,12 +1233,12 @@
11071233
},
11081234
{
11091235
"cell_type": "code",
1110-
"execution_count": 20,
1236+
"execution_count": 32,
11111237
"id": "5ed9f0c7",
11121238
"metadata": {
11131239
"ExecuteTime": {
1114-
"end_time": "2021-11-09T14:14:23.858803Z",
1115-
"start_time": "2021-11-09T14:14:23.698367Z"
1240+
"end_time": "2022-04-01T22:36:39.675712Z",
1241+
"start_time": "2022-04-01T22:36:39.461357Z"
11161242
}
11171243
},
11181244
"outputs": [],
@@ -1124,18 +1250,21 @@
11241250
"spring_size = spring_size[idx]\n",
11251251
"pfish_size = pfish_size[idx]\n",
11261252
"mtg_size = mtg_size[idx]\n",
1253+
"mtg_size_graph = mtg_size_graph[idx]\n",
1254+
"mtg_size_anno = mtg_size_anno[idx]\n",
1255+
"mtg2_size = mtg2_size[idx]\n",
11271256
"blast_size = blast_size[idx]\n",
11281257
"mgblst_size = mgblst_size[idx]"
11291258
]
11301259
},
11311260
{
11321261
"cell_type": "code",
1133-
"execution_count": 21,
1262+
"execution_count": 33,
11341263
"id": "30526982",
11351264
"metadata": {
11361265
"ExecuteTime": {
1137-
"end_time": "2021-11-09T14:14:23.900052Z",
1138-
"start_time": "2021-11-09T14:14:23.860557Z"
1266+
"end_time": "2022-04-01T22:36:40.670070Z",
1267+
"start_time": "2022-04-01T22:36:40.620378Z"
11391268
}
11401269
},
11411270
"outputs": [
@@ -1145,7 +1274,7 @@
11451274
"111.599186766"
11461275
]
11471276
},
1148-
"execution_count": 21,
1277+
"execution_count": 33,
11491278
"metadata": {},
11501279
"output_type": "execute_result"
11511280
}
@@ -1156,12 +1285,12 @@
11561285
},
11571286
{
11581287
"cell_type": "code",
1159-
"execution_count": 22,
1288+
"execution_count": 34,
11601289
"id": "cc211f0c",
11611290
"metadata": {
11621291
"ExecuteTime": {
1163-
"end_time": "2021-11-09T14:14:23.938646Z",
1164-
"start_time": "2021-11-09T14:14:23.903446Z"
1292+
"end_time": "2022-04-01T22:36:42.411850Z",
1293+
"start_time": "2022-04-01T22:36:42.360430Z"
11651294
},
11661295
"scrolled": true
11671296
},
@@ -1172,7 +1301,7 @@
11721301
"716.726703751"
11731302
]
11741303
},
1175-
"execution_count": 22,
1304+
"execution_count": 34,
11761305
"metadata": {},
11771306
"output_type": "execute_result"
11781307
}
@@ -1183,12 +1312,12 @@
11831312
},
11841313
{
11851314
"cell_type": "code",
1186-
"execution_count": 23,
1315+
"execution_count": 35,
11871316
"id": "997fc48e",
11881317
"metadata": {
11891318
"ExecuteTime": {
1190-
"end_time": "2021-11-09T14:14:24.215738Z",
1191-
"start_time": "2021-11-09T14:14:23.940174Z"
1319+
"end_time": "2022-04-01T22:36:46.216869Z",
1320+
"start_time": "2022-04-01T22:36:45.876951Z"
11921321
},
11931322
"scrolled": true
11941323
},
@@ -1201,7 +1330,8 @@
12011330
"Pufferfish compression: 0.4\n",
12021331
"BLAST compression: 3.0\n",
12031332
"gzip -9 compression: 6.4\n",
1204-
"mtg compression: 14.7\n",
1333+
"mtg (single succ.) compression: 14.7\n",
1334+
"mtg (multiple succ.) compression: nan\n",
12051335
"Spring compression: 38.4\n"
12061336
]
12071337
}
@@ -1211,18 +1341,19 @@
12111341
"print(f\"Pufferfish compression: {sum(num_bp) / sum(pfish_size):.1f}\")\n",
12121342
"print(f\"BLAST compression: {sum(num_bp) / sum(blast_size):.1f}\")\n",
12131343
"print(f\"gzip -9 compression: {sum(num_bp) / sum(gzip_size):.1f}\")\n",
1214-
"print(f\"mtg compression: {sum(num_bp) / sum(mtg_size):.1f}\")\n",
1344+
"print(f\"mtg (single succ.) compression: {sum(num_bp) / sum(mtg_size):.1f}\")\n",
1345+
"print(f\"mtg (multiple succ.) compression: {sum(num_bp) / sum(mtg2_size):.1f}\")\n",
12151346
"print(f\"Spring compression: {sum(num_bp) / sum(spring_size):.1f}\")"
12161347
]
12171348
},
12181349
{
12191350
"cell_type": "code",
1220-
"execution_count": 24,
1351+
"execution_count": 36,
12211352
"id": "880c0ce0",
12221353
"metadata": {
12231354
"ExecuteTime": {
1224-
"end_time": "2021-11-09T14:14:24.504474Z",
1225-
"start_time": "2021-11-09T14:14:24.217362Z"
1355+
"end_time": "2022-04-01T22:36:46.823554Z",
1356+
"start_time": "2022-04-01T22:36:46.488602Z"
12261357
},
12271358
"scrolled": true
12281359
},
@@ -1235,7 +1366,8 @@
12351366
"Pufferfish bits/bp: 21.816\n",
12361367
"BLAST bits/bp: 2.678\n",
12371368
"gzip -9 bits/bp: 1.246\n",
1238-
"mtg bits/bp: 0.544\n",
1369+
"mtg (single succ.) bits/bp: 0.544\n",
1370+
"mtg (multiple succ.) bits/bp: nan\n",
12391371
"Spring bits/bp: 0.208\n"
12401372
]
12411373
}
@@ -1245,10 +1377,36 @@
12451377
"print(f\"Pufferfish bits/bp: {sum(pfish_size) * 8 / sum(num_bp):.3f}\")\n",
12461378
"print(f\"BLAST bits/bp: {sum(blast_size) * 8 / sum(num_bp):.3f}\")\n",
12471379
"print(f\"gzip -9 bits/bp: {sum(gzip_size) * 8 / sum(num_bp):.3f}\")\n",
1248-
"print(f\"mtg bits/bp: {sum(mtg_size) * 8 / sum(num_bp):.3f}\")\n",
1380+
"print(f\"mtg (single succ.) bits/bp: {sum(mtg_size) * 8 / sum(num_bp):.3f}\")\n",
1381+
"print(f\"mtg (multiple succ.) bits/bp: {sum(mtg2_size) * 8 / sum(num_bp):.3f}\")\n",
12491382
"print(f\"Spring bits/bp: {sum(spring_size) * 8 / sum(num_bp):.3f}\")"
12501383
]
12511384
},
1385+
{
1386+
"cell_type": "code",
1387+
"execution_count": 37,
1388+
"id": "fe96ab33",
1389+
"metadata": {
1390+
"ExecuteTime": {
1391+
"end_time": "2022-04-01T22:36:48.428809Z",
1392+
"start_time": "2022-04-01T22:36:48.312386Z"
1393+
}
1394+
},
1395+
"outputs": [
1396+
{
1397+
"name": "stdout",
1398+
"output_type": "stream",
1399+
"text": [
1400+
"mtg (single succ.), graph only, bits/bp: 0.151\n",
1401+
"mtg (single succ.), anno only, bits/bp: 0.393\n"
1402+
]
1403+
}
1404+
],
1405+
"source": [
1406+
"print(f\"mtg (single succ.), graph only, bits/bp: {sum(mtg_size_graph) * 8 / sum(num_bp):.3f}\")\n",
1407+
"print(f\"mtg (single succ.), anno only, bits/bp: {sum(mtg_size_anno) * 8 / sum(num_bp):.3f}\")"
1408+
]
1409+
},
12521410
{
12531411
"cell_type": "code",
12541412
"execution_count": 25,
@@ -1468,12 +1626,14 @@
14681626
"\n",
14691627
"violin_data = [\n",
14701628
" 8 * spring_size / num_bp,\n",
1629+
" 8 * mtg2_size / num_bp,\n",
14711630
" 8 * mtg_size / num_bp,\n",
14721631
" 8 * gzip_size / num_bp,\n",
14731632
" 8 * blast_size / num_bp,\n",
14741633
"]\n",
14751634
"labels = ['Spring (*)',\n",
1476-
" 'Counting DBG\\nwith coordinates',\n",
1635+
" 'Counting DBG with coord.\\n(multiple successors)',\n",
1636+
" 'Counting DBG with coord.\\n(single successor)',\n",
14771637
" 'gzip -9 (*)',\n",
14781638
" 'BLAST',]\n",
14791639
"violin_weights = [num_bp] * len(labels)\n",
@@ -1623,157 +1783,805 @@
16231783
},
16241784
{
16251785
"cell_type": "markdown",
1626-
"id": "03209b5d",
1786+
"id": "feb55e84",
16271787
"metadata": {},
16281788
"source": [
1629-
"## Illumina RNA-Seq reads"
1789+
"#### Query time"
16301790
]
16311791
},
16321792
{
16331793
"cell_type": "code",
1634-
"execution_count": 30,
1635-
"id": "3d872be3",
1794+
"execution_count": 47,
1795+
"id": "98242b99",
16361796
"metadata": {
16371797
"ExecuteTime": {
1638-
"end_time": "2021-11-09T14:14:37.136911Z",
1639-
"start_time": "2021-11-09T14:14:37.108300Z"
1798+
"end_time": "2022-04-22T13:36:54.737797Z",
1799+
"start_time": "2022-04-22T13:36:54.714619Z"
16401800
}
16411801
},
16421802
"outputs": [],
16431803
"source": [
1644-
"with open('kingsford_gzip_spring.pkl', 'rb') as f:\n",
1645-
" gzip, read_length, num_bp, gzip_size, fasta_size, spring_size = pickle.load(f)\n",
1646-
"\n",
1647-
"# gzip = !ls ~/metagenome/data/kingsford/compressed/*_no_header.fasta.gz\n",
1648-
"\n",
1649-
"# num_bp = []\n",
1650-
"# read_length = []\n",
1651-
"# spring_size = []\n",
1652-
"# gzip_size = []\n",
1653-
"# fasta_size = []\n",
1654-
"\n",
1655-
"# for x in tqdm(gzip, mininterval=1):\n",
1656-
"# gzip_size.append(get_size(x))\n",
1657-
"# fasta_size.append(get_size(x[:-len('.gz')]))\n",
1658-
"# ID = x[:-len('_no_header.fasta.gz')]\n",
1659-
"# spring_fname = f'{ID}_no_header.spring'\n",
1660-
"# spring_size.append(get_size(spring_fname))\n",
1661-
"# SRA = ID.split('/')[-1]\n",
1662-
"# fname = x[:-len('_no_header.fasta.gz')] + '.num_bp'\n",
1663-
"# with open(fname, 'r') as f:\n",
1664-
"# n = int(f.readline())\n",
1665-
"# num_bp.append(n)\n",
1666-
"\n",
1667-
"# with gz.open(x, 'r') as f:\n",
1668-
"# f.readline().strip().decode()\n",
1669-
"# seq = f.readline().strip().decode()\n",
1670-
"# L = len(seq)\n",
1671-
"# all_same_len = True\n",
1672-
"# for i in range(1000): # check 1000 first reads\n",
1673-
"# f.readline().strip().decode()\n",
1674-
"# if L != len(f.readline().strip().decode()):\n",
1675-
"# all_same_len = False\n",
1676-
"# break\n",
1677-
"# read_length.append(L if all_same_len and num_bp[-1] % L == 0 else np.nan)\n",
1678-
"\n",
1679-
"# gzip = np.array(gzip)\n",
1680-
"# num_bp = np.array(num_bp)\n",
1681-
"# spring_size = np.array(spring_size)\n",
1682-
"# gzip_size = np.array(gzip_size)\n",
1683-
"# fasta_size = np.array(fasta_size)\n",
1684-
"# read_length = np.array(read_length)\n",
1685-
"\n",
1686-
"# with open('kingsford_gzip_spring.pkl', 'wb') as f:\n",
1687-
"# pickle.dump((gzip, read_length, num_bp, gzip_size, fasta_size, spring_size), f)"
1804+
"import pandas as pd"
16881805
]
16891806
},
16901807
{
16911808
"cell_type": "code",
1692-
"execution_count": 31,
1693-
"id": "4ffcdd65",
1809+
"execution_count": 200,
1810+
"id": "1d6fd607",
16941811
"metadata": {
16951812
"ExecuteTime": {
1696-
"end_time": "2021-11-09T14:14:37.162442Z",
1697-
"start_time": "2021-11-09T14:14:37.140570Z"
1813+
"end_time": "2022-04-26T13:10:41.760762Z",
1814+
"start_time": "2022-04-26T13:09:19.866872Z"
16981815
},
16991816
"scrolled": false
17001817
},
17011818
"outputs": [
17021819
{
1703-
"data": {
1704-
"text/plain": [
1705-
"(2652, 7.973392066664)"
1706-
]
1707-
},
1708-
"execution_count": 31,
1709-
"metadata": {},
1710-
"output_type": "execute_result"
1820+
"name": "stdout",
1821+
"output_type": "stream",
1822+
"text": [
1823+
"0.09, 0.51\n",
1824+
"0.08, 2.06\n",
1825+
"0.08, 0.11\n",
1826+
"0.15, 0.41\n"
1827+
]
17111828
}
17121829
],
17131830
"source": [
1714-
"len(num_bp), sum(num_bp) / 1e12"
1831+
"def show_time(times_file):\n",
1832+
" df = pd.read_csv(times_file, header=None, sep=' ', names=['id', 'time'], index_col=0)\n",
1833+
" for i in range(df.shape[0]):\n",
1834+
" try:\n",
1835+
" df.iloc[i].time = to_seconds(df.iloc[i].time)\n",
1836+
" except:\n",
1837+
" print(df.iloc[i])\n",
1838+
" df.iloc[i].time = 0\n",
1839+
" print(f'{df.time.mean():.2f}, {df.time.std():.2f}')\n",
1840+
"\n",
1841+
"#print(show_time('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/mtg/delta_variants_query.times'))\n",
1842+
"show_time('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/megablast/delta_variants_query.times')\n",
1843+
"show_time('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/pufferfish_sparse/delta_variants_align.times')\n",
1844+
"show_time('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/blast/delta_variants_query.times')\n",
1845+
"show_time('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/mtg/delta_variants_align.times')"
17151846
]
17161847
},
17171848
{
17181849
"cell_type": "code",
1719-
"execution_count": 32,
1720-
"id": "cae57556",
1850+
"execution_count": 202,
1851+
"id": "31c0f6a7",
1852+
"metadata": {
1853+
"ExecuteTime": {
1854+
"end_time": "2022-04-26T13:16:29.246970Z",
1855+
"start_time": "2022-04-26T13:15:07.360418Z"
1856+
}
1857+
},
1858+
"outputs": [
1859+
{
1860+
"name": "stdout",
1861+
"output_type": "stream",
1862+
"text": [
1863+
"0.09, 0.49\n",
1864+
"0.06, 1.06\n",
1865+
"0.08, 0.10\n",
1866+
"0.15, 0.40\n"
1867+
]
1868+
}
1869+
],
1870+
"source": [
1871+
"def show_time(times_file):\n",
1872+
" df = pd.read_csv(times_file, header=None, sep=' ', names=['id', 'time'], index_col=0)\n",
1873+
" df = df.loc[np.array([x.split('/')[-1][:-len('_no_header.fasta.gz')] for x in gzip])]\n",
1874+
" for i in range(df.shape[0]):\n",
1875+
" try:\n",
1876+
" df.iloc[i].time = to_seconds(df.iloc[i].time)\n",
1877+
" except:\n",
1878+
" print(df.iloc[i])\n",
1879+
" df.iloc[i].time = 0\n",
1880+
" print(f'{df.time.mean():.2f}, {df.time.std():.2f}')\n",
1881+
"\n",
1882+
"#show_time('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/mtg/delta_variants_query.times')\n",
1883+
"show_time('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/megablast/delta_variants_query.times')\n",
1884+
"show_time('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/pufferfish_sparse/delta_variants_align.times')\n",
1885+
"show_time('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/blast/delta_variants_query.times')\n",
1886+
"show_time('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/mtg/delta_variants_align.times')"
1887+
]
1888+
},
1889+
{
1890+
"cell_type": "code",
1891+
"execution_count": 61,
1892+
"id": "dc2bfb02",
17211893
"metadata": {
17221894
"ExecuteTime": {
1723-
"end_time": "2021-11-09T14:14:37.183886Z",
1724-
"start_time": "2021-11-09T14:14:37.166181Z"
1895+
"end_time": "2022-04-22T14:33:47.588583Z",
1896+
"start_time": "2022-04-22T14:33:47.431837Z"
17251897
}
17261898
},
17271899
"outputs": [],
17281900
"source": [
1729-
"with open('kingsford_mtg_size.pkl', 'rb') as f:\n",
1730-
" mtg_size = pickle.load(f)\n",
1731-
"\n",
1732-
"# mtg_size = []\n",
1733-
"\n",
1734-
"# DIR = !echo $HOME\n",
1735-
"# DIR = f'{DIR[0]}/metagenome/data/kingsford_31_coordinates/'\n",
1736-
"\n",
1737-
"# for x in tqdm(gzip, mininterval=1):\n",
1738-
"# ID = x.split('/')[-1][:-len('_no_header.fasta.gz')]\n",
1739-
"# graph_size = get_size(f'{DIR}/{ID}.fasta.gz/graph_small.dbg')\n",
1740-
"# graph_size += get_size(f'{DIR}/{ID}.fasta.gz/graph.dbg.rd_succ')\n",
1741-
"# graph_size += get_size(f'{DIR}/{ID}.fasta.gz/graph.dbg.anchors')\n",
1742-
"# column_fname = f'{DIR}/{ID}.fasta.gz/rd_columns/annotation.column.annodbg'\n",
1743-
"# coords_fname = f'{DIR}/{ID}.fasta.gz/rd_columns/annotation.column.annodbg.coords'\n",
1744-
"# mtg_size.append(graph_size + get_size(column_fname) + get_size(coords_fname))\n",
1745-
"\n",
1746-
"# mtg_size = np.array(mtg_size)\n",
1747-
"\n",
1748-
"# with open('kingsford_mtg_size.pkl', 'wb') as f:\n",
1749-
"# pickle.dump(mtg_size, f)"
1901+
"df = pd.read_csv('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/mtg/delta_variants_align_chain_optim.times', header=None, sep=' ', names=['id', 'time'], index_col=0)"
17501902
]
17511903
},
17521904
{
17531905
"cell_type": "code",
1754-
"execution_count": 33,
1755-
"id": "09188d87",
1906+
"execution_count": 66,
1907+
"id": "92903856",
17561908
"metadata": {
17571909
"ExecuteTime": {
1758-
"end_time": "2021-11-09T14:14:37.201542Z",
1759-
"start_time": "2021-11-09T14:14:37.185480Z"
1760-
},
1761-
"scrolled": true
1910+
"end_time": "2022-04-22T14:34:52.219848Z",
1911+
"start_time": "2022-04-22T14:34:32.462326Z"
1912+
}
17621913
},
17631914
"outputs": [],
17641915
"source": [
1765-
"with open('kingsford_mtg_2_size.pkl', 'rb') as f:\n",
1766-
" mtg2_size = pickle.load(f)\n",
1767-
"\n",
1768-
"# mtg2_size = []\n",
1769-
"\n",
1770-
"# DIR = !echo $HOME\n",
1771-
"# DIR = f'{DIR[0]}/metagenome/data/kingsford_31_coordinates_fork_opt/'\n",
1772-
"\n",
1773-
"# for x in tqdm(gzip, mininterval=1):\n",
1774-
"# ID = x.split('/')[-1][:-len('_no_header.fasta.gz')]\n",
1775-
"# graph_size = get_size(f'{DIR}../kingsford_31_coordinates/{ID}.fasta.gz/graph_small.dbg')\n",
1776-
"# graph_size += get_size(f'{DIR}/{ID}.fasta.gz/graph.dbg.rd_succ')\n",
1916+
"for i in range(df.shape[0]):\n",
1917+
" df.iloc[i].time = to_seconds(df.iloc[i].time)"
1918+
]
1919+
},
1920+
{
1921+
"cell_type": "code",
1922+
"execution_count": 69,
1923+
"id": "a691b9a5",
1924+
"metadata": {
1925+
"ExecuteTime": {
1926+
"end_time": "2022-04-22T14:35:05.536232Z",
1927+
"start_time": "2022-04-22T14:35:05.499065Z"
1928+
}
1929+
},
1930+
"outputs": [
1931+
{
1932+
"data": {
1933+
"text/html": [
1934+
"<div>\n",
1935+
"<style scoped>\n",
1936+
" .dataframe tbody tr th:only-of-type {\n",
1937+
" vertical-align: middle;\n",
1938+
" }\n",
1939+
"\n",
1940+
" .dataframe tbody tr th {\n",
1941+
" vertical-align: top;\n",
1942+
" }\n",
1943+
"\n",
1944+
" .dataframe thead th {\n",
1945+
" text-align: right;\n",
1946+
" }\n",
1947+
"</style>\n",
1948+
"<table border=\"1\" class=\"dataframe\">\n",
1949+
" <thead>\n",
1950+
" <tr style=\"text-align: right;\">\n",
1951+
" <th></th>\n",
1952+
" <th>time</th>\n",
1953+
" </tr>\n",
1954+
" <tr>\n",
1955+
" <th>id</th>\n",
1956+
" <th></th>\n",
1957+
" </tr>\n",
1958+
" </thead>\n",
1959+
" <tbody>\n",
1960+
" <tr>\n",
1961+
" <th>SRR13144527</th>\n",
1962+
" <td>2133.2</td>\n",
1963+
" </tr>\n",
1964+
" </tbody>\n",
1965+
"</table>\n",
1966+
"</div>"
1967+
],
1968+
"text/plain": [
1969+
" time\n",
1970+
"id \n",
1971+
"SRR13144527 2133.2"
1972+
]
1973+
},
1974+
"execution_count": 69,
1975+
"metadata": {},
1976+
"output_type": "execute_result"
1977+
}
1978+
],
1979+
"source": [
1980+
"df[df.time == 2133.2]"
1981+
]
1982+
},
1983+
{
1984+
"cell_type": "markdown",
1985+
"id": "0720b2d6",
1986+
"metadata": {},
1987+
"source": [
1988+
"### Align to RefSeq"
1989+
]
1990+
},
1991+
{
1992+
"cell_type": "markdown",
1993+
"id": "06c11e73",
1994+
"metadata": {},
1995+
"source": [
1996+
"#### Fungi"
1997+
]
1998+
},
1999+
{
2000+
"cell_type": "markdown",
2001+
"id": "2da4c496",
2002+
"metadata": {},
2003+
"source": [
2004+
"BLAST"
2005+
]
2006+
},
2007+
{
2008+
"cell_type": "code",
2009+
"execution_count": 122,
2010+
"id": "94d6597f",
2011+
"metadata": {
2012+
"ExecuteTime": {
2013+
"end_time": "2022-04-24T16:03:53.380371Z",
2014+
"start_time": "2022-04-24T16:03:53.286701Z"
2015+
}
2016+
},
2017+
"outputs": [
2018+
{
2019+
"name": "stdout",
2020+
"output_type": "stream",
2021+
"text": [
2022+
"Runs: 100\n",
2023+
"2.07 sec\n",
2024+
"2.175 GB\n"
2025+
]
2026+
}
2027+
],
2028+
"source": [
2029+
"FILE = '/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_try2/SRR10002688_1.shuf.head100.split.fungi.blast.log'\n",
2030+
"\n",
2031+
"lines = !grep 'wall' $FILE\n",
2032+
"\n",
2033+
"print('Runs:', len(lines))\n",
2034+
"print('{:.2f} sec'.format(np.mean([to_seconds(x.split(' ')[-1]) for x in lines])))\n",
2035+
"\n",
2036+
"lines = !grep 'Maximum resident set size' $FILE\n",
2037+
"\n",
2038+
"print('{:.3f} GB'.format(np.mean([float(x.split(' ')[-1]) / 1e6 for x in lines])))"
2039+
]
2040+
},
2041+
{
2042+
"cell_type": "markdown",
2043+
"id": "24ea90f0",
2044+
"metadata": {
2045+
"ExecuteTime": {
2046+
"end_time": "2022-04-24T16:03:29.832878Z",
2047+
"start_time": "2022-04-24T16:03:29.403798Z"
2048+
}
2049+
},
2050+
"source": [
2051+
"MegaBLAST"
2052+
]
2053+
},
2054+
{
2055+
"cell_type": "code",
2056+
"execution_count": 123,
2057+
"id": "2a859bd3",
2058+
"metadata": {
2059+
"ExecuteTime": {
2060+
"end_time": "2022-04-24T16:03:58.523237Z",
2061+
"start_time": "2022-04-24T16:03:58.441588Z"
2062+
}
2063+
},
2064+
"outputs": [
2065+
{
2066+
"name": "stdout",
2067+
"output_type": "stream",
2068+
"text": [
2069+
"Runs: 100\n",
2070+
"4.31 sec\n",
2071+
"0.034 GB\n"
2072+
]
2073+
}
2074+
],
2075+
"source": [
2076+
"FILE = '/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_try2/SRR10002688_1.shuf.head100.split.fungi.megablast.log'\n",
2077+
"\n",
2078+
"lines = !grep 'wall' $FILE\n",
2079+
"\n",
2080+
"print('Runs:', len(lines))\n",
2081+
"print('{:.2f} sec'.format(np.mean([to_seconds(x.split(' ')[-1]) for x in lines])))\n",
2082+
"\n",
2083+
"lines = !grep 'Maximum resident set size' $FILE\n",
2084+
"\n",
2085+
"print('{:.3f} GB'.format(np.mean([float(x.split(' ')[-1]) / 1e6 for x in lines])))"
2086+
]
2087+
},
2088+
{
2089+
"cell_type": "markdown",
2090+
"id": "65da4202",
2091+
"metadata": {
2092+
"ExecuteTime": {
2093+
"end_time": "2022-04-24T16:05:28.998442Z",
2094+
"start_time": "2022-04-24T16:05:28.513571Z"
2095+
}
2096+
},
2097+
"source": [
2098+
"MetaGraph"
2099+
]
2100+
},
2101+
{
2102+
"cell_type": "code",
2103+
"execution_count": 226,
2104+
"id": "2b8e817e",
2105+
"metadata": {
2106+
"ExecuteTime": {
2107+
"end_time": "2022-04-27T08:36:59.711362Z",
2108+
"start_time": "2022-04-27T08:36:59.617953Z"
2109+
}
2110+
},
2111+
"outputs": [
2112+
{
2113+
"name": "stdout",
2114+
"output_type": "stream",
2115+
"text": [
2116+
"Runs: 100\n",
2117+
"0.021 sec\n",
2118+
"3.4 GB\n"
2119+
]
2120+
}
2121+
],
2122+
"source": [
2123+
"FILE = '/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_mtg_final/SRR10002688_1.shuf.head100.split.fungi.align.log'\n",
2124+
"\n",
2125+
"lines = !grep 'processed in' $FILE\n",
2126+
"\n",
2127+
"print('Runs:', len(lines))\n",
2128+
"print('{:.3f} sec'.format(np.mean([float(x.split(' ')[7]) for x in lines])))\n",
2129+
"\n",
2130+
"lines = !grep 'Maximum resident set size' $FILE\n",
2131+
"print('{:.1f} GB'.format(np.mean([float(x.split(' ')[-1]) / 1e6 for x in lines])))"
2132+
]
2133+
},
2134+
{
2135+
"cell_type": "code",
2136+
"execution_count": 228,
2137+
"id": "13a32580",
2138+
"metadata": {
2139+
"ExecuteTime": {
2140+
"end_time": "2022-04-27T08:37:45.259072Z",
2141+
"start_time": "2022-04-27T08:37:44.280431Z"
2142+
}
2143+
},
2144+
"outputs": [
2145+
{
2146+
"name": "stdout",
2147+
"output_type": "stream",
2148+
"text": [
2149+
"Runs: 1\n",
2150+
"14.76 sec\n",
2151+
"3.5 GB\n"
2152+
]
2153+
}
2154+
],
2155+
"source": [
2156+
"FILE = '/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_mtg_final/SRR10002688_1.shuf.head1000.fa.fungi.align.log'\n",
2157+
"\n",
2158+
"lines = !grep 'processed in' $FILE\n",
2159+
"\n",
2160+
"print('Runs:', len(lines))\n",
2161+
"print('{:.2f} sec'.format(np.mean([float(x.split(' ')[7]) for x in lines])))\n",
2162+
"\n",
2163+
"lines = !grep 'Maximum resident set size' $FILE\n",
2164+
"print('{:.1f} GB'.format(np.mean([float(x.split(' ')[-1]) / 1e6 for x in lines])))"
2165+
]
2166+
},
2167+
{
2168+
"cell_type": "markdown",
2169+
"id": "8674e8ec",
2170+
"metadata": {},
2171+
"source": [
2172+
"#### All"
2173+
]
2174+
},
2175+
{
2176+
"cell_type": "markdown",
2177+
"id": "81ef66ec",
2178+
"metadata": {},
2179+
"source": [
2180+
"BLAST"
2181+
]
2182+
},
2183+
{
2184+
"cell_type": "code",
2185+
"execution_count": 198,
2186+
"id": "26f9f305",
2187+
"metadata": {
2188+
"ExecuteTime": {
2189+
"end_time": "2022-04-27T07:37:48.430567Z",
2190+
"start_time": "2022-04-27T07:37:48.337182Z"
2191+
},
2192+
"scrolled": true
2193+
},
2194+
"outputs": [
2195+
{
2196+
"name": "stdout",
2197+
"output_type": "stream",
2198+
"text": [
2199+
"Runs: 100\n",
2200+
"353 sec\n",
2201+
"417 GB\n"
2202+
]
2203+
}
2204+
],
2205+
"source": [
2206+
"FILES = '/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_try3/nobackup/SRR10002688_1.shuf.head1000.split.SRR10002688_1.shuf.head1000.*.fa.blast.log'\n",
2207+
"\n",
2208+
"lines = !grep 'wall' $FILES\n",
2209+
"\n",
2210+
"print('Runs:', len(lines))\n",
2211+
"print('{:.0f} sec'.format(np.mean([to_seconds(x.split(' ')[-1]) for x in lines])))\n",
2212+
"\n",
2213+
"lines = !grep 'Maximum resident set size' $FILES\n",
2214+
"\n",
2215+
"print('{:.0f} GB'.format(np.mean([float(x.split(' ')[-1]) / 1e6 for x in lines])))"
2216+
]
2217+
},
2218+
{
2219+
"cell_type": "code",
2220+
"execution_count": 196,
2221+
"id": "6da9777f",
2222+
"metadata": {
2223+
"ExecuteTime": {
2224+
"end_time": "2022-04-26T08:58:01.674094Z",
2225+
"start_time": "2022-04-26T08:58:01.592426Z"
2226+
}
2227+
},
2228+
"outputs": [
2229+
{
2230+
"name": "stdout",
2231+
"output_type": "stream",
2232+
"text": [
2233+
"Runs: 1\n",
2234+
"1857 sec\n",
2235+
"428 GB\n"
2236+
]
2237+
}
2238+
],
2239+
"source": [
2240+
"FILE = '/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_try2/SRR10002688_1.shuf.head1000.fa.refseq.blast.log'\n",
2241+
"\n",
2242+
"lines = !grep 'wall' $FILE\n",
2243+
"\n",
2244+
"print('Runs:', len(lines))\n",
2245+
"print('{:.0f} sec'.format(np.mean([to_seconds(x.split(' ')[-1]) for x in lines])))\n",
2246+
"\n",
2247+
"lines = !grep 'Maximum resident set size' $FILE\n",
2248+
"\n",
2249+
"print('{:.0f} GB'.format(np.mean([float(x.split(' ')[-1]) / 1e6 for x in lines])))"
2250+
]
2251+
},
2252+
{
2253+
"cell_type": "markdown",
2254+
"id": "2751f9a0",
2255+
"metadata": {
2256+
"ExecuteTime": {
2257+
"end_time": "2022-04-24T16:03:29.832878Z",
2258+
"start_time": "2022-04-24T16:03:29.403798Z"
2259+
}
2260+
},
2261+
"source": [
2262+
"MegaBLAST"
2263+
]
2264+
},
2265+
{
2266+
"cell_type": "code",
2267+
"execution_count": 205,
2268+
"id": "72680fa3",
2269+
"metadata": {
2270+
"ExecuteTime": {
2271+
"end_time": "2022-04-27T07:29:01.194861Z",
2272+
"start_time": "2022-04-27T07:28:50.008327Z"
2273+
},
2274+
"scrolled": true
2275+
},
2276+
"outputs": [
2277+
{
2278+
"name": "stdout",
2279+
"output_type": "stream",
2280+
"text": [
2281+
"Runs: 100\n",
2282+
"Runs: 100\n",
2283+
"Runs: 100\n",
2284+
"Runs: 100\n",
2285+
"Runs: 100\n",
2286+
"12.5 sec\n",
2287+
"0.090 GB\n"
2288+
]
2289+
}
2290+
],
2291+
"source": [
2292+
"times = []\n",
2293+
"rams = []\n",
2294+
"\n",
2295+
"for i in [0, 1, 2, 3, 4]:\n",
2296+
" prefix = 'fatnode_try3' if i >= 2 else 'fatnode_try4'\n",
2297+
" FILES = f'/cluster/work/grlab/projects/metagenome/data/refseq/queries/{prefix}/nobackup/refseq_{i}.SRR10002688_1.shuf.head1000.split.SRR10002688_1.shuf.head1000.*.fa.megablast.log'\n",
2298+
"\n",
2299+
" lines = !grep 'wall' $FILES\n",
2300+
" print('Runs:', len(lines))\n",
2301+
"\n",
2302+
" times.append(np.mean([to_seconds(x.split(' ')[-1]) for x in lines]))\n",
2303+
"\n",
2304+
" lines = !grep 'Maximum resident set size' $FILES\n",
2305+
" rams.append(np.mean([float(x.split(' ')[-1]) / 1e6 for x in lines]))\n",
2306+
"\n",
2307+
"print('{:.1f} sec'.format(np.sum(times)))\n",
2308+
"print('{:.3f} GB'.format(np.max(rams)))"
2309+
]
2310+
},
2311+
{
2312+
"cell_type": "code",
2313+
"execution_count": 216,
2314+
"id": "4e300d36",
2315+
"metadata": {
2316+
"ExecuteTime": {
2317+
"end_time": "2022-04-27T07:37:48.430567Z",
2318+
"start_time": "2022-04-27T07:37:48.337182Z"
2319+
},
2320+
"scrolled": true
2321+
},
2322+
"outputs": [
2323+
{
2324+
"name": "stdout",
2325+
"output_type": "stream",
2326+
"text": [
2327+
"/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_try3/nobackup/refseq_2.SRR10002688_1.shuf.head1000.SRR10002688_1.shuf.head1000.fa.megablast.log:\tElapsed (wall clock) time (h:mm:ss or m:ss): 2:52.64\n",
2328+
"/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_try3/nobackup/refseq_3.SRR10002688_1.shuf.head1000.SRR10002688_1.shuf.head1000.fa.megablast.log:\tElapsed (wall clock) time (h:mm:ss or m:ss): 6:21.01\n",
2329+
"/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_try3/nobackup/refseq_4.SRR10002688_1.shuf.head1000.SRR10002688_1.shuf.head1000.fa.megablast.log:\tElapsed (wall clock) time (h:mm:ss or m:ss): 3:15.47\n",
2330+
"/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_try4/nobackup/refseq_0.SRR10002688_1.shuf.head1000.SRR10002688_1.shuf.head1000.fa.megablast.log:\tElapsed (wall clock) time (h:mm:ss or m:ss): 6:10.30\n",
2331+
"/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_try4/nobackup/refseq_1.SRR10002688_1.shuf.head1000.SRR10002688_1.shuf.head1000.fa.megablast.log:\tElapsed (wall clock) time (h:mm:ss or m:ss): 7:02.20\n",
2332+
"Runs: 5\n",
2333+
"1542 sec\n",
2334+
"22.0 GB\n"
2335+
]
2336+
}
2337+
],
2338+
"source": [
2339+
"FILES = '/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_try*/nobackup/refseq_*.SRR10002688_1.shuf.head1000.SRR10002688_1.shuf.head1000.fa.megablast.log'\n",
2340+
"\n",
2341+
"lines = !grep 'wall' $FILES\n",
2342+
"for x in lines:\n",
2343+
" print(x)\n",
2344+
"\n",
2345+
"print('Runs:', len(lines))\n",
2346+
"print('{:.0f} sec'.format(np.sum([to_seconds(x.split(' ')[-1]) for x in lines])))\n",
2347+
"\n",
2348+
"lines = !grep 'Maximum resident set size' $FILES\n",
2349+
"\n",
2350+
"print('{:.1f} GB'.format(np.max([float(x.split(' ')[-1]) / 1e6 for x in lines])))"
2351+
]
2352+
},
2353+
{
2354+
"cell_type": "markdown",
2355+
"id": "26b7b7b3",
2356+
"metadata": {
2357+
"ExecuteTime": {
2358+
"end_time": "2022-04-24T16:05:28.998442Z",
2359+
"start_time": "2022-04-24T16:05:28.513571Z"
2360+
}
2361+
},
2362+
"source": [
2363+
"MetaGraph"
2364+
]
2365+
},
2366+
{
2367+
"cell_type": "code",
2368+
"execution_count": 220,
2369+
"id": "671888f0",
2370+
"metadata": {
2371+
"ExecuteTime": {
2372+
"end_time": "2022-04-27T08:34:15.332054Z",
2373+
"start_time": "2022-04-27T08:34:15.250214Z"
2374+
}
2375+
},
2376+
"outputs": [
2377+
{
2378+
"name": "stdout",
2379+
"output_type": "stream",
2380+
"text": [
2381+
"Runs: 100\n",
2382+
"0.66 sec\n",
2383+
"500 GB\n"
2384+
]
2385+
}
2386+
],
2387+
"source": [
2388+
"FILE = '/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_mtg_final/SRR10002688_1.shuf.head100.split.refseq.align.log'\n",
2389+
"\n",
2390+
"lines = !grep 'processed in' $FILE\n",
2391+
"\n",
2392+
"print('Runs:', len(lines))\n",
2393+
"print('{:.2f} sec'.format(np.mean([float(x.split(' ')[7]) for x in lines])))\n",
2394+
"\n",
2395+
"lines = !grep 'Maximum resident set size' $FILE\n",
2396+
"print('{:.0f} GB'.format(np.mean([float(x.split(' ')[-1]) / 1e6 for x in lines])))"
2397+
]
2398+
},
2399+
{
2400+
"cell_type": "code",
2401+
"execution_count": 218,
2402+
"id": "56c3071f",
2403+
"metadata": {
2404+
"ExecuteTime": {
2405+
"end_time": "2022-04-27T08:33:30.383423Z",
2406+
"start_time": "2022-04-27T08:33:30.249041Z"
2407+
}
2408+
},
2409+
"outputs": [
2410+
{
2411+
"name": "stdout",
2412+
"output_type": "stream",
2413+
"text": [
2414+
"Runs: 1\n",
2415+
"575 sec\n",
2416+
"513 GB\n"
2417+
]
2418+
}
2419+
],
2420+
"source": [
2421+
"FILE = '/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_mtg_final/SRR10002688_1.shuf.head1000.fa.refseq.align.log'\n",
2422+
"\n",
2423+
"lines = !grep 'processed in' $FILE\n",
2424+
"\n",
2425+
"print('Runs:', len(lines))\n",
2426+
"print('{:.0f} sec'.format(np.mean([float(x.split(' ')[7]) for x in lines])))\n",
2427+
"\n",
2428+
"lines = !grep 'Maximum resident set size' $FILE\n",
2429+
"print('{:.0f} GB'.format(np.mean([float(x.split(' ')[-1]) / 1e6 for x in lines])))"
2430+
]
2431+
},
2432+
{
2433+
"cell_type": "markdown",
2434+
"id": "03209b5d",
2435+
"metadata": {},
2436+
"source": [
2437+
"## Illumina RNA-Seq reads"
2438+
]
2439+
},
2440+
{
2441+
"cell_type": "code",
2442+
"execution_count": 22,
2443+
"id": "3d872be3",
2444+
"metadata": {
2445+
"ExecuteTime": {
2446+
"end_time": "2022-01-17T19:59:44.156468Z",
2447+
"start_time": "2022-01-17T19:59:43.671793Z"
2448+
}
2449+
},
2450+
"outputs": [],
2451+
"source": [
2452+
"with open('kingsford_gzip_spring.pkl', 'rb') as f:\n",
2453+
" gzip, read_length, num_bp, gzip_size, fasta_size, spring_size = pickle.load(f)\n",
2454+
"\n",
2455+
"# gzip = !ls ~/metagenome/data/kingsford/compressed/*_no_header.fasta.gz\n",
2456+
"\n",
2457+
"# num_bp = []\n",
2458+
"# read_length = []\n",
2459+
"# spring_size = []\n",
2460+
"# gzip_size = []\n",
2461+
"# fasta_size = []\n",
2462+
"\n",
2463+
"# for x in tqdm(gzip, mininterval=1):\n",
2464+
"# gzip_size.append(get_size(x))\n",
2465+
"# fasta_size.append(get_size(x[:-len('.gz')]))\n",
2466+
"# ID = x[:-len('_no_header.fasta.gz')]\n",
2467+
"# spring_fname = f'{ID}_no_header.spring'\n",
2468+
"# spring_size.append(get_size(spring_fname))\n",
2469+
"# SRA = ID.split('/')[-1]\n",
2470+
"# fname = x[:-len('_no_header.fasta.gz')] + '.num_bp'\n",
2471+
"# with open(fname, 'r') as f:\n",
2472+
"# n = int(f.readline())\n",
2473+
"# num_bp.append(n)\n",
2474+
"\n",
2475+
"# with gz.open(x, 'r') as f:\n",
2476+
"# f.readline().strip().decode()\n",
2477+
"# seq = f.readline().strip().decode()\n",
2478+
"# L = len(seq)\n",
2479+
"# all_same_len = True\n",
2480+
"# for i in range(1000): # check 1000 first reads\n",
2481+
"# f.readline().strip().decode()\n",
2482+
"# if L != len(f.readline().strip().decode()):\n",
2483+
"# all_same_len = False\n",
2484+
"# break\n",
2485+
"# read_length.append(L if all_same_len and num_bp[-1] % L == 0 else np.nan)\n",
2486+
"\n",
2487+
"# gzip = np.array(gzip)\n",
2488+
"# num_bp = np.array(num_bp)\n",
2489+
"# spring_size = np.array(spring_size)\n",
2490+
"# gzip_size = np.array(gzip_size)\n",
2491+
"# fasta_size = np.array(fasta_size)\n",
2492+
"# read_length = np.array(read_length)\n",
2493+
"\n",
2494+
"# with open('kingsford_gzip_spring.pkl', 'wb') as f:\n",
2495+
"# pickle.dump((gzip, read_length, num_bp, gzip_size, fasta_size, spring_size), f)"
2496+
]
2497+
},
2498+
{
2499+
"cell_type": "code",
2500+
"execution_count": 23,
2501+
"id": "4ffcdd65",
2502+
"metadata": {
2503+
"ExecuteTime": {
2504+
"end_time": "2022-01-17T19:59:46.692236Z",
2505+
"start_time": "2022-01-17T19:59:46.667700Z"
2506+
},
2507+
"scrolled": false
2508+
},
2509+
"outputs": [
2510+
{
2511+
"data": {
2512+
"text/plain": [
2513+
"(2652, 7.973392066664)"
2514+
]
2515+
},
2516+
"execution_count": 23,
2517+
"metadata": {},
2518+
"output_type": "execute_result"
2519+
}
2520+
],
2521+
"source": [
2522+
"len(num_bp), sum(num_bp) / 1e12"
2523+
]
2524+
},
2525+
{
2526+
"cell_type": "code",
2527+
"execution_count": 24,
2528+
"id": "cae57556",
2529+
"metadata": {
2530+
"ExecuteTime": {
2531+
"end_time": "2022-01-17T19:59:49.473344Z",
2532+
"start_time": "2022-01-17T19:59:49.448238Z"
2533+
}
2534+
},
2535+
"outputs": [],
2536+
"source": [
2537+
"with open('kingsford_mtg_size.pkl', 'rb') as f:\n",
2538+
" mtg_size = pickle.load(f)\n",
2539+
"\n",
2540+
"# mtg_size = []\n",
2541+
"\n",
2542+
"# DIR = !echo $HOME\n",
2543+
"# DIR = f'{DIR[0]}/metagenome/data/kingsford_31_coordinates/'\n",
2544+
"\n",
2545+
"# for x in tqdm(gzip, mininterval=1):\n",
2546+
"# ID = x.split('/')[-1][:-len('_no_header.fasta.gz')]\n",
2547+
"# graph_size = get_size(f'{DIR}/{ID}.fasta.gz/graph_small.dbg')\n",
2548+
"# graph_size += get_size(f'{DIR}/{ID}.fasta.gz/graph.dbg.rd_succ')\n",
2549+
"# graph_size += get_size(f'{DIR}/{ID}.fasta.gz/graph.dbg.anchors')\n",
2550+
"# column_fname = f'{DIR}/{ID}.fasta.gz/rd_columns/annotation.column.annodbg'\n",
2551+
"# coords_fname = f'{DIR}/{ID}.fasta.gz/rd_columns/annotation.column.annodbg.coords'\n",
2552+
"# mtg_size.append(graph_size + get_size(column_fname) + get_size(coords_fname))\n",
2553+
"\n",
2554+
"# mtg_size = np.array(mtg_size)\n",
2555+
"\n",
2556+
"# with open('kingsford_mtg_size.pkl', 'wb') as f:\n",
2557+
"# pickle.dump(mtg_size, f)"
2558+
]
2559+
},
2560+
{
2561+
"cell_type": "code",
2562+
"execution_count": 46,
2563+
"id": "09188d87",
2564+
"metadata": {
2565+
"ExecuteTime": {
2566+
"end_time": "2022-01-17T20:08:08.444781Z",
2567+
"start_time": "2022-01-17T20:08:08.421148Z"
2568+
},
2569+
"scrolled": true
2570+
},
2571+
"outputs": [],
2572+
"source": [
2573+
"with open('kingsford_mtg_2_size.pkl', 'rb') as f:\n",
2574+
" mtg2_size = pickle.load(f)\n",
2575+
"\n",
2576+
"# mtg2_size = []\n",
2577+
"\n",
2578+
"# DIR = !echo $HOME\n",
2579+
"# DIR = f'{DIR[0]}/metagenome/finished_projects/counting_dbg/kingsford_31_coordinates_fork_opt_new'\n",
2580+
"\n",
2581+
"# for x in tqdm(gzip, mininterval=1):\n",
2582+
"# ID = x.split('/')[-1][:-len('_no_header.fasta.gz')]\n",
2583+
"# graph_size = get_size(f'{DIR}/../kingsford_31_coordinates/{ID}.fasta.gz/graph_small.dbg')\n",
2584+
"# graph_size += get_size(f'{DIR}/{ID}.fasta.gz/graph.dbg.rd_succ')\n",
17772585
"# graph_size += get_size(f'{DIR}/{ID}.fasta.gz/graph.dbg.anchors')\n",
17782586
"# column_fname = f'{DIR}/{ID}.fasta.gz/rd_columns/annotation.column.annodbg'\n",
17792587
"# coords_fname = f'{DIR}/{ID}.fasta.gz/rd_columns/annotation.column.annodbg.coords'\n",
@@ -2364,6 +3172,394 @@
23643172
"plt.savefig('size_vs_k.pdf', format='pdf', bbox_inches='tight')\n",
23653173
"plt.show()"
23663174
]
3175+
},
3176+
{
3177+
"cell_type": "markdown",
3178+
"id": "e098b928",
3179+
"metadata": {},
3180+
"source": [
3181+
"## Construction time"
3182+
]
3183+
},
3184+
{
3185+
"cell_type": "code",
3186+
"execution_count": 37,
3187+
"id": "041a2243",
3188+
"metadata": {
3189+
"ExecuteTime": {
3190+
"end_time": "2022-03-07T18:06:34.192082Z",
3191+
"start_time": "2022-03-07T18:06:33.727756Z"
3192+
},
3193+
"scrolled": false
3194+
},
3195+
"outputs": [
3196+
{
3197+
"name": "stdout",
3198+
"output_type": "stream",
3199+
"text": [
3200+
"Binary OLD:\n",
3201+
"KMC counting: 115.96 h, in 2652 runs, each using 1 cores\tNormalized time: 3.221 h \tmem: 41.455688 GB\n",
3202+
"Build contigs: 24.58 h, in 5304 runs, each using 1 cores\tNormalized time: 0.683 h \tmem: 50.69142 GB\n",
3203+
"Build joint graph: 1.11 h, in 4 runs, each using 36 cores\tNormalized time: 1.115 h \tmem: 51.482024 GB\n",
3204+
"Annotate graph: 1.11 h, in 10 runs, each using 18 cores\t\tNormalized time: 0.555 h \tmem: 51.589428 GB\n",
3205+
"Transform anno: 1.34 h, in 4 runs, each using 36 cores\t\tNormalized time: 1.337 h \tmem: 58.220948 GB\n"
3206+
]
3207+
}
3208+
],
3209+
"source": [
3210+
"print('Binary OLD:')\n",
3211+
"\n",
3212+
"cores = 1\n",
3213+
"time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/kmc_count.lsf | cut -d' ' -f8\n",
3214+
"mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/kmc_count.lsf | tr -s ' ' | cut -f6 -d' '\n",
3215+
"print(f'KMC counting: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n",
3216+
"\n",
3217+
"cores = 1\n",
3218+
"time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/build_single.lsf | cut -d' ' -f8\n",
3219+
"mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/build_single.lsf | tr -s ' ' | cut -f6 -d' '\n",
3220+
"print(f'Build contigs: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n",
3221+
"\n",
3222+
"cores = 36\n",
3223+
"time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/build_graph.lsf | cut -d' ' -f8\n",
3224+
"mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/build_graph.lsf | tr -s ' ' | cut -f6 -d' '\n",
3225+
"print(f'Build joint graph: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n",
3226+
"\n",
3227+
"cores = 18\n",
3228+
"time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/annotate_*.lsf | cut -d' ' -f8\n",
3229+
"mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/annotate_*.lsf | tr -s ' ' | cut -f6 -d' '\n",
3230+
"print(f'Annotate graph: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n",
3231+
"\n",
3232+
"cores = 36\n",
3233+
"time = !grep Elapsed ~/metagenome/finished_projects/counting_dbg/kingsford_21/logs/old_rd_* | cut -d' ' -f8\n",
3234+
"mem = !grep 'Maximum resident' ~/metagenome/finished_projects/counting_dbg/kingsford_21/logs/old_rd_* | tr -s ' ' | cut -f6 -d' '\n",
3235+
"print(f'Transform anno: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')"
3236+
]
3237+
},
3238+
{
3239+
"cell_type": "code",
3240+
"execution_count": 38,
3241+
"id": "31ea264e",
3242+
"metadata": {
3243+
"ExecuteTime": {
3244+
"end_time": "2022-03-07T18:06:50.077412Z",
3245+
"start_time": "2022-03-07T18:06:49.740832Z"
3246+
}
3247+
},
3248+
"outputs": [
3249+
{
3250+
"name": "stdout",
3251+
"output_type": "stream",
3252+
"text": [
3253+
"Binary:\n",
3254+
"KMC counting: 115.96 h, in 2652 runs, each using 1 cores\tNormalized time: 3.221 h \tmem: 41.455688 GB\n",
3255+
"Build contigs: 24.58 h, in 5304 runs, each using 1 cores\tNormalized time: 0.683 h \tmem: 50.69142 GB\n",
3256+
"Build joint graph: 1.11 h, in 4 runs, each using 36 cores\tNormalized time: 1.115 h \tmem: 51.482024 GB\n",
3257+
"Annotate graph: 1.11 h, in 10 runs, each using 18 cores\t\tNormalized time: 0.555 h \tmem: 51.589428 GB\n",
3258+
"Transform anno: 1.19 h, in 5 runs, each using 36 cores\t\tNormalized time: 1.191 h \tmem: 59.31488 GB\n"
3259+
]
3260+
}
3261+
],
3262+
"source": [
3263+
"print('Binary:')\n",
3264+
"\n",
3265+
"cores = 1\n",
3266+
"time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/kmc_count.lsf | cut -d' ' -f8\n",
3267+
"mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/kmc_count.lsf | tr -s ' ' | cut -f6 -d' '\n",
3268+
"print(f'KMC counting: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n",
3269+
"\n",
3270+
"cores = 1\n",
3271+
"time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/build_single.lsf | cut -d' ' -f8\n",
3272+
"mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/build_single.lsf | tr -s ' ' | cut -f6 -d' '\n",
3273+
"print(f'Build contigs: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n",
3274+
"\n",
3275+
"cores = 36\n",
3276+
"time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/build_graph.lsf | cut -d' ' -f8\n",
3277+
"mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/build_graph.lsf | tr -s ' ' | cut -f6 -d' '\n",
3278+
"print(f'Build joint graph: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n",
3279+
"\n",
3280+
"cores = 18\n",
3281+
"time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/annotate_*.lsf | cut -d' ' -f8\n",
3282+
"mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/annotate_*.lsf | tr -s ' ' | cut -f6 -d' '\n",
3283+
"print(f'Annotate graph: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n",
3284+
"\n",
3285+
"cores = 36\n",
3286+
"time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/rd_brwt.lsf | cut -d' ' -f8\n",
3287+
"mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/rd_brwt.lsf | tr -s ' ' | cut -f6 -d' '\n",
3288+
"print(f'Transform anno: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')"
3289+
]
3290+
},
3291+
{
3292+
"cell_type": "code",
3293+
"execution_count": 39,
3294+
"id": "609cb85a",
3295+
"metadata": {
3296+
"ExecuteTime": {
3297+
"end_time": "2022-03-07T18:07:12.041814Z",
3298+
"start_time": "2022-03-07T18:07:10.691302Z"
3299+
}
3300+
},
3301+
"outputs": [
3302+
{
3303+
"name": "stdout",
3304+
"output_type": "stream",
3305+
"text": [
3306+
"Smooth counts:\n",
3307+
"KMC counting: 115.96 h, in 2652 runs, each using 1 cores\tNormalized time: 3.221 h \tmem: 41.455688 GB\n",
3308+
"Build contigs: 31.10 h, in 5304 runs, each using 1 cores\tNormalized time: 0.864 h \tmem: 70.273636 GB\n",
3309+
"Build joint graph: 1.11 h, in 4 runs, each using 36 cores\tNormalized time: 1.115 h \tmem: 51.482024 GB\n",
3310+
"Annotate graph: 2.16 h, in 10 runs, each using 18 cores\t\tNormalized time: 1.080 h \tmem: 51.294724 GB\n",
3311+
"Transform anno: 1.26 h, in 5 runs, each using 36 cores\t\tNormalized time: 1.256 h \tmem: 87.648508 GB\n"
3312+
]
3313+
}
3314+
],
3315+
"source": [
3316+
"print('Smooth counts:')\n",
3317+
"\n",
3318+
"cores = 1\n",
3319+
"time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/kmc_count.lsf | cut -d' ' -f8\n",
3320+
"mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/kmc_count.lsf | tr -s ' ' | cut -f6 -d' '\n",
3321+
"print(f'KMC counting: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n",
3322+
"\n",
3323+
"cores = 1\n",
3324+
"time = !grep Elapsed ~/metagenome/data/kingsford_21/smoothing_1000000000/logs/build_single.lsf | cut -d' ' -f8\n",
3325+
"mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/smoothing_1000000000/logs/build_single.lsf | tr -s ' ' | cut -f6 -d' '\n",
3326+
"print(f'Build contigs: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n",
3327+
"\n",
3328+
"cores = 36\n",
3329+
"time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/build_graph.lsf | cut -d' ' -f8\n",
3330+
"mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/build_graph.lsf | tr -s ' ' | cut -f6 -d' '\n",
3331+
"print(f'Build joint graph: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n",
3332+
"\n",
3333+
"cores = 18\n",
3334+
"time = !grep Elapsed ~/metagenome/data/kingsford_21/smoothing_1000000000/logs/annotate_*.lsf | cut -d' ' -f8\n",
3335+
"mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/smoothing_1000000000/logs/annotate_*.lsf | tr -s ' ' | cut -f6 -d' '\n",
3336+
"print(f'Annotate graph: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n",
3337+
"\n",
3338+
"cores = 36\n",
3339+
"time = !grep Elapsed ~/metagenome/data/kingsford_21/smoothing_1000000000/logs/count_rd_brwt.lsf | cut -d' ' -f8\n",
3340+
"mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/smoothing_1000000000/logs/count_rd_brwt.lsf | tr -s ' ' | cut -f6 -d' '\n",
3341+
"print(f'Transform anno: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')"
3342+
]
3343+
},
3344+
{
3345+
"cell_type": "code",
3346+
"execution_count": 40,
3347+
"id": "9d9db1da",
3348+
"metadata": {
3349+
"ExecuteTime": {
3350+
"end_time": "2022-03-07T18:07:40.094671Z",
3351+
"start_time": "2022-03-07T18:07:38.708276Z"
3352+
}
3353+
},
3354+
"outputs": [
3355+
{
3356+
"name": "stdout",
3357+
"output_type": "stream",
3358+
"text": [
3359+
"Raw counts:\n",
3360+
"KMC counting: 115.96 h, in 2652 runs, each using 1 cores\tNormalized time: 3.221 h \tmem: 41.455688 GB\n",
3361+
"Build contigs: 29.86 h, in 5304 runs, each using 1 cores\tNormalized time: 0.829 h \tmem: 70.3226 GB\n",
3362+
"Build joint graph: 1.11 h, in 4 runs, each using 36 cores\tNormalized time: 1.115 h \tmem: 51.482024 GB\n",
3363+
"Annotate graph: 2.36 h, in 10 runs, each using 18 cores\t\tNormalized time: 1.179 h \tmem: 51.318896 GB\n",
3364+
"Transform anno: 1.40 h, in 5 runs, each using 36 cores\t\tNormalized time: 1.403 h \tmem: 87.64346 GB\n"
3365+
]
3366+
}
3367+
],
3368+
"source": [
3369+
"print('Raw counts:')\n",
3370+
"cores = 1  # KMC counting; the normalized time below rescales by 36/cores\n",
3371+
"time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/kmc_count.lsf | cut -d' ' -f8\n",
3372+
"mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/kmc_count.lsf | tr -s ' ' | cut -f6 -d' '\n",
3373+
"wall_s, top_rss = sum(map(to_seconds, time)), np.sort([float(m) for m in mem])[::-1][:36//cores]\n",
3374+
"print(f'KMC counting: {wall_s / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {wall_s / (36/cores) / 3600:0.3f} h \\tmem: {top_rss.sum() / 1e6} GB')\n",
3375+
"cores = 1  # Build contigs\n",
3376+
"time = !grep Elapsed ~/metagenome/data/kingsford_21/smoothing_1/logs/build_single.lsf | cut -d' ' -f8\n",
3377+
"mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/smoothing_1/logs/build_single.lsf | tr -s ' ' | cut -f6 -d' '\n",
3378+
"wall_s, top_rss = sum(map(to_seconds, time)), np.sort([float(m) for m in mem])[::-1][:36//cores]\n",
3379+
"print(f'Build contigs: {wall_s / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {wall_s / (36/cores) / 3600:0.3f} h \\tmem: {top_rss.sum() / 1e6} GB')\n",
3380+
"cores = 36  # Build joint graph\n",
3381+
"time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/build_graph.lsf | cut -d' ' -f8\n",
3382+
"mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/build_graph.lsf | tr -s ' ' | cut -f6 -d' '\n",
3383+
"wall_s, top_rss = sum(map(to_seconds, time)), np.sort([float(m) for m in mem])[::-1][:36//cores]\n",
3384+
"print(f'Build joint graph: {wall_s / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {wall_s / (36/cores) / 3600:0.3f} h \\tmem: {top_rss.sum() / 1e6} GB')\n",
3385+
"cores = 18  # Annotate graph\n",
3386+
"time = !grep Elapsed ~/metagenome/data/kingsford_21/smoothing_1/logs/annotate_*.lsf | cut -d' ' -f8\n",
3387+
"mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/smoothing_1/logs/annotate_*.lsf | tr -s ' ' | cut -f6 -d' '\n",
3388+
"wall_s, top_rss = sum(map(to_seconds, time)), np.sort([float(m) for m in mem])[::-1][:36//cores]\n",
3389+
"print(f'Annotate graph: {wall_s / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t\\tNormalized time: {wall_s / (36/cores) / 3600:0.3f} h \\tmem: {top_rss.sum() / 1e6} GB')\n",
3390+
"cores = 36  # Transform anno\n",
3391+
"time = !grep Elapsed ~/metagenome/data/kingsford_21/smoothing_1/logs/count_rd_brwt.lsf | cut -d' ' -f8\n",
3392+
"mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/smoothing_1/logs/count_rd_brwt.lsf | tr -s ' ' | cut -f6 -d' '\n",
3393+
"wall_s, top_rss = sum(map(to_seconds, time)), np.sort([float(m) for m in mem])[::-1][:36//cores]\n",
3394+
"print(f'Transform anno: {wall_s / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t\\tNormalized time: {wall_s / (36/cores) / 3600:0.3f} h \\tmem: {top_rss.sum() / 1e6} GB')"
3395+
]
3396+
},
3397+
{
3398+
"cell_type": "code",
3399+
"execution_count": 107,
3400+
"id": "c0c15010",
3401+
"metadata": {
3402+
"ExecuteTime": {
3403+
"end_time": "2022-02-01T11:42:34.923564Z",
3404+
"start_time": "2022-02-01T11:41:36.035222Z"
3405+
}
3406+
},
3407+
"outputs": [
3408+
{
3409+
"name": "stdout",
3410+
"output_type": "stream",
3411+
"text": [
3412+
"ntCard estimation (for Mantis): 72.77 h, in 2652 runs, each using 1 cores\tNormalized time: 2.021 h \tmem: 19.002412 GB\n"
3413+
]
3414+
}
3415+
],
3416+
"source": [
3417+
"cores = 1  # ntCard runs; the normalized time below rescales by 36/cores\n",
3418+
"time = !grep Elapsed ~/metagenome/finished_projects/counting_dbg/kingsford_21/mantis/nobackup/logs_reruns/*k21.ntcard.4.log | cut -d' ' -f8\n",
3419+
"mem = !grep 'Maximum resident' ~/metagenome/finished_projects/counting_dbg/kingsford_21/mantis/nobackup/logs_reruns/*k21.ntcard.4.log | tr -s ' ' | cut -f6 -d' '\n",
3420+
"wall_s, top_rss = sum(map(to_seconds, time)), np.sort([float(m) for m in mem])[::-1][:36//cores]\n",
3421+
"print(f'ntCard estimation (for Mantis): {wall_s / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {wall_s / (36/cores) / 3600:0.3f} h \\tmem: {top_rss.sum() / 1e6} GB')"
3422+
]
3423+
},
3424+
{
3425+
"cell_type": "code",
3426+
"execution_count": 109,
3427+
"id": "4963e7de",
3428+
"metadata": {
3429+
"ExecuteTime": {
3430+
"end_time": "2022-02-01T12:11:24.741680Z",
3431+
"start_time": "2022-02-01T12:11:14.463792Z"
3432+
}
3433+
},
3434+
"outputs": [
3435+
{
3436+
"name": "stdout",
3437+
"output_type": "stream",
3438+
"text": [
3439+
"CQF counting (for Mantis): 1274.37 h, in 2652 runs, each using 1 cores\tNormalized time: 35.399 h \tmem: 333.179748 GB\n"
3440+
]
3441+
}
3442+
],
3443+
"source": [
3444+
"cores = 1  # CQF counting runs; the normalized time below rescales by 36/cores\n",
3445+
"time = !grep Elapsed ~/metagenome/finished_projects/counting_dbg/kingsford_21/mantis/nobackup/logs_reruns/*k21.4.stderr.log | cut -d' ' -f8\n",
3446+
"mem = !grep 'Maximum resident' ~/metagenome/finished_projects/counting_dbg/kingsford_21/mantis/nobackup/logs_reruns/*k21.4.stderr.log | tr -s ' ' | cut -f6 -d' '\n",
3447+
"wall_s, top_rss = sum(map(to_seconds, time)), np.sort([float(m) for m in mem])[::-1][:36//cores]\n",
3448+
"print(f'CQF counting (for Mantis): {wall_s / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {wall_s / (36/cores) / 3600:0.3f} h \\tmem: {top_rss.sum() / 1e6} GB')"
3449+
]
3450+
},
3451+
{
3452+
"cell_type": "code",
3453+
"execution_count": 110,
3454+
"id": "bb2dc09c",
3455+
"metadata": {
3456+
"ExecuteTime": {
3457+
"end_time": "2022-02-01T12:11:42.846687Z",
3458+
"start_time": "2022-02-01T12:11:38.921845Z"
3459+
}
3460+
},
3461+
"outputs": [
3462+
{
3463+
"name": "stdout",
3464+
"output_type": "stream",
3465+
"text": [
3466+
"CQF counting (for Mantis) with 1 thread: 695.35 h, in 2652 runs, each using 1 cores\tNormalized time: 19.315 h \tmem: 482.870516 GB\n"
3467+
]
3468+
}
3469+
],
3470+
"source": [
3471+
"cores = 1  # single-thread CQF counting runs; the normalized time below rescales by 36/cores\n",
3472+
"time = !grep Elapsed ~/metagenome/finished_projects/counting_dbg/kingsford_21/mantis/nobackup/logs_reruns/*k21.2.stderr.log | cut -d' ' -f8\n",
3473+
"mem = !grep 'Maximum resident' ~/metagenome/finished_projects/counting_dbg/kingsford_21/mantis/nobackup/logs_reruns/*k21.2.stderr.log | tr -s ' ' | cut -f6 -d' '\n",
3474+
"wall_s, top_rss = sum(map(to_seconds, time)), np.sort([float(m) for m in mem])[::-1][:36//cores]\n",
3475+
"print(f'CQF counting (for Mantis) with 1 thread: {wall_s / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {wall_s / (36/cores) / 3600:0.3f} h \\tmem: {top_rss.sum() / 1e6} GB')"
3476+
]
3477+
},
3478+
{
3479+
"cell_type": "code",
3480+
"execution_count": 41,
3481+
"id": "67dcc514",
3482+
"metadata": {
3483+
"ExecuteTime": {
3484+
"end_time": "2022-03-08T09:13:40.212561Z",
3485+
"start_time": "2022-03-08T09:13:24.120477Z"
3486+
}
3487+
},
3488+
"outputs": [
3489+
{
3490+
"name": "stdout",
3491+
"output_type": "stream",
3492+
"text": [
3493+
"BCALM (for REINDEER): 1462.83 h, in 2652 runs, each using 1 cores\tNormalized time: 40.634 h \tmem: 152.297732 GB\n"
3494+
]
3495+
}
3496+
],
3497+
"source": [
3498+
"cores = 1  # BCALM runs; time and mem below must be grepped from the same log glob\n",
3499+
"time = !grep Elapsed /cluster/work/grlab/projects/metagenome/finished_projects/counting_dbg/kingsford_21/reindeer/nobackup/logs_reruns/*.try4.stderr.log | cut -d' ' -f8\n",
3500+
"mem = !grep 'Maximum resident' /cluster/work/grlab/projects/metagenome/finished_projects/counting_dbg/kingsford_21/reindeer/nobackup/logs_reruns/*.try4.stderr.log | tr -s ' ' | cut -f6 -d' '\n",
3501+
"print(f'BCALM (for REINDEER): {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t'\n",
3502+
" f'Normalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')"
3503+
]
3504+
},
3505+
{
3506+
"cell_type": "code",
3507+
"execution_count": 53,
3508+
"id": "e08d010c",
3509+
"metadata": {
3510+
"ExecuteTime": {
3511+
"end_time": "2022-03-08T09:31:28.410415Z",
3512+
"start_time": "2022-03-08T09:31:27.312138Z"
3513+
}
3514+
},
3515+
"outputs": [
3516+
{
3517+
"data": {
3518+
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXIAAAEUCAYAAAA2ib1OAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAPQUlEQVR4nO3dT1IbWdbG4fd01BwFLqY0ofIKhJh+k5ZHFZ6J9gosdoBrB114B+AVUDBqR41Er8DADqyoHnc3JVZwvkHexOm0/qRkpVIH/Z6ICqRUpvJIBa+uTuZNm7sLABDXX5ouAADwfQhyAAiOIAeA4AhyAAiOIAeA4AhyrIyZ/WlmQzO7MrM7MzubsE4nrXOXfp6WHv88Zbs/zaw1YflnM7uasPzMzFZ+Slap/q9eY5XXX1i3ZWZ36XbbzD4vUMNC6+P5I8ixasfufuzuh1IWqPkDZtaRdJXWOXT3V5KuC4+3JY0k9RfcZ3vCsp6kcdUnSME6KH+wlNbJ6z9J9R9KGpZWm/r6p3H3kbv/VLXW8vrpfcMWI8hRp0t9HbIflIXgOF/g7qPC4yeSzqWn0Ky8HzN7Cv+07c2CtZ5JOp6zTl7/U83uPms/5ddfl2++kWC7EOSoRQrTD5L+UVjcnhN8fXe/VjZKf7PA7srrv1EWopW5+4lmBGJq68yrv7j+pNc/9bkLbZZW3pYp/DzN2zkT1j+TlLd7pn6bwPP2Q9MF4Nm5MjMpG4leu/u99PT1/2HaRoW2ipSF8L8kvauyQ3cfldoLPXd/l+pYlZn1F0x8/QvqSPpbeg2urF3zKoV1R1/eJ6V1eqlNhS3FiByrduzur1IP9+lAZGpH7M7Y7kSp35zCr7VMe2XJtkoVIxXaJGlfwwkHVCe+/kX3VWg/jfTl9cx7D7GlCHLUxt0vlB10zI3MrDdl9b6ko3TGx5Wy0FqmvbJwW6WKFKz3ef3ufp1GweMZ25Rff1VfPWfxmAIwCa0V1CaFXrEd8VZZ6+EwD6c0gh5LGrv7cWHbjpZrr7TcvdI2S/im/lkmvP7amFmLwN9eBDlW7UPqEbfS/aferbvfm9mxsjDMWwSXkl4ona1SWvfBzDqFPvO/zCwPxlE6QFl0XtjvV9J511MD2MzOlUbPZnZU/FCZU/9vpdWmvv4a3Sh7b24nvCfYAsZlbLENzOyckMNzRZBjK5hZu3TOOvBsEOQAEBxnrQBAcAQ5AARHkANAcGs//fDHH3/0g4ODde8WAEK7u7v7r7vvTXps7UF+cHCg29vbde8WAEIzs39Pe4zWCgAER5ADQHAEOQAEt7YgN7PXZnbx+Pi4rl0CwFZYW5C7+0d3H+zs7KxrlwCwFWitAEBwBDkABEeQA0Bw/MMSFR388nsj+/3j158b2S+AOBiRA0BwBDkABEeQA0BwBDkABEeQA0BwBDkABEeQA0BwBDkABEeQA0BwBDkABLeSIDeztpl1zOzUzFqreE4AQDWrGpE/SBql27srek4AQAVzg9zMemY2LC3rp+WnkuTu45rqAwDMMTfI3f2meN/M+oXl4xTogxTmN5L6dRQKAJhsmdbKkb60UUaSOpJuzawjqSfpYkW1AQAqWOZ65K3S/Rfufp9u32sCMxtIGkjS/v7+ErsEAEyzzIh8rAUPaLr7hbt33b27t7e3xC4BANMsE+Sf9GVU3pY0nL4qAKBuVc5a6UvqFg5yXktqm1kv3b+ZtT0AoF5ze+QpuK9Ly94vuiMzey3p9cuXLxfdFAAww9qm6Lv7R3cf7OzsrGuXALAVuNYKAAS3tiA3s9dmdvH4+LiuXQLAVqC1AgDB0VoBgOCWmdnZmINffm+6BADYOPTIASA4euQAEBw9cgAIjiAHgOAIcgAIjoOdABAcBzsBIDhaKwAQHEEOAMER5AAQHEEOAMFx1goABMdZKwAQHK0VAAiOIAeA4AhyAAiOIAeA4AhyAAiO0w8BIDhOPwSA4GitAEBwBDkABEeQA0BwBDkABE
eQA0BwBDkABEeQA0BwTAgCgOCYEAQAwdFaAYDgCHIACI4gB4DgCHIACI4gB4DgCHIACI4gB4DgCHIACI4gB4DgCHIACI4gB4DguGgWAATHRbMAIDhaKwAQ3A9NF4DZDn75vbF9//Hrz43tG0B1jMgBIDiCHACCI8gBIDiCHACCI8gBIDiCHACCI8gBIDiCHACCI8gBIDiCHACCI8gBIDiCHACCW8lFs8ysLaklqSfp2t1Hq3heAMB8qxqRdySNJN1I6q/oOQEAFcwNcjPrmdmwtKyflp9Kkrtfu/tYaUReS6UAgInmBrm73xTvm1m/sHxsZr20PA/xhxrqBABMsUyP/EjSZbo9ktQxM0l6l+4PxagcANZmmSBvle6/SKPzmwnrSpLMbCBpIEn7+/tL7BIAMM0yBzvHknYX2cDdL9y96+7dvb29JXYJAJhmmSD/pC+j8rayVgoAoCFVzlrpS+oWDnJeS2rnBznLB0NnPM9rM7t4fHz8nnoBACVze+QpuK9Ly94vuiN3/yjpY7fbfbvotgCA6ZiiDwDBEeQAENzagpweOQDUY21B7u4f3X2ws7Ozrl0CwFagtQIAwRHkABAcPXIACI4eOQAER2sFAIIjyAEgOIIcAILjYCcABMfBTgAIjtYKAARHkANAcAQ5AARHkANAcJy1AgDBcdYKAARHawUAgiPIASA4ghwAgiPIASA4ghwAguP0QwAIjtMPASA4WisAEBxBDgDBEeQAEBxBDgDBEeQAEBxBDgDBEeQAEBwTggAgOCYEAUBwtFYAIDiCHACCI8gBIDiCHACCI8gBIDiCHACCI8gBIDiCHACCI8gBIDiCHACC41orABAc11oBgOBorQBAcAQ5AARHkANAcAQ5AARHkANAcAQ5AARHkANAcAQ5AARHkANAcAQ5AARHkANAcAQ5AARHkANAcAQ5AARHkANAcCsLcjMbmFlvVc8HAKhmlSPyW0mtFT4fAKCCuUFuZj0zG5aW9dPy0/pKAwBUMTfI3f2meN/M+oXlY9opANCsZVorR5JG6fZIUifd7kk6MrPWCuoCAFT0wxLbtEr3X0iSu7+ftoGZDSQNJGl/f3+JXQIApllmRD6WtLvIBu5+4e5dd+/u7e0tsUsAwDTLBPknfRmVtyUNp68KAKjb3NZKOrjZNbO+u1+7+7WZneYHOcsHQ2c8z2tJr1++fPl9FWNtDn75vZH9/vHrz43sF4hqbpC7+7Wk69Kyqf3wGc/zUdLHbrf7dtFtAQDTMUUfAIIjyAEguLUFuZm9NrOLx8fHde0SALbC2oLc3T+6+2BnZ2dduwSArUBrBQCCI8gBILhlpugvhfPIgc3T1FwBifkCq0SPHACCo7UCAMER5AAQHEEOAMExIQgAguNgJwAER2sFAIIjyAEgOIIcAIIjyAEgOKboY+MwbXw78E8Jrg5nrQBAcLRWACA4ghwAgiPIASA4ghwAguOsFaCAMykQEWetAEBwtFYAIDiCHACCI8gBIDiCHACCI8gBIDiCHACCI8gBIDgmBAEboMlL926b53iZZCYEAUBwtFYAIDiCHACCI8gBIDiCHACCI8gBIDiCHACCI8gBIDiCHACCM3df7w7N/iPp3wts8qOk/9ZUTh2ot17UWy/qrdf31PtXd9+b9MDag3xRZnbr7t2m66iKeutFvfWi3nrVVS+tFQAIjiAHgOAiBPlF0wUsiHrrRb31ot561VLvxvfIAQCzRRiRAwBmIMgBILiNDnIz65tZz8xOm66lilTrsOk6qjCzVnp/+2Z21nQ986T3theh1qIo9ZrZn2Z2F6jeTv7723Qt86RaP6f3t5b3eGODPP8f5O43ksZm1mu4pLlSrVH8XdKuu19LkpkNGq5nKjPrSHqV3t+OmbWbrqmK9DsbolZJx+5+6O7vmi6kopP0u9sO8Puw6+4/ufuhpLeSzle9g40NcklHkkbp9khSp8Fanh13v3D3/Ah6W9LGfgi5+727vzOzlqSRu4/mbdO0FC
4bX2dBK0AgSnoadNyZWdvd32/670NpgNeuo95NDvJW6f6LJop47tIf78Om/zEkXUnjpouoqJY/2BrtSnows5WPFmvwU/rvwczO0wf8xjOzQf4NeNU2OcjHyn65UK++u580XUQVaWTT2vS+qJn1grXZ8m9oY2VtzI1+f5PPqd47SRvbFix5VdcTb3KQf9KXUXlbUoiDiJGYWd/d36fbG3sMwszOCj38sTb/A/4hHZjtK+vhbnRb0MwGhfD+X6PFVPOpcLulAN/S6v7WsLFBXjiQ0Uv3N36Ek/4YuhFGNOl9PcuPpDddzxznkkap5laht7+RUk//RtkHTqvhcqr4TYUTCur6+r8qqb5Wod6N/n1IdiU91PXkzOwEgOA2dkQOAKiGIAeA4AhyAAiOIMdapangQzO7Sj8nnrecpjR/M5U5Lb+asPzMzEIc8ElTtoeTpmyX3p8wU+bRLIIcTTh292N3f6Vsht5XYV6YFTnt7J9JMxB7inEaWkfSlbIp5odp2nb51Nr8/TlM2xDmmIkgR9NulM3YLDpRuh7FlHOwL4uneKZ1Nv701OSDshB/mvU559TaS8W5XgsaQpCjMWmSxJm+vYhQP50rfC3pzYRNy8vfKAu8ifsonidvZp/Tz3ZqYQzzK1YW7t8VJ3Cktk3+2Hk+OcnMTtO6wyoTPtI67apzItIH1AdJ/6iyPrYXQY4mXKXw/FPSZXFCR+liU5eaMP06jWaLo9Seu98vWMOJpDN3f5VaPCrcPlf2AZPX007L7yVduftF4YqMh/lzVdhnW9UmheTvz5WkmyVeG7YMQY4m5P3xSTMIT5R6xinAWrPaK9/RVhlKOk+j7bb0NEofKLsmRvEyAK3083+F228k7aYDr2f6tj00yVcfQKn+4YSDtMfpQ+UnSRMP7gJFBDma9E7fjmT7ko7SWRtXysJvVntlaltlltTeyC9idFc4CHmrQqsnjf530wj5qDR9/V06KPl0YHLOPseS7otT4dMH2njGNhfKDuQCUxHkaEwKyZtCz7ktaVwIx2NJx5rdXulUaD20CrefRt/uPkr/kMKtpP9T1ua519ej5ray9sarVE/uUtm3h+J6VbxV9k2gNW/F9Lw91XiNDjwPPzRdALbembI2x4UKZ6vk3P3ezB7MbFJgn2vORancfWxmozSivk//SVLPzPJgHkn6p6ShmeUj5Ie0/Sgd1Oyn5Tfu/i7V9XSgNL2G9+lg6mEafU+q5z7t98rM8vbNb6XVPpiZCq+ttsuf4nngolnADOkc7s/pAGdL2Vkkl9OuEGhm51Gu747ng9YKMFtLX0bnY2Wj91nXQ2fyDtaOETkwQ2EU3kqLRoy4sWkIcgAIjtYKAARHkANAcAQ5AARHkANAcAQ5AARHkANAcP8PefBxlxmOJ7EAAAAASUVORK5CYII=\n",
3519+
"text/plain": [
3520+
"<Figure size 432x288 with 1 Axes>"
3521+
]
3522+
},
3523+
"metadata": {
3524+
"needs_background": "light"
3525+
},
3526+
"output_type": "display_data"
3527+
}
3528+
],
3529+
"source": [
3530+
"fig, ax = plt.subplots()  # NOTE: `mem` is not set here; it is whatever the last-run timing cell left behind\n",
3531+
"ax.hist(np.array(mem).astype(int) / 1e6)\n",
3532+
"ax.set_yscale('log')\n",
3533+
"ax.set(title='BCALM, 1 GB limit', xlabel='RAM usage, GB')\n",
3534+
"plt.show()"
3535+
]
3536+
},
3537+
{
3538+
"cell_type": "code",
3539+
"execution_count": 61,
3540+
"id": "2be4c3ca",
3541+
"metadata": {
3542+
"ExecuteTime": {
3543+
"end_time": "2022-03-08T11:09:15.458807Z",
3544+
"start_time": "2022-03-08T11:09:14.562628Z"
3545+
}
3546+
},
3547+
"outputs": [
3548+
{
3549+
"data": {
3550+
"text/plain": [
3551+
"'/cluster/work/grlab/projects/metagenome/finished_projects/counting_dbg/kingsford_21/reindeer/nobackup/logs_reruns/SRR563547_build_k21.try4.stderr.log'"
3552+
]
3553+
},
3554+
"execution_count": 61,
3555+
"metadata": {},
3556+
"output_type": "execute_result"
3557+
}
3558+
],
3559+
"source": [
3560+
"rss_lines = !grep -H 'Maximum resident' /cluster/work/grlab/projects/metagenome/finished_projects/counting_dbg/kingsford_21/reindeer/nobackup/logs_reruns/*.try4.stderr.log\n",
3561+
"max(rss_lines, key=lambda rec: int(rec.split()[-1])).split(':', 1)[0]  # grep -H prefixes every hit with its file name, so the path and its 'Maximum resident' value cannot get out of step"
3562+
]
23673563
}
23683564
],
23693565
"metadata": {
@@ -2386,7 +3582,10 @@
23863582
},
23873583
"toc": {
23883584
"base_numbering": 1,
2389-
"nav_menu": {},
3585+
"nav_menu": {
3586+
"height": "191px",
3587+
"width": "221px"
3588+
},
23903589
"number_sections": true,
23913590
"sideBar": true,
23923591
"skip_h1_title": true,
@@ -2397,7 +3596,7 @@
23973596
"height": "calc(100% - 180px)",
23983597
"left": "10px",
23993598
"top": "150px",
2400-
"width": "219px"
3599+
"width": "255px"
24013600
},
24023601
"toc_section_display": true,
24033602
"toc_window_display": true

0 commit comments

Comments
 (0)
Please sign in to comment.