|
2 | 2 | "cells": [
|
3 | 3 | {
|
4 | 4 | "cell_type": "code",
|
5 |
| - "execution_count": 1, |
| 5 | + "execution_count": 44, |
6 | 6 | "id": "addeddb1",
|
7 | 7 | "metadata": {
|
8 | 8 | "ExecuteTime": {
|
9 |
| - "end_time": "2021-11-09T14:14:19.548112Z", |
10 |
| - "start_time": "2021-11-09T14:14:18.859022Z" |
| 9 | + "end_time": "2022-04-22T13:36:26.454251Z", |
| 10 | + "start_time": "2022-04-22T13:36:26.361572Z" |
11 | 11 | }
|
12 | 12 | },
|
13 |
| - "outputs": [], |
| 13 | + "outputs": [ |
| 14 | + { |
| 15 | + "name": "stdout", |
| 16 | + "output_type": "stream", |
| 17 | + "text": [ |
| 18 | + "The autoreload extension is already loaded. To reload it, use:\n", |
| 19 | + " %reload_ext autoreload\n" |
| 20 | + ] |
| 21 | + } |
| 22 | + ], |
14 | 23 | "source": [
|
15 | 24 | "import os\n",
|
16 | 25 | "import numpy as np\n",
|
|
54 | 63 | "%autoreload 2"
|
55 | 64 | ]
|
56 | 65 | },
|
| 66 | + { |
| 67 | + "cell_type": "code", |
| 68 | + "execution_count": 45, |
| 69 | + "id": "1159b6aa", |
| 70 | + "metadata": { |
| 71 | + "ExecuteTime": { |
| 72 | + "end_time": "2022-04-22T13:36:29.398161Z", |
| 73 | + "start_time": "2022-04-22T13:36:29.374391Z" |
| 74 | + } |
| 75 | + }, |
| 76 | + "outputs": [], |
| 77 | + "source": [ |
| 78 | + "def to_seconds(s):\n", |
| 79 | + " parts = s.split(':')\n", |
| 80 | + " assert len(parts) <= 3\n", |
| 81 | + " time = 0.\n", |
| 82 | + " for p in parts:\n", |
| 83 | + " time = time * 60 + float(p)\n", |
| 84 | + " return time" |
| 85 | + ] |
| 86 | + }, |
57 | 87 | {
|
58 | 88 | "cell_type": "markdown",
|
59 | 89 | "id": "aa49ed37",
|
|
74 | 104 | },
|
75 | 105 | {
|
76 | 106 | "cell_type": "code",
|
77 |
| - "execution_count": 2, |
| 107 | + "execution_count": 46, |
78 | 108 | "id": "9f285dd6",
|
79 | 109 | "metadata": {
|
80 | 110 | "ExecuteTime": {
|
81 |
| - "end_time": "2021-11-09T14:14:19.565071Z", |
82 |
| - "start_time": "2021-11-09T14:14:19.550186Z" |
| 111 | + "end_time": "2022-04-22T13:36:30.200207Z", |
| 112 | + "start_time": "2022-04-22T13:36:30.177042Z" |
83 | 113 | }
|
84 | 114 | },
|
85 | 115 | "outputs": [],
|
|
101 | 131 | },
|
102 | 132 | {
|
103 | 133 | "cell_type": "code",
|
104 |
| - "execution_count": 3, |
| 134 | + "execution_count": 12, |
105 | 135 | "id": "aa0d8c90",
|
106 | 136 | "metadata": {
|
107 | 137 | "ExecuteTime": {
|
108 |
| - "end_time": "2021-11-09T14:14:19.623819Z", |
109 |
| - "start_time": "2021-11-09T14:14:19.568014Z" |
| 138 | + "end_time": "2022-04-01T22:36:17.657643Z", |
| 139 | + "start_time": "2022-04-01T22:36:17.546620Z" |
110 | 140 | },
|
111 | 141 | "scrolled": false
|
112 | 142 | },
|
|
135 | 165 | },
|
136 | 166 | {
|
137 | 167 | "cell_type": "code",
|
138 |
| - "execution_count": 4, |
| 168 | + "execution_count": 13, |
139 | 169 | "id": "ae45466d",
|
140 | 170 | "metadata": {
|
141 | 171 | "ExecuteTime": {
|
142 |
| - "end_time": "2021-11-09T14:14:19.643938Z", |
143 |
| - "start_time": "2021-11-09T14:14:19.625845Z" |
| 172 | + "end_time": "2022-04-01T22:36:18.374915Z", |
| 173 | + "start_time": "2022-04-01T22:36:18.336702Z" |
144 | 174 | }
|
145 | 175 | },
|
146 | 176 | "outputs": [],
|
|
164 | 194 | },
|
165 | 195 | {
|
166 | 196 | "cell_type": "code",
|
167 |
| - "execution_count": 5, |
| 197 | + "execution_count": 14, |
168 | 198 | "id": "c82fff4b",
|
169 | 199 | "metadata": {
|
170 | 200 | "ExecuteTime": {
|
171 |
| - "end_time": "2021-11-09T14:14:19.661601Z", |
172 |
| - "start_time": "2021-11-09T14:14:19.645460Z" |
| 201 | + "end_time": "2022-04-01T22:36:18.922091Z", |
| 202 | + "start_time": "2022-04-01T22:36:18.883641Z" |
173 | 203 | }
|
174 | 204 | },
|
175 | 205 | "outputs": [],
|
|
193 | 223 | },
|
194 | 224 | {
|
195 | 225 | "cell_type": "code",
|
196 |
| - "execution_count": 6, |
| 226 | + "execution_count": 15, |
197 | 227 | "id": "f5cd92dc",
|
198 | 228 | "metadata": {
|
199 | 229 | "ExecuteTime": {
|
200 |
| - "end_time": "2021-11-09T14:14:19.678527Z", |
201 |
| - "start_time": "2021-11-09T14:14:19.663047Z" |
| 230 | + "end_time": "2022-04-01T22:36:19.450515Z", |
| 231 | + "start_time": "2022-04-01T22:36:19.414664Z" |
202 | 232 | }
|
203 | 233 | },
|
204 | 234 | "outputs": [],
|
|
225 | 255 | },
|
226 | 256 | {
|
227 | 257 | "cell_type": "code",
|
228 |
| - "execution_count": 7, |
| 258 | + "execution_count": 16, |
| 259 | + "id": "831d9053", |
| 260 | + "metadata": { |
| 261 | + "ExecuteTime": { |
| 262 | + "end_time": "2022-04-01T22:36:20.161981Z", |
| 263 | + "start_time": "2022-04-01T22:36:20.125833Z" |
| 264 | + } |
| 265 | + }, |
| 266 | + "outputs": [], |
| 267 | + "source": [ |
| 268 | + "with open('mtg_size_graph.pkl', 'rb') as f:\n", |
| 269 | + " mtg_size_graph = pickle.load(f)\n", |
| 270 | + "\n", |
| 271 | + "# mtg_size_graph = []\n", |
| 272 | + "\n", |
| 273 | + "# DIR = !echo $HOME\n", |
| 274 | + "# DIR = f'{DIR[0]}/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/mtg'\n", |
| 275 | + "\n", |
| 276 | + "# for x in gzip:\n", |
| 277 | + "# ID = x.split('/')[-1][:-len('_no_header.fasta.gz')]\n", |
| 278 | + "# mtg_size_graph.append(get_size(f'{DIR}/{ID[:6]}/{ID}/graph_small.dbg'))\n", |
| 279 | + "\n", |
| 280 | + "# mtg_size_graph = np.array(mtg_size_graph)\n", |
| 281 | + "\n", |
| 282 | + "# with open('mtg_size_graph.pkl', 'wb') as f:\n", |
| 283 | + "# pickle.dump(mtg_size_graph, f)" |
| 284 | + ] |
| 285 | + }, |
| 286 | + { |
| 287 | + "cell_type": "code", |
| 288 | + "execution_count": 17, |
| 289 | + "id": "a1a2acbd", |
| 290 | + "metadata": { |
| 291 | + "ExecuteTime": { |
| 292 | + "end_time": "2022-04-01T22:36:20.778732Z", |
| 293 | + "start_time": "2022-04-01T22:36:20.741770Z" |
| 294 | + } |
| 295 | + }, |
| 296 | + "outputs": [], |
| 297 | + "source": [ |
| 298 | + "with open('mtg_size_anno.pkl', 'rb') as f:\n", |
| 299 | + " mtg_size_anno = pickle.load(f)\n", |
| 300 | + "\n", |
| 301 | + "# mtg_size_anno = []\n", |
| 302 | + "\n", |
| 303 | + "# DIR = !echo $HOME\n", |
| 304 | + "# DIR = f'{DIR[0]}/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/mtg'\n", |
| 305 | + "\n", |
| 306 | + "# for x in gzip:\n", |
| 307 | + "# ID = x.split('/')[-1][:-len('_no_header.fasta.gz')]\n", |
| 308 | + "# mtg_size_anno.append(get_size(f'{DIR}/{ID[:6]}/{ID}/annotation.row_diff_coord.annodbg'))\n", |
| 309 | + "\n", |
| 310 | + "# mtg_size_anno = np.array(mtg_size_anno)\n", |
| 311 | + "\n", |
| 312 | + "# with open('mtg_size_anno.pkl', 'wb') as f:\n", |
| 313 | + "# pickle.dump(mtg_size_anno, f)" |
| 314 | + ] |
| 315 | + }, |
| 316 | + { |
| 317 | + "cell_type": "code", |
| 318 | + "execution_count": 18, |
| 319 | + "id": "a90f1ce3", |
| 320 | + "metadata": { |
| 321 | + "ExecuteTime": { |
| 322 | + "end_time": "2022-04-01T22:36:21.526690Z", |
| 323 | + "start_time": "2022-04-01T22:36:21.491789Z" |
| 324 | + } |
| 325 | + }, |
| 326 | + "outputs": [], |
| 327 | + "source": [ |
| 328 | + "with open('mtg2_size.pkl', 'rb') as f:\n", |
| 329 | + " mtg2_size = pickle.load(f)\n", |
| 330 | + "\n", |
| 331 | + "# mtg2_size = []\n", |
| 332 | + "\n", |
| 333 | + "# DIR = !echo $HOME\n", |
| 334 | + "# DIR = f'{DIR[0]}/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/mtg_fork_opt'\n", |
| 335 | + "\n", |
| 336 | + "# for x in gzip:\n", |
| 337 | + "# ID = x.split('/')[-1][:-len('_no_header.fasta.gz')]\n", |
| 338 | + "# graph_size = get_size(f'{DIR}/{ID[:6]}/{ID}/graph_small.dbg')\n", |
| 339 | + "# anno_size = get_size(f'{DIR}/{ID[:6]}/{ID}/annotation.row_diff_coord.annodbg')\n", |
| 340 | + "# mtg2_size.append(graph_size + anno_size)\n", |
| 341 | + "\n", |
| 342 | + "# mtg2_size = np.array(mtg2_size)\n", |
| 343 | + "\n", |
| 344 | + "# with open('mtg2_size.pkl', 'wb') as f:\n", |
| 345 | + "# pickle.dump(mtg2_size, f)" |
| 346 | + ] |
| 347 | + }, |
| 348 | + { |
| 349 | + "cell_type": "code", |
| 350 | + "execution_count": 19, |
229 | 351 | "id": "48eee7cf",
|
230 | 352 | "metadata": {
|
231 | 353 | "ExecuteTime": {
|
232 |
| - "end_time": "2021-11-09T14:14:19.695733Z", |
233 |
| - "start_time": "2021-11-09T14:14:19.680784Z" |
| 354 | + "end_time": "2022-04-01T22:36:22.121466Z", |
| 355 | + "start_time": "2022-04-01T22:36:22.083854Z" |
234 | 356 | }
|
235 | 357 | },
|
236 | 358 | "outputs": [],
|
|
257 | 379 | },
|
258 | 380 | {
|
259 | 381 | "cell_type": "code",
|
260 |
| - "execution_count": 8, |
| 382 | + "execution_count": 20, |
261 | 383 | "id": "5238f351",
|
262 | 384 | "metadata": {
|
263 | 385 | "ExecuteTime": {
|
264 |
| - "end_time": "2021-11-09T14:14:19.712975Z", |
265 |
| - "start_time": "2021-11-09T14:14:19.697596Z" |
| 386 | + "end_time": "2022-04-01T22:36:22.611761Z", |
| 387 | + "start_time": "2022-04-01T22:36:22.572974Z" |
266 | 388 | }
|
267 | 389 | },
|
268 | 390 | "outputs": [],
|
|
289 | 411 | },
|
290 | 412 | {
|
291 | 413 | "cell_type": "code",
|
292 |
| - "execution_count": 9, |
| 414 | + "execution_count": 21, |
293 | 415 | "id": "9261d1df",
|
294 | 416 | "metadata": {
|
295 | 417 | "ExecuteTime": {
|
296 |
| - "end_time": "2021-11-09T14:14:19.729536Z", |
297 |
| - "start_time": "2021-11-09T14:14:19.714366Z" |
| 418 | + "end_time": "2022-04-01T22:36:23.855207Z", |
| 419 | + "start_time": "2022-04-01T22:36:23.816565Z" |
298 | 420 | }
|
299 | 421 | },
|
300 | 422 | "outputs": [],
|
|
321 | 443 | },
|
322 | 444 | {
|
323 | 445 | "cell_type": "code",
|
324 |
| - "execution_count": 10, |
| 446 | + "execution_count": 22, |
325 | 447 | "id": "07b89508",
|
326 | 448 | "metadata": {
|
327 | 449 | "ExecuteTime": {
|
328 |
| - "end_time": "2021-11-09T14:14:19.767464Z", |
329 |
| - "start_time": "2021-11-09T14:14:19.730872Z" |
| 450 | + "end_time": "2022-04-01T22:36:24.483454Z", |
| 451 | + "start_time": "2022-04-01T22:36:24.426837Z" |
330 | 452 | }
|
331 | 453 | },
|
332 | 454 | "outputs": [
|
|
336 | 458 | "149.967152078"
|
337 | 459 | ]
|
338 | 460 | },
|
339 |
| - "execution_count": 10, |
| 461 | + "execution_count": 22, |
340 | 462 | "metadata": {},
|
341 | 463 | "output_type": "execute_result"
|
342 | 464 | }
|
|
347 | 469 | },
|
348 | 470 | {
|
349 | 471 | "cell_type": "code",
|
350 |
| - "execution_count": 11, |
| 472 | + "execution_count": 23, |
351 | 473 | "id": "5c1470a5",
|
352 | 474 | "metadata": {
|
353 | 475 | "ExecuteTime": {
|
354 |
| - "end_time": "2021-11-09T14:14:19.865413Z", |
355 |
| - "start_time": "2021-11-09T14:14:19.768843Z" |
| 476 | + "end_time": "2022-04-01T22:36:25.084414Z", |
| 477 | + "start_time": "2022-04-01T22:36:25.031836Z" |
356 | 478 | },
|
357 | 479 | "scrolled": true
|
358 | 480 | },
|
|
363 | 485 | "875.484502657"
|
364 | 486 | ]
|
365 | 487 | },
|
366 |
| - "execution_count": 11, |
| 488 | + "execution_count": 23, |
367 | 489 | "metadata": {},
|
368 | 490 | "output_type": "execute_result"
|
369 | 491 | }
|
|
374 | 496 | },
|
375 | 497 | {
|
376 | 498 | "cell_type": "code",
|
377 |
| - "execution_count": 12, |
| 499 | + "execution_count": 24, |
378 | 500 | "id": "22ce3594",
|
379 | 501 | "metadata": {
|
380 | 502 | "ExecuteTime": {
|
381 |
| - "end_time": "2021-11-09T14:14:20.152458Z", |
382 |
| - "start_time": "2021-11-09T14:14:19.869394Z" |
| 503 | + "end_time": "2022-04-01T22:36:29.136227Z", |
| 504 | + "start_time": "2022-04-01T22:36:28.682322Z" |
383 | 505 | },
|
384 | 506 | "scrolled": true
|
385 | 507 | },
|
|
392 | 514 | "Pufferfish compression: 0.3\n",
|
393 | 515 | "BLAST compression: 3.1\n",
|
394 | 516 | "gzip -9 compression: 5.8\n",
|
395 |
| - "mtg compression: 6.2\n", |
| 517 | + "mtg (single succ.) compression: 6.2\n", |
| 518 | + "mtg (multiple succ.) compression: 7.1\n", |
396 | 519 | "Spring compression: 18.0\n"
|
397 | 520 | ]
|
398 | 521 | }
|
|
402 | 525 | "print(f\"Pufferfish compression: {sum(num_bp) / sum(pfish_size):.1f}\")\n",
|
403 | 526 | "print(f\"BLAST compression: {sum(num_bp) / sum(blast_size):.1f}\")\n",
|
404 | 527 | "print(f\"gzip -9 compression: {sum(num_bp) / sum(gzip_size):.1f}\")\n",
|
405 |
| - "print(f\"mtg compression: {sum(num_bp) / sum(mtg_size):.1f}\")\n", |
| 528 | + "print(f\"mtg (single succ.) compression: {sum(num_bp) / sum(mtg_size):.1f}\")\n", |
| 529 | + "print(f\"mtg (multiple succ.) compression: {sum(num_bp[mtg2_size > 0]) / sum(mtg2_size[mtg2_size > 0]):.1f}\")\n", |
406 | 530 | "print(f\"Spring compression: {sum(num_bp) / sum(spring_size):.1f}\")"
|
407 | 531 | ]
|
408 | 532 | },
|
409 | 533 | {
|
410 | 534 | "cell_type": "code",
|
411 |
| - "execution_count": 13, |
| 535 | + "execution_count": 25, |
412 | 536 | "id": "c129057a",
|
413 | 537 | "metadata": {
|
414 | 538 | "ExecuteTime": {
|
415 |
| - "end_time": "2021-11-09T14:14:20.435271Z", |
416 |
| - "start_time": "2021-11-09T14:14:20.154822Z" |
| 539 | + "end_time": "2022-04-01T22:36:30.071763Z", |
| 540 | + "start_time": "2022-04-01T22:36:29.713198Z" |
417 | 541 | },
|
418 | 542 | "scrolled": true
|
419 | 543 | },
|
|
426 | 550 | "Pufferfish bits/bp: 25.113\n",
|
427 | 551 | "BLAST bits/bp: 2.580\n",
|
428 | 552 | "gzip -9 bits/bp: 1.370\n",
|
429 |
| - "mtg bits/bp: 1.300\n", |
| 553 | + "mtg (single succ.) bits/bp: 1.300\n", |
| 554 | + "mtg (multiple succ.) bits/bp: 1.130\n", |
430 | 555 | "Spring bits/bp: 0.445\n"
|
431 | 556 | ]
|
432 | 557 | }
|
|
436 | 561 | "print(f\"Pufferfish bits/bp: {sum(pfish_size) * 8 / sum(num_bp):.3f}\")\n",
|
437 | 562 | "print(f\"BLAST bits/bp: {sum(blast_size) * 8 / sum(num_bp):.3f}\")\n",
|
438 | 563 | "print(f\"gzip -9 bits/bp: {sum(gzip_size) * 8 / sum(num_bp):.3f}\")\n",
|
439 |
| - "print(f\"mtg bits/bp: {sum(mtg_size) * 8 / sum(num_bp):.3f}\")\n", |
| 564 | + "print(f\"mtg (single succ.) bits/bp: {sum(mtg_size) * 8 / sum(num_bp):.3f}\")\n", |
| 565 | + "print(f\"mtg (multiple succ.) bits/bp: {sum(mtg2_size[mtg2_size > 0]) * 8 / sum(num_bp[mtg2_size > 0]):.3f}\")\n", |
440 | 566 | "print(f\"Spring bits/bp: {sum(spring_size) * 8 / sum(num_bp):.3f}\")"
|
441 | 567 | ]
|
442 | 568 | },
|
443 | 569 | {
|
444 | 570 | "cell_type": "code",
|
445 |
| - "execution_count": 14, |
| 571 | + "execution_count": 26, |
446 | 572 | "id": "6807effa",
|
447 | 573 | "metadata": {
|
448 | 574 | "ExecuteTime": {
|
449 |
| - "end_time": "2021-11-09T14:14:20.458287Z", |
450 |
| - "start_time": "2021-11-09T14:14:20.439209Z" |
| 575 | + "end_time": "2022-04-01T22:36:31.610803Z", |
| 576 | + "start_time": "2022-04-01T22:36:31.582740Z" |
451 | 577 | }
|
452 | 578 | },
|
453 | 579 | "outputs": [
|
|
457 | 583 | "152884"
|
458 | 584 | ]
|
459 | 585 | },
|
460 |
| - "execution_count": 14, |
| 586 | + "execution_count": 26, |
461 | 587 | "metadata": {},
|
462 | 588 | "output_type": "execute_result"
|
463 | 589 | }
|
|
485 | 611 | },
|
486 | 612 | {
|
487 | 613 | "cell_type": "code",
|
488 |
| - "execution_count": 15, |
| 614 | + "execution_count": 27, |
489 | 615 | "id": "03641724",
|
490 | 616 | "metadata": {
|
491 | 617 | "ExecuteTime": {
|
492 |
| - "end_time": "2021-11-09T14:14:22.748943Z", |
493 |
| - "start_time": "2021-11-09T14:14:20.459756Z" |
| 618 | + "end_time": "2022-04-01T22:36:38.926734Z", |
| 619 | + "start_time": "2022-04-01T22:36:35.666505Z" |
494 | 620 | }
|
495 | 621 | },
|
496 | 622 | "outputs": [
|
|
536 | 662 | },
|
537 | 663 | {
|
538 | 664 | "cell_type": "code",
|
539 |
| - "execution_count": 16, |
| 665 | + "execution_count": 28, |
540 | 666 | "id": "255041f2",
|
541 | 667 | "metadata": {
|
542 | 668 | "ExecuteTime": {
|
543 |
| - "end_time": "2021-11-09T14:14:23.190372Z", |
544 |
| - "start_time": "2021-11-09T14:14:22.752090Z" |
| 669 | + "end_time": "2022-04-01T22:36:38.951695Z", |
| 670 | + "start_time": "2022-04-01T22:36:38.929720Z" |
545 | 671 | }
|
546 | 672 | },
|
547 | 673 | "outputs": [
|
|
559 | 685 | },
|
560 | 686 | {
|
561 | 687 | "cell_type": "code",
|
562 |
| - "execution_count": 17, |
| 688 | + "execution_count": 29, |
563 | 689 | "id": "ff17867c",
|
564 | 690 | "metadata": {
|
565 | 691 | "ExecuteTime": {
|
566 |
| - "end_time": "2021-11-09T14:14:23.501441Z", |
567 |
| - "start_time": "2021-11-09T14:14:23.194795Z" |
| 692 | + "end_time": "2022-04-01T22:36:39.247516Z", |
| 693 | + "start_time": "2022-04-01T22:36:38.956001Z" |
568 | 694 | }
|
569 | 695 | },
|
570 | 696 | "outputs": [
|
|
574 | 700 | "152418"
|
575 | 701 | ]
|
576 | 702 | },
|
577 |
| - "execution_count": 17, |
| 703 | + "execution_count": 29, |
578 | 704 | "metadata": {},
|
579 | 705 | "output_type": "execute_result"
|
580 | 706 | }
|
|
585 | 711 | },
|
586 | 712 | {
|
587 | 713 | "cell_type": "code",
|
588 |
| - "execution_count": 18, |
| 714 | + "execution_count": 30, |
589 | 715 | "id": "c8d921fe",
|
590 | 716 | "metadata": {
|
591 | 717 | "ExecuteTime": {
|
592 |
| - "end_time": "2021-11-09T14:14:23.569444Z", |
593 |
| - "start_time": "2021-11-09T14:14:23.505170Z" |
| 718 | + "end_time": "2022-04-01T22:36:39.320067Z", |
| 719 | + "start_time": "2022-04-01T22:36:39.251631Z" |
594 | 720 | }
|
595 | 721 | },
|
596 | 722 | "outputs": [
|
|
600 | 726 | "38.4434618997873"
|
601 | 727 | ]
|
602 | 728 | },
|
603 |
| - "execution_count": 18, |
| 729 | + "execution_count": 30, |
604 | 730 | "metadata": {},
|
605 | 731 | "output_type": "execute_result"
|
606 | 732 | }
|
|
619 | 745 | },
|
620 | 746 | {
|
621 | 747 | "cell_type": "code",
|
622 |
| - "execution_count": 19, |
| 748 | + "execution_count": 31, |
623 | 749 | "id": "822619af",
|
624 | 750 | "metadata": {
|
625 | 751 | "ExecuteTime": {
|
626 |
| - "end_time": "2021-11-09T14:14:23.696575Z", |
627 |
| - "start_time": "2021-11-09T14:14:23.570820Z" |
| 752 | + "end_time": "2022-04-01T22:36:39.457479Z", |
| 753 | + "start_time": "2022-04-01T22:36:39.326006Z" |
628 | 754 | }
|
629 | 755 | },
|
630 | 756 | "outputs": [
|
|
1107 | 1233 | },
|
1108 | 1234 | {
|
1109 | 1235 | "cell_type": "code",
|
1110 |
| - "execution_count": 20, |
| 1236 | + "execution_count": 32, |
1111 | 1237 | "id": "5ed9f0c7",
|
1112 | 1238 | "metadata": {
|
1113 | 1239 | "ExecuteTime": {
|
1114 |
| - "end_time": "2021-11-09T14:14:23.858803Z", |
1115 |
| - "start_time": "2021-11-09T14:14:23.698367Z" |
| 1240 | + "end_time": "2022-04-01T22:36:39.675712Z", |
| 1241 | + "start_time": "2022-04-01T22:36:39.461357Z" |
1116 | 1242 | }
|
1117 | 1243 | },
|
1118 | 1244 | "outputs": [],
|
|
1124 | 1250 | "spring_size = spring_size[idx]\n",
|
1125 | 1251 | "pfish_size = pfish_size[idx]\n",
|
1126 | 1252 | "mtg_size = mtg_size[idx]\n",
|
| 1253 | + "mtg_size_graph = mtg_size_graph[idx]\n", |
| 1254 | + "mtg_size_anno = mtg_size_anno[idx]\n", |
| 1255 | + "mtg2_size = mtg2_size[idx]\n", |
1127 | 1256 | "blast_size = blast_size[idx]\n",
|
1128 | 1257 | "mgblst_size = mgblst_size[idx]"
|
1129 | 1258 | ]
|
1130 | 1259 | },
|
1131 | 1260 | {
|
1132 | 1261 | "cell_type": "code",
|
1133 |
| - "execution_count": 21, |
| 1262 | + "execution_count": 33, |
1134 | 1263 | "id": "30526982",
|
1135 | 1264 | "metadata": {
|
1136 | 1265 | "ExecuteTime": {
|
1137 |
| - "end_time": "2021-11-09T14:14:23.900052Z", |
1138 |
| - "start_time": "2021-11-09T14:14:23.860557Z" |
| 1266 | + "end_time": "2022-04-01T22:36:40.670070Z", |
| 1267 | + "start_time": "2022-04-01T22:36:40.620378Z" |
1139 | 1268 | }
|
1140 | 1269 | },
|
1141 | 1270 | "outputs": [
|
|
1145 | 1274 | "111.599186766"
|
1146 | 1275 | ]
|
1147 | 1276 | },
|
1148 |
| - "execution_count": 21, |
| 1277 | + "execution_count": 33, |
1149 | 1278 | "metadata": {},
|
1150 | 1279 | "output_type": "execute_result"
|
1151 | 1280 | }
|
|
1156 | 1285 | },
|
1157 | 1286 | {
|
1158 | 1287 | "cell_type": "code",
|
1159 |
| - "execution_count": 22, |
| 1288 | + "execution_count": 34, |
1160 | 1289 | "id": "cc211f0c",
|
1161 | 1290 | "metadata": {
|
1162 | 1291 | "ExecuteTime": {
|
1163 |
| - "end_time": "2021-11-09T14:14:23.938646Z", |
1164 |
| - "start_time": "2021-11-09T14:14:23.903446Z" |
| 1292 | + "end_time": "2022-04-01T22:36:42.411850Z", |
| 1293 | + "start_time": "2022-04-01T22:36:42.360430Z" |
1165 | 1294 | },
|
1166 | 1295 | "scrolled": true
|
1167 | 1296 | },
|
|
1172 | 1301 | "716.726703751"
|
1173 | 1302 | ]
|
1174 | 1303 | },
|
1175 |
| - "execution_count": 22, |
| 1304 | + "execution_count": 34, |
1176 | 1305 | "metadata": {},
|
1177 | 1306 | "output_type": "execute_result"
|
1178 | 1307 | }
|
|
1183 | 1312 | },
|
1184 | 1313 | {
|
1185 | 1314 | "cell_type": "code",
|
1186 |
| - "execution_count": 23, |
| 1315 | + "execution_count": 35, |
1187 | 1316 | "id": "997fc48e",
|
1188 | 1317 | "metadata": {
|
1189 | 1318 | "ExecuteTime": {
|
1190 |
| - "end_time": "2021-11-09T14:14:24.215738Z", |
1191 |
| - "start_time": "2021-11-09T14:14:23.940174Z" |
| 1319 | + "end_time": "2022-04-01T22:36:46.216869Z", |
| 1320 | + "start_time": "2022-04-01T22:36:45.876951Z" |
1192 | 1321 | },
|
1193 | 1322 | "scrolled": true
|
1194 | 1323 | },
|
|
1201 | 1330 | "Pufferfish compression: 0.4\n",
|
1202 | 1331 | "BLAST compression: 3.0\n",
|
1203 | 1332 | "gzip -9 compression: 6.4\n",
|
1204 |
| - "mtg compression: 14.7\n", |
| 1333 | + "mtg (single succ.) compression: 14.7\n", |
| 1334 | + "mtg (multiple succ.) compression: nan\n", |
1205 | 1335 | "Spring compression: 38.4\n"
|
1206 | 1336 | ]
|
1207 | 1337 | }
|
|
1211 | 1341 | "print(f\"Pufferfish compression: {sum(num_bp) / sum(pfish_size):.1f}\")\n",
|
1212 | 1342 | "print(f\"BLAST compression: {sum(num_bp) / sum(blast_size):.1f}\")\n",
|
1213 | 1343 | "print(f\"gzip -9 compression: {sum(num_bp) / sum(gzip_size):.1f}\")\n",
|
1214 |
| - "print(f\"mtg compression: {sum(num_bp) / sum(mtg_size):.1f}\")\n", |
| 1344 | + "print(f\"mtg (single succ.) compression: {sum(num_bp) / sum(mtg_size):.1f}\")\n", |
| 1345 | + "print(f\"mtg (multiple succ.) compression: {sum(num_bp) / sum(mtg2_size):.1f}\")\n", |
1215 | 1346 | "print(f\"Spring compression: {sum(num_bp) / sum(spring_size):.1f}\")"
|
1216 | 1347 | ]
|
1217 | 1348 | },
|
1218 | 1349 | {
|
1219 | 1350 | "cell_type": "code",
|
1220 |
| - "execution_count": 24, |
| 1351 | + "execution_count": 36, |
1221 | 1352 | "id": "880c0ce0",
|
1222 | 1353 | "metadata": {
|
1223 | 1354 | "ExecuteTime": {
|
1224 |
| - "end_time": "2021-11-09T14:14:24.504474Z", |
1225 |
| - "start_time": "2021-11-09T14:14:24.217362Z" |
| 1355 | + "end_time": "2022-04-01T22:36:46.823554Z", |
| 1356 | + "start_time": "2022-04-01T22:36:46.488602Z" |
1226 | 1357 | },
|
1227 | 1358 | "scrolled": true
|
1228 | 1359 | },
|
|
1235 | 1366 | "Pufferfish bits/bp: 21.816\n",
|
1236 | 1367 | "BLAST bits/bp: 2.678\n",
|
1237 | 1368 | "gzip -9 bits/bp: 1.246\n",
|
1238 |
| - "mtg bits/bp: 0.544\n", |
| 1369 | + "mtg (single succ.) bits/bp: 0.544\n", |
| 1370 | + "mtg (multiple succ.) bits/bp: nan\n", |
1239 | 1371 | "Spring bits/bp: 0.208\n"
|
1240 | 1372 | ]
|
1241 | 1373 | }
|
|
1245 | 1377 | "print(f\"Pufferfish bits/bp: {sum(pfish_size) * 8 / sum(num_bp):.3f}\")\n",
|
1246 | 1378 | "print(f\"BLAST bits/bp: {sum(blast_size) * 8 / sum(num_bp):.3f}\")\n",
|
1247 | 1379 | "print(f\"gzip -9 bits/bp: {sum(gzip_size) * 8 / sum(num_bp):.3f}\")\n",
|
1248 |
| - "print(f\"mtg bits/bp: {sum(mtg_size) * 8 / sum(num_bp):.3f}\")\n", |
| 1380 | + "print(f\"mtg (single succ.) bits/bp: {sum(mtg_size) * 8 / sum(num_bp):.3f}\")\n", |
| 1381 | + "print(f\"mtg (multiple succ.) bits/bp: {sum(mtg2_size) * 8 / sum(num_bp):.3f}\")\n", |
1249 | 1382 | "print(f\"Spring bits/bp: {sum(spring_size) * 8 / sum(num_bp):.3f}\")"
|
1250 | 1383 | ]
|
1251 | 1384 | },
|
| 1385 | + { |
| 1386 | + "cell_type": "code", |
| 1387 | + "execution_count": 37, |
| 1388 | + "id": "fe96ab33", |
| 1389 | + "metadata": { |
| 1390 | + "ExecuteTime": { |
| 1391 | + "end_time": "2022-04-01T22:36:48.428809Z", |
| 1392 | + "start_time": "2022-04-01T22:36:48.312386Z" |
| 1393 | + } |
| 1394 | + }, |
| 1395 | + "outputs": [ |
| 1396 | + { |
| 1397 | + "name": "stdout", |
| 1398 | + "output_type": "stream", |
| 1399 | + "text": [ |
| 1400 | + "mtg (single succ.), graph only, bits/bp: 0.151\n", |
| 1401 | + "mtg (single succ.), anno only, bits/bp: 0.393\n" |
| 1402 | + ] |
| 1403 | + } |
| 1404 | + ], |
| 1405 | + "source": [ |
| 1406 | + "print(f\"mtg (single succ.), graph only, bits/bp: {sum(mtg_size_graph) * 8 / sum(num_bp):.3f}\")\n", |
| 1407 | + "print(f\"mtg (single succ.), anno only, bits/bp: {sum(mtg_size_anno) * 8 / sum(num_bp):.3f}\")" |
| 1408 | + ] |
| 1409 | + }, |
1252 | 1410 | {
|
1253 | 1411 | "cell_type": "code",
|
1254 | 1412 | "execution_count": 25,
|
|
1468 | 1626 | "\n",
|
1469 | 1627 | "violin_data = [\n",
|
1470 | 1628 | " 8 * spring_size / num_bp,\n",
|
| 1629 | + " 8 * mtg2_size / num_bp,\n", |
1471 | 1630 | " 8 * mtg_size / num_bp,\n",
|
1472 | 1631 | " 8 * gzip_size / num_bp,\n",
|
1473 | 1632 | " 8 * blast_size / num_bp,\n",
|
1474 | 1633 | "]\n",
|
1475 | 1634 | "labels = ['Spring (*)',\n",
|
1476 |
| - " 'Counting DBG\\nwith coordinates',\n", |
| 1635 | + " 'Counting DBG with coord.\\n(multiple successors)',\n", |
| 1636 | + " 'Counting DBG with coord.\\n(single successor)',\n", |
1477 | 1637 | " 'gzip -9 (*)',\n",
|
1478 | 1638 | " 'BLAST',]\n",
|
1479 | 1639 | "violin_weights = [num_bp] * len(labels)\n",
|
@@ -1623,157 +1783,805 @@
|
1623 | 1783 | },
|
1624 | 1784 | {
|
1625 | 1785 | "cell_type": "markdown",
|
1626 |
| - "id": "03209b5d", |
| 1786 | + "id": "feb55e84", |
1627 | 1787 | "metadata": {},
|
1628 | 1788 | "source": [
|
1629 |
| - "## Illumina RNA-Seq reads" |
| 1789 | + "#### Query time" |
1630 | 1790 | ]
|
1631 | 1791 | },
|
1632 | 1792 | {
|
1633 | 1793 | "cell_type": "code",
|
1634 |
| - "execution_count": 30, |
1635 |
| - "id": "3d872be3", |
| 1794 | + "execution_count": 47, |
| 1795 | + "id": "98242b99", |
1636 | 1796 | "metadata": {
|
1637 | 1797 | "ExecuteTime": {
|
1638 |
| - "end_time": "2021-11-09T14:14:37.136911Z", |
1639 |
| - "start_time": "2021-11-09T14:14:37.108300Z" |
| 1798 | + "end_time": "2022-04-22T13:36:54.737797Z", |
| 1799 | + "start_time": "2022-04-22T13:36:54.714619Z" |
1640 | 1800 | }
|
1641 | 1801 | },
|
1642 | 1802 | "outputs": [],
|
1643 | 1803 | "source": [
|
1644 |
| - "with open('kingsford_gzip_spring.pkl', 'rb') as f:\n", |
1645 |
| - " gzip, read_length, num_bp, gzip_size, fasta_size, spring_size = pickle.load(f)\n", |
1646 |
| - "\n", |
1647 |
| - "# gzip = !ls ~/metagenome/data/kingsford/compressed/*_no_header.fasta.gz\n", |
1648 |
| - "\n", |
1649 |
| - "# num_bp = []\n", |
1650 |
| - "# read_length = []\n", |
1651 |
| - "# spring_size = []\n", |
1652 |
| - "# gzip_size = []\n", |
1653 |
| - "# fasta_size = []\n", |
1654 |
| - "\n", |
1655 |
| - "# for x in tqdm(gzip, mininterval=1):\n", |
1656 |
| - "# gzip_size.append(get_size(x))\n", |
1657 |
| - "# fasta_size.append(get_size(x[:-len('.gz')]))\n", |
1658 |
| - "# ID = x[:-len('_no_header.fasta.gz')]\n", |
1659 |
| - "# spring_fname = f'{ID}_no_header.spring'\n", |
1660 |
| - "# spring_size.append(get_size(spring_fname))\n", |
1661 |
| - "# SRA = ID.split('/')[-1]\n", |
1662 |
| - "# fname = x[:-len('_no_header.fasta.gz')] + '.num_bp'\n", |
1663 |
| - "# with open(fname, 'r') as f:\n", |
1664 |
| - "# n = int(f.readline())\n", |
1665 |
| - "# num_bp.append(n)\n", |
1666 |
| - "\n", |
1667 |
| - "# with gz.open(x, 'r') as f:\n", |
1668 |
| - "# f.readline().strip().decode()\n", |
1669 |
| - "# seq = f.readline().strip().decode()\n", |
1670 |
| - "# L = len(seq)\n", |
1671 |
| - "# all_same_len = True\n", |
1672 |
| - "# for i in range(1000): # check 1000 first reads\n", |
1673 |
| - "# f.readline().strip().decode()\n", |
1674 |
| - "# if L != len(f.readline().strip().decode()):\n", |
1675 |
| - "# all_same_len = False\n", |
1676 |
| - "# break\n", |
1677 |
| - "# read_length.append(L if all_same_len and num_bp[-1] % L == 0 else np.nan)\n", |
1678 |
| - "\n", |
1679 |
| - "# gzip = np.array(gzip)\n", |
1680 |
| - "# num_bp = np.array(num_bp)\n", |
1681 |
| - "# spring_size = np.array(spring_size)\n", |
1682 |
| - "# gzip_size = np.array(gzip_size)\n", |
1683 |
| - "# fasta_size = np.array(fasta_size)\n", |
1684 |
| - "# read_length = np.array(read_length)\n", |
1685 |
| - "\n", |
1686 |
| - "# with open('kingsford_gzip_spring.pkl', 'wb') as f:\n", |
1687 |
| - "# pickle.dump((gzip, read_length, num_bp, gzip_size, fasta_size, spring_size), f)" |
| 1804 | + "import pandas as pd" |
1688 | 1805 | ]
|
1689 | 1806 | },
|
1690 | 1807 | {
|
1691 | 1808 | "cell_type": "code",
|
1692 |
| - "execution_count": 31, |
1693 |
| - "id": "4ffcdd65", |
| 1809 | + "execution_count": 200, |
| 1810 | + "id": "1d6fd607", |
1694 | 1811 | "metadata": {
|
1695 | 1812 | "ExecuteTime": {
|
1696 |
| - "end_time": "2021-11-09T14:14:37.162442Z", |
1697 |
| - "start_time": "2021-11-09T14:14:37.140570Z" |
| 1813 | + "end_time": "2022-04-26T13:10:41.760762Z", |
| 1814 | + "start_time": "2022-04-26T13:09:19.866872Z" |
1698 | 1815 | },
|
1699 | 1816 | "scrolled": false
|
1700 | 1817 | },
|
1701 | 1818 | "outputs": [
|
1702 | 1819 | {
|
1703 |
| - "data": { |
1704 |
| - "text/plain": [ |
1705 |
| - "(2652, 7.973392066664)" |
1706 |
| - ] |
1707 |
| - }, |
1708 |
| - "execution_count": 31, |
1709 |
| - "metadata": {}, |
1710 |
| - "output_type": "execute_result" |
| 1820 | + "name": "stdout", |
| 1821 | + "output_type": "stream", |
| 1822 | + "text": [ |
| 1823 | + "0.09, 0.51\n", |
| 1824 | + "0.08, 2.06\n", |
| 1825 | + "0.08, 0.11\n", |
| 1826 | + "0.15, 0.41\n" |
| 1827 | + ] |
1711 | 1828 | }
|
1712 | 1829 | ],
|
1713 | 1830 | "source": [
|
1714 |
| - "len(num_bp), sum(num_bp) / 1e12" |
| 1831 | + "def show_time(times_file):\n", |
| 1832 | + " df = pd.read_csv(times_file, header=None, sep=' ', names=['id', 'time'], index_col=0)\n", |
| 1833 | + " for i in range(df.shape[0]):\n", |
| 1834 | + " try:\n", |
| 1835 | + " df.iloc[i].time = to_seconds(df.iloc[i].time)\n", |
| 1836 | + " except:\n", |
| 1837 | + " print(df.iloc[i])\n", |
| 1838 | + " df.iloc[i].time = 0\n", |
| 1839 | + " print(f'{df.time.mean():.2f}, {df.time.std():.2f}')\n", |
| 1840 | + "\n", |
| 1841 | + "#print(show_time('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/mtg/delta_variants_query.times'))\n", |
| 1842 | + "show_time('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/megablast/delta_variants_query.times')\n", |
| 1843 | + "show_time('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/pufferfish_sparse/delta_variants_align.times')\n", |
| 1844 | + "show_time('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/blast/delta_variants_query.times')\n", |
| 1845 | + "show_time('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/mtg/delta_variants_align.times')" |
1715 | 1846 | ]
|
1716 | 1847 | },
|
1717 | 1848 | {
|
1718 | 1849 | "cell_type": "code",
|
1719 |
| - "execution_count": 32, |
1720 |
| - "id": "cae57556", |
| 1850 | + "execution_count": 202, |
| 1851 | + "id": "31c0f6a7", |
| 1852 | + "metadata": { |
| 1853 | + "ExecuteTime": { |
| 1854 | + "end_time": "2022-04-26T13:16:29.246970Z", |
| 1855 | + "start_time": "2022-04-26T13:15:07.360418Z" |
| 1856 | + } |
| 1857 | + }, |
| 1858 | + "outputs": [ |
| 1859 | + { |
| 1860 | + "name": "stdout", |
| 1861 | + "output_type": "stream", |
| 1862 | + "text": [ |
| 1863 | + "0.09, 0.49\n", |
| 1864 | + "0.06, 1.06\n", |
| 1865 | + "0.08, 0.10\n", |
| 1866 | + "0.15, 0.40\n" |
| 1867 | + ] |
| 1868 | + } |
| 1869 | + ], |
| 1870 | + "source": [ |
| 1871 | + "def show_time(times_file):\n", |
| 1872 | + " df = pd.read_csv(times_file, header=None, sep=' ', names=['id', 'time'], index_col=0)\n", |
| 1873 | + " df = df.loc[np.array([x.split('/')[-1][:-len('_no_header.fasta.gz')] for x in gzip])]\n", |
| 1874 | + " for i in range(df.shape[0]):\n", |
| 1875 | + " try:\n", |
| 1876 | + " df.iloc[i].time = to_seconds(df.iloc[i].time)\n", |
| 1877 | + " except:\n", |
| 1878 | + " print(df.iloc[i])\n", |
| 1879 | + " df.iloc[i].time = 0\n", |
| 1880 | + " print(f'{df.time.mean():.2f}, {df.time.std():.2f}')\n", |
| 1881 | + "\n", |
| 1882 | + "#show_time('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/mtg/delta_variants_query.times')\n", |
| 1883 | + "show_time('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/megablast/delta_variants_query.times')\n", |
| 1884 | + "show_time('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/pufferfish_sparse/delta_variants_align.times')\n", |
| 1885 | + "show_time('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/blast/delta_variants_query.times')\n", |
| 1886 | + "show_time('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/mtg/delta_variants_align.times')" |
| 1887 | + ] |
| 1888 | + }, |
| 1889 | + { |
| 1890 | + "cell_type": "code", |
| 1891 | + "execution_count": 61, |
| 1892 | + "id": "dc2bfb02", |
1721 | 1893 | "metadata": {
|
1722 | 1894 | "ExecuteTime": {
|
1723 |
| - "end_time": "2021-11-09T14:14:37.183886Z", |
1724 |
| - "start_time": "2021-11-09T14:14:37.166181Z" |
| 1895 | + "end_time": "2022-04-22T14:33:47.588583Z", |
| 1896 | + "start_time": "2022-04-22T14:33:47.431837Z" |
1725 | 1897 | }
|
1726 | 1898 | },
|
1727 | 1899 | "outputs": [],
|
1728 | 1900 | "source": [
|
1729 |
| - "with open('kingsford_mtg_size.pkl', 'rb') as f:\n", |
1730 |
| - " mtg_size = pickle.load(f)\n", |
1731 |
| - "\n", |
1732 |
| - "# mtg_size = []\n", |
1733 |
| - "\n", |
1734 |
| - "# DIR = !echo $HOME\n", |
1735 |
| - "# DIR = f'{DIR[0]}/metagenome/data/kingsford_31_coordinates/'\n", |
1736 |
| - "\n", |
1737 |
| - "# for x in tqdm(gzip, mininterval=1):\n", |
1738 |
| - "# ID = x.split('/')[-1][:-len('_no_header.fasta.gz')]\n", |
1739 |
| - "# graph_size = get_size(f'{DIR}/{ID}.fasta.gz/graph_small.dbg')\n", |
1740 |
| - "# graph_size += get_size(f'{DIR}/{ID}.fasta.gz/graph.dbg.rd_succ')\n", |
1741 |
| - "# graph_size += get_size(f'{DIR}/{ID}.fasta.gz/graph.dbg.anchors')\n", |
1742 |
| - "# column_fname = f'{DIR}/{ID}.fasta.gz/rd_columns/annotation.column.annodbg'\n", |
1743 |
| - "# coords_fname = f'{DIR}/{ID}.fasta.gz/rd_columns/annotation.column.annodbg.coords'\n", |
1744 |
| - "# mtg_size.append(graph_size + get_size(column_fname) + get_size(coords_fname))\n", |
1745 |
| - "\n", |
1746 |
| - "# mtg_size = np.array(mtg_size)\n", |
1747 |
| - "\n", |
1748 |
| - "# with open('kingsford_mtg_size.pkl', 'wb') as f:\n", |
1749 |
| - "# pickle.dump(mtg_size, f)" |
| 1901 | + "df = pd.read_csv('~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/mtg/delta_variants_align_chain_optim.times', header=None, sep=' ', names=['id', 'time'], index_col=0)" |
1750 | 1902 | ]
|
1751 | 1903 | },
|
1752 | 1904 | {
|
1753 | 1905 | "cell_type": "code",
|
1754 |
| - "execution_count": 33, |
1755 |
| - "id": "09188d87", |
| 1906 | + "execution_count": 66, |
| 1907 | + "id": "92903856", |
1756 | 1908 | "metadata": {
|
1757 | 1909 | "ExecuteTime": {
|
1758 |
| - "end_time": "2021-11-09T14:14:37.201542Z", |
1759 |
| - "start_time": "2021-11-09T14:14:37.185480Z" |
1760 |
| - }, |
1761 |
| - "scrolled": true |
| 1910 | + "end_time": "2022-04-22T14:34:52.219848Z", |
| 1911 | + "start_time": "2022-04-22T14:34:32.462326Z" |
| 1912 | + } |
1762 | 1913 | },
|
1763 | 1914 | "outputs": [],
|
1764 | 1915 | "source": [
|
1765 |
| - "with open('kingsford_mtg_2_size.pkl', 'rb') as f:\n", |
1766 |
| - " mtg2_size = pickle.load(f)\n", |
1767 |
| - "\n", |
1768 |
| - "# mtg2_size = []\n", |
1769 |
| - "\n", |
1770 |
| - "# DIR = !echo $HOME\n", |
1771 |
| - "# DIR = f'{DIR[0]}/metagenome/data/kingsford_31_coordinates_fork_opt/'\n", |
1772 |
| - "\n", |
1773 |
| - "# for x in tqdm(gzip, mininterval=1):\n", |
1774 |
| - "# ID = x.split('/')[-1][:-len('_no_header.fasta.gz')]\n", |
1775 |
| - "# graph_size = get_size(f'{DIR}../kingsford_31_coordinates/{ID}.fasta.gz/graph_small.dbg')\n", |
1776 |
| - "# graph_size += get_size(f'{DIR}/{ID}.fasta.gz/graph.dbg.rd_succ')\n", |
| 1916 | + "for i in range(df.shape[0]):\n", |
| 1917 | + "    df.iloc[i, 0] = to_seconds(df.iloc[i, 0])  # one-step indexing so the write reaches df, not a row copy" |
| 1918 | + ] |
| 1919 | + }, |
| 1920 | + { |
| 1921 | + "cell_type": "code", |
| 1922 | + "execution_count": 69, |
| 1923 | + "id": "a691b9a5", |
| 1924 | + "metadata": { |
| 1925 | + "ExecuteTime": { |
| 1926 | + "end_time": "2022-04-22T14:35:05.536232Z", |
| 1927 | + "start_time": "2022-04-22T14:35:05.499065Z" |
| 1928 | + } |
| 1929 | + }, |
| 1930 | + "outputs": [ |
| 1931 | + { |
| 1932 | + "data": { |
| 1933 | + "text/html": [ |
| 1934 | + "<div>\n", |
| 1935 | + "<style scoped>\n", |
| 1936 | + " .dataframe tbody tr th:only-of-type {\n", |
| 1937 | + " vertical-align: middle;\n", |
| 1938 | + " }\n", |
| 1939 | + "\n", |
| 1940 | + " .dataframe tbody tr th {\n", |
| 1941 | + " vertical-align: top;\n", |
| 1942 | + " }\n", |
| 1943 | + "\n", |
| 1944 | + " .dataframe thead th {\n", |
| 1945 | + " text-align: right;\n", |
| 1946 | + " }\n", |
| 1947 | + "</style>\n", |
| 1948 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 1949 | + " <thead>\n", |
| 1950 | + " <tr style=\"text-align: right;\">\n", |
| 1951 | + " <th></th>\n", |
| 1952 | + " <th>time</th>\n", |
| 1953 | + " </tr>\n", |
| 1954 | + " <tr>\n", |
| 1955 | + " <th>id</th>\n", |
| 1956 | + " <th></th>\n", |
| 1957 | + " </tr>\n", |
| 1958 | + " </thead>\n", |
| 1959 | + " <tbody>\n", |
| 1960 | + " <tr>\n", |
| 1961 | + " <th>SRR13144527</th>\n", |
| 1962 | + " <td>2133.2</td>\n", |
| 1963 | + " </tr>\n", |
| 1964 | + " </tbody>\n", |
| 1965 | + "</table>\n", |
| 1966 | + "</div>" |
| 1967 | + ], |
| 1968 | + "text/plain": [ |
| 1969 | + " time\n", |
| 1970 | + "id \n", |
| 1971 | + "SRR13144527 2133.2" |
| 1972 | + ] |
| 1973 | + }, |
| 1974 | + "execution_count": 69, |
| 1975 | + "metadata": {}, |
| 1976 | + "output_type": "execute_result" |
| 1977 | + } |
| 1978 | + ], |
| 1979 | + "source": [ |
| 1980 | + "df[df.time == 2133.2]" |
| 1981 | + ] |
| 1982 | + }, |
| 1983 | + { |
| 1984 | + "cell_type": "markdown", |
| 1985 | + "id": "0720b2d6", |
| 1986 | + "metadata": {}, |
| 1987 | + "source": [ |
| 1988 | + "### Align to RefSeq" |
| 1989 | + ] |
| 1990 | + }, |
| 1991 | + { |
| 1992 | + "cell_type": "markdown", |
| 1993 | + "id": "06c11e73", |
| 1994 | + "metadata": {}, |
| 1995 | + "source": [ |
| 1996 | + "#### Fungi" |
| 1997 | + ] |
| 1998 | + }, |
| 1999 | + { |
| 2000 | + "cell_type": "markdown", |
| 2001 | + "id": "2da4c496", |
| 2002 | + "metadata": {}, |
| 2003 | + "source": [ |
| 2004 | + "BLAST" |
| 2005 | + ] |
| 2006 | + }, |
| 2007 | + { |
| 2008 | + "cell_type": "code", |
| 2009 | + "execution_count": 122, |
| 2010 | + "id": "94d6597f", |
| 2011 | + "metadata": { |
| 2012 | + "ExecuteTime": { |
| 2013 | + "end_time": "2022-04-24T16:03:53.380371Z", |
| 2014 | + "start_time": "2022-04-24T16:03:53.286701Z" |
| 2015 | + } |
| 2016 | + }, |
| 2017 | + "outputs": [ |
| 2018 | + { |
| 2019 | + "name": "stdout", |
| 2020 | + "output_type": "stream", |
| 2021 | + "text": [ |
| 2022 | + "Runs: 100\n", |
| 2023 | + "2.07 sec\n", |
| 2024 | + "2.175 GB\n" |
| 2025 | + ] |
| 2026 | + } |
| 2027 | + ], |
| 2028 | + "source": [ |
| 2029 | + "FILE = '/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_try2/SRR10002688_1.shuf.head100.split.fungi.blast.log'\n", |
| 2030 | + "\n", |
| 2031 | + "lines = !grep 'wall' $FILE\n", |
| 2032 | + "\n", |
| 2033 | + "print('Runs:', len(lines))\n", |
| 2034 | + "print('{:.2f} sec'.format(np.mean([to_seconds(x.split(' ')[-1]) for x in lines])))\n", |
| 2035 | + "\n", |
| 2036 | + "lines = !grep 'Maximum resident set size' $FILE\n", |
| 2037 | + "\n", |
| 2038 | + "print('{:.3f} GB'.format(np.mean([float(x.split(' ')[-1]) / 1e6 for x in lines])))" |
| 2039 | + ] |
| 2040 | + }, |
| 2041 | + { |
| 2042 | + "cell_type": "markdown", |
| 2043 | + "id": "24ea90f0", |
| 2044 | + "metadata": { |
| 2045 | + "ExecuteTime": { |
| 2046 | + "end_time": "2022-04-24T16:03:29.832878Z", |
| 2047 | + "start_time": "2022-04-24T16:03:29.403798Z" |
| 2048 | + } |
| 2049 | + }, |
| 2050 | + "source": [ |
| 2051 | + "MegaBLAST" |
| 2052 | + ] |
| 2053 | + }, |
| 2054 | + { |
| 2055 | + "cell_type": "code", |
| 2056 | + "execution_count": 123, |
| 2057 | + "id": "2a859bd3", |
| 2058 | + "metadata": { |
| 2059 | + "ExecuteTime": { |
| 2060 | + "end_time": "2022-04-24T16:03:58.523237Z", |
| 2061 | + "start_time": "2022-04-24T16:03:58.441588Z" |
| 2062 | + } |
| 2063 | + }, |
| 2064 | + "outputs": [ |
| 2065 | + { |
| 2066 | + "name": "stdout", |
| 2067 | + "output_type": "stream", |
| 2068 | + "text": [ |
| 2069 | + "Runs: 100\n", |
| 2070 | + "4.31 sec\n", |
| 2071 | + "0.034 GB\n" |
| 2072 | + ] |
| 2073 | + } |
| 2074 | + ], |
| 2075 | + "source": [ |
| 2076 | + "FILE = '/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_try2/SRR10002688_1.shuf.head100.split.fungi.megablast.log'\n", |
| 2077 | + "\n", |
| 2078 | + "lines = !grep 'wall' $FILE\n", |
| 2079 | + "\n", |
| 2080 | + "print('Runs:', len(lines))\n", |
| 2081 | + "print('{:.2f} sec'.format(np.mean([to_seconds(x.split(' ')[-1]) for x in lines])))\n", |
| 2082 | + "\n", |
| 2083 | + "lines = !grep 'Maximum resident set size' $FILE\n", |
| 2084 | + "\n", |
| 2085 | + "print('{:.3f} GB'.format(np.mean([float(x.split(' ')[-1]) / 1e6 for x in lines])))" |
| 2086 | + ] |
| 2087 | + }, |
| 2088 | + { |
| 2089 | + "cell_type": "markdown", |
| 2090 | + "id": "65da4202", |
| 2091 | + "metadata": { |
| 2092 | + "ExecuteTime": { |
| 2093 | + "end_time": "2022-04-24T16:05:28.998442Z", |
| 2094 | + "start_time": "2022-04-24T16:05:28.513571Z" |
| 2095 | + } |
| 2096 | + }, |
| 2097 | + "source": [ |
| 2098 | + "MetaGraph" |
| 2099 | + ] |
| 2100 | + }, |
| 2101 | + { |
| 2102 | + "cell_type": "code", |
| 2103 | + "execution_count": 226, |
| 2104 | + "id": "2b8e817e", |
| 2105 | + "metadata": { |
| 2106 | + "ExecuteTime": { |
| 2107 | + "end_time": "2022-04-27T08:36:59.711362Z", |
| 2108 | + "start_time": "2022-04-27T08:36:59.617953Z" |
| 2109 | + } |
| 2110 | + }, |
| 2111 | + "outputs": [ |
| 2112 | + { |
| 2113 | + "name": "stdout", |
| 2114 | + "output_type": "stream", |
| 2115 | + "text": [ |
| 2116 | + "Runs: 100\n", |
| 2117 | + "0.021 sec\n", |
| 2118 | + "3.4 GB\n" |
| 2119 | + ] |
| 2120 | + } |
| 2121 | + ], |
| 2122 | + "source": [ |
| 2123 | + "FILE = '/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_mtg_final/SRR10002688_1.shuf.head100.split.fungi.align.log'\n", |
| 2124 | + "\n", |
| 2125 | + "lines = !grep 'processed in' $FILE\n", |
| 2126 | + "\n", |
| 2127 | + "print('Runs:', len(lines))\n", |
| 2128 | + "print('{:.3f} sec'.format(np.mean([float(x.split(' ')[7]) for x in lines])))\n", |
| 2129 | + "\n", |
| 2130 | + "lines = !grep 'Maximum resident set size' $FILE\n", |
| 2131 | + "print('{:.1f} GB'.format(np.mean([float(x.split(' ')[-1]) / 1e6 for x in lines])))" |
| 2132 | + ] |
| 2133 | + }, |
| 2134 | + { |
| 2135 | + "cell_type": "code", |
| 2136 | + "execution_count": 228, |
| 2137 | + "id": "13a32580", |
| 2138 | + "metadata": { |
| 2139 | + "ExecuteTime": { |
| 2140 | + "end_time": "2022-04-27T08:37:45.259072Z", |
| 2141 | + "start_time": "2022-04-27T08:37:44.280431Z" |
| 2142 | + } |
| 2143 | + }, |
| 2144 | + "outputs": [ |
| 2145 | + { |
| 2146 | + "name": "stdout", |
| 2147 | + "output_type": "stream", |
| 2148 | + "text": [ |
| 2149 | + "Runs: 1\n", |
| 2150 | + "14.76 sec\n", |
| 2151 | + "3.5 GB\n" |
| 2152 | + ] |
| 2153 | + } |
| 2154 | + ], |
| 2155 | + "source": [ |
| 2156 | + "FILE = '/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_mtg_final/SRR10002688_1.shuf.head1000.fa.fungi.align.log'\n", |
| 2157 | + "\n", |
| 2158 | + "lines = !grep 'processed in' $FILE\n", |
| 2159 | + "\n", |
| 2160 | + "print('Runs:', len(lines))\n", |
| 2161 | + "print('{:.2f} sec'.format(np.mean([float(x.split(' ')[7]) for x in lines])))\n", |
| 2162 | + "\n", |
| 2163 | + "lines = !grep 'Maximum resident set size' $FILE\n", |
| 2164 | + "print('{:.1f} GB'.format(np.mean([float(x.split(' ')[-1]) / 1e6 for x in lines])))" |
| 2165 | + ] |
| 2166 | + }, |
| 2167 | + { |
| 2168 | + "cell_type": "markdown", |
| 2169 | + "id": "8674e8ec", |
| 2170 | + "metadata": {}, |
| 2171 | + "source": [ |
| 2172 | + "#### All" |
| 2173 | + ] |
| 2174 | + }, |
| 2175 | + { |
| 2176 | + "cell_type": "markdown", |
| 2177 | + "id": "81ef66ec", |
| 2178 | + "metadata": {}, |
| 2179 | + "source": [ |
| 2180 | + "BLAST" |
| 2181 | + ] |
| 2182 | + }, |
| 2183 | + { |
| 2184 | + "cell_type": "code", |
| 2185 | + "execution_count": 198, |
| 2186 | + "id": "26f9f305", |
| 2187 | + "metadata": { |
| 2188 | + "ExecuteTime": { |
| 2189 | + "end_time": "2022-04-27T07:37:48.430567Z", |
| 2190 | + "start_time": "2022-04-27T07:37:48.337182Z" |
| 2191 | + }, |
| 2192 | + "scrolled": true |
| 2193 | + }, |
| 2194 | + "outputs": [ |
| 2195 | + { |
| 2196 | + "name": "stdout", |
| 2197 | + "output_type": "stream", |
| 2198 | + "text": [ |
| 2199 | + "Runs: 100\n", |
| 2200 | + "353 sec\n", |
| 2201 | + "417 GB\n" |
| 2202 | + ] |
| 2203 | + } |
| 2204 | + ], |
| 2205 | + "source": [ |
| 2206 | + "FILES = '/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_try3/nobackup/SRR10002688_1.shuf.head1000.split.SRR10002688_1.shuf.head1000.*.fa.blast.log'\n", |
| 2207 | + "\n", |
| 2208 | + "lines = !grep 'wall' $FILES\n", |
| 2209 | + "\n", |
| 2210 | + "print('Runs:', len(lines))\n", |
| 2211 | + "print('{:.0f} sec'.format(np.mean([to_seconds(x.split(' ')[-1]) for x in lines])))\n", |
| 2212 | + "\n", |
| 2213 | + "lines = !grep 'Maximum resident set size' $FILES\n", |
| 2214 | + "\n", |
| 2215 | + "print('{:.0f} GB'.format(np.mean([float(x.split(' ')[-1]) / 1e6 for x in lines])))" |
| 2216 | + ] |
| 2217 | + }, |
| 2218 | + { |
| 2219 | + "cell_type": "code", |
| 2220 | + "execution_count": 196, |
| 2221 | + "id": "6da9777f", |
| 2222 | + "metadata": { |
| 2223 | + "ExecuteTime": { |
| 2224 | + "end_time": "2022-04-26T08:58:01.674094Z", |
| 2225 | + "start_time": "2022-04-26T08:58:01.592426Z" |
| 2226 | + } |
| 2227 | + }, |
| 2228 | + "outputs": [ |
| 2229 | + { |
| 2230 | + "name": "stdout", |
| 2231 | + "output_type": "stream", |
| 2232 | + "text": [ |
| 2233 | + "Runs: 1\n", |
| 2234 | + "1857 sec\n", |
| 2235 | + "428 GB\n" |
| 2236 | + ] |
| 2237 | + } |
| 2238 | + ], |
| 2239 | + "source": [ |
| 2240 | + "FILE = '/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_try2/SRR10002688_1.shuf.head1000.fa.refseq.blast.log'\n", |
| 2241 | + "\n", |
| 2242 | + "lines = !grep 'wall' $FILE\n", |
| 2243 | + "\n", |
| 2244 | + "print('Runs:', len(lines))\n", |
| 2245 | + "print('{:.0f} sec'.format(np.mean([to_seconds(x.split(' ')[-1]) for x in lines])))\n", |
| 2246 | + "\n", |
| 2247 | + "lines = !grep 'Maximum resident set size' $FILE\n", |
| 2248 | + "\n", |
| 2249 | + "print('{:.0f} GB'.format(np.mean([float(x.split(' ')[-1]) / 1e6 for x in lines])))" |
| 2250 | + ] |
| 2251 | + }, |
| 2252 | + { |
| 2253 | + "cell_type": "markdown", |
| 2254 | + "id": "2751f9a0", |
| 2255 | + "metadata": { |
| 2256 | + "ExecuteTime": { |
| 2257 | + "end_time": "2022-04-24T16:03:29.832878Z", |
| 2258 | + "start_time": "2022-04-24T16:03:29.403798Z" |
| 2259 | + } |
| 2260 | + }, |
| 2261 | + "source": [ |
| 2262 | + "MegaBLAST" |
| 2263 | + ] |
| 2264 | + }, |
| 2265 | + { |
| 2266 | + "cell_type": "code", |
| 2267 | + "execution_count": 205, |
| 2268 | + "id": "72680fa3", |
| 2269 | + "metadata": { |
| 2270 | + "ExecuteTime": { |
| 2271 | + "end_time": "2022-04-27T07:29:01.194861Z", |
| 2272 | + "start_time": "2022-04-27T07:28:50.008327Z" |
| 2273 | + }, |
| 2274 | + "scrolled": true |
| 2275 | + }, |
| 2276 | + "outputs": [ |
| 2277 | + { |
| 2278 | + "name": "stdout", |
| 2279 | + "output_type": "stream", |
| 2280 | + "text": [ |
| 2281 | + "Runs: 100\n", |
| 2282 | + "Runs: 100\n", |
| 2283 | + "Runs: 100\n", |
| 2284 | + "Runs: 100\n", |
| 2285 | + "Runs: 100\n", |
| 2286 | + "12.5 sec\n", |
| 2287 | + "0.090 GB\n" |
| 2288 | + ] |
| 2289 | + } |
| 2290 | + ], |
| 2291 | + "source": [ |
| 2292 | + "times = []\n", |
| 2293 | + "rams = []\n", |
| 2294 | + "\n", |
| 2295 | + "for i in [0, 1, 2, 3, 4]:\n", |
| 2296 | + " prefix = 'fatnode_try3' if i >= 2 else 'fatnode_try4'\n", |
| 2297 | + " FILES = f'/cluster/work/grlab/projects/metagenome/data/refseq/queries/{prefix}/nobackup/refseq_{i}.SRR10002688_1.shuf.head1000.split.SRR10002688_1.shuf.head1000.*.fa.megablast.log'\n", |
| 2298 | + "\n", |
| 2299 | + " lines = !grep 'wall' $FILES\n", |
| 2300 | + " print('Runs:', len(lines))\n", |
| 2301 | + "\n", |
| 2302 | + " times.append(np.mean([to_seconds(x.split(' ')[-1]) for x in lines]))\n", |
| 2303 | + "\n", |
| 2304 | + " lines = !grep 'Maximum resident set size' $FILES\n", |
| 2305 | + " rams.append(np.mean([float(x.split(' ')[-1]) / 1e6 for x in lines]))\n", |
| 2306 | + "\n", |
| 2307 | + "print('{:.1f} sec'.format(np.sum(times)))\n", |
| 2308 | + "print('{:.3f} GB'.format(np.max(rams)))" |
| 2309 | + ] |
| 2310 | + }, |
| 2311 | + { |
| 2312 | + "cell_type": "code", |
| 2313 | + "execution_count": 216, |
| 2314 | + "id": "4e300d36", |
| 2315 | + "metadata": { |
| 2316 | + "ExecuteTime": { |
| 2317 | + "end_time": "2022-04-27T07:37:48.430567Z", |
| 2318 | + "start_time": "2022-04-27T07:37:48.337182Z" |
| 2319 | + }, |
| 2320 | + "scrolled": true |
| 2321 | + }, |
| 2322 | + "outputs": [ |
| 2323 | + { |
| 2324 | + "name": "stdout", |
| 2325 | + "output_type": "stream", |
| 2326 | + "text": [ |
| 2327 | + "/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_try3/nobackup/refseq_2.SRR10002688_1.shuf.head1000.SRR10002688_1.shuf.head1000.fa.megablast.log:\tElapsed (wall clock) time (h:mm:ss or m:ss): 2:52.64\n", |
| 2328 | + "/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_try3/nobackup/refseq_3.SRR10002688_1.shuf.head1000.SRR10002688_1.shuf.head1000.fa.megablast.log:\tElapsed (wall clock) time (h:mm:ss or m:ss): 6:21.01\n", |
| 2329 | + "/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_try3/nobackup/refseq_4.SRR10002688_1.shuf.head1000.SRR10002688_1.shuf.head1000.fa.megablast.log:\tElapsed (wall clock) time (h:mm:ss or m:ss): 3:15.47\n", |
| 2330 | + "/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_try4/nobackup/refseq_0.SRR10002688_1.shuf.head1000.SRR10002688_1.shuf.head1000.fa.megablast.log:\tElapsed (wall clock) time (h:mm:ss or m:ss): 6:10.30\n", |
| 2331 | + "/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_try4/nobackup/refseq_1.SRR10002688_1.shuf.head1000.SRR10002688_1.shuf.head1000.fa.megablast.log:\tElapsed (wall clock) time (h:mm:ss or m:ss): 7:02.20\n", |
| 2332 | + "Runs: 5\n", |
| 2333 | + "1542 sec\n", |
| 2334 | + "22.0 GB\n" |
| 2335 | + ] |
| 2336 | + } |
| 2337 | + ], |
| 2338 | + "source": [ |
| 2339 | + "FILES = '/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_try*/nobackup/refseq_*.SRR10002688_1.shuf.head1000.SRR10002688_1.shuf.head1000.fa.megablast.log'\n", |
| 2340 | + "\n", |
| 2341 | + "lines = !grep 'wall' $FILES\n", |
| 2342 | + "for x in lines:\n", |
| 2343 | + " print(x)\n", |
| 2344 | + "\n", |
| 2345 | + "print('Runs:', len(lines))\n", |
| 2346 | + "print('{:.0f} sec'.format(np.sum([to_seconds(x.split(' ')[-1]) for x in lines])))\n", |
| 2347 | + "\n", |
| 2348 | + "lines = !grep 'Maximum resident set size' $FILES\n", |
| 2349 | + "\n", |
| 2350 | + "print('{:.1f} GB'.format(np.max([float(x.split(' ')[-1]) / 1e6 for x in lines])))" |
| 2351 | + ] |
| 2352 | + }, |
| 2353 | + { |
| 2354 | + "cell_type": "markdown", |
| 2355 | + "id": "26b7b7b3", |
| 2356 | + "metadata": { |
| 2357 | + "ExecuteTime": { |
| 2358 | + "end_time": "2022-04-24T16:05:28.998442Z", |
| 2359 | + "start_time": "2022-04-24T16:05:28.513571Z" |
| 2360 | + } |
| 2361 | + }, |
| 2362 | + "source": [ |
| 2363 | + "MetaGraph" |
| 2364 | + ] |
| 2365 | + }, |
| 2366 | + { |
| 2367 | + "cell_type": "code", |
| 2368 | + "execution_count": 220, |
| 2369 | + "id": "671888f0", |
| 2370 | + "metadata": { |
| 2371 | + "ExecuteTime": { |
| 2372 | + "end_time": "2022-04-27T08:34:15.332054Z", |
| 2373 | + "start_time": "2022-04-27T08:34:15.250214Z" |
| 2374 | + } |
| 2375 | + }, |
| 2376 | + "outputs": [ |
| 2377 | + { |
| 2378 | + "name": "stdout", |
| 2379 | + "output_type": "stream", |
| 2380 | + "text": [ |
| 2381 | + "Runs: 100\n", |
| 2382 | + "0.66 sec\n", |
| 2383 | + "500 GB\n" |
| 2384 | + ] |
| 2385 | + } |
| 2386 | + ], |
| 2387 | + "source": [ |
| 2388 | + "FILE = '/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_mtg_final/SRR10002688_1.shuf.head100.split.refseq.align.log'\n", |
| 2389 | + "\n", |
| 2390 | + "lines = !grep 'processed in' $FILE\n", |
| 2391 | + "\n", |
| 2392 | + "print('Runs:', len(lines))\n", |
| 2393 | + "print('{:.2f} sec'.format(np.mean([float(x.split(' ')[7]) for x in lines])))\n", |
| 2394 | + "\n", |
| 2395 | + "lines = !grep 'Maximum resident set size' $FILE\n", |
| 2396 | + "print('{:.0f} GB'.format(np.mean([float(x.split(' ')[-1]) / 1e6 for x in lines])))" |
| 2397 | + ] |
| 2398 | + }, |
| 2399 | + { |
| 2400 | + "cell_type": "code", |
| 2401 | + "execution_count": 218, |
| 2402 | + "id": "56c3071f", |
| 2403 | + "metadata": { |
| 2404 | + "ExecuteTime": { |
| 2405 | + "end_time": "2022-04-27T08:33:30.383423Z", |
| 2406 | + "start_time": "2022-04-27T08:33:30.249041Z" |
| 2407 | + } |
| 2408 | + }, |
| 2409 | + "outputs": [ |
| 2410 | + { |
| 2411 | + "name": "stdout", |
| 2412 | + "output_type": "stream", |
| 2413 | + "text": [ |
| 2414 | + "Runs: 1\n", |
| 2415 | + "575 sec\n", |
| 2416 | + "513 GB\n" |
| 2417 | + ] |
| 2418 | + } |
| 2419 | + ], |
| 2420 | + "source": [ |
| 2421 | + "FILE = '/cluster/work/grlab/projects/metagenome/data/refseq/queries/fatnode_mtg_final/SRR10002688_1.shuf.head1000.fa.refseq.align.log'\n", |
| 2422 | + "\n", |
| 2423 | + "lines = !grep 'processed in' $FILE\n", |
| 2424 | + "\n", |
| 2425 | + "print('Runs:', len(lines))\n", |
| 2426 | + "print('{:.0f} sec'.format(np.mean([float(x.split(' ')[7]) for x in lines])))\n", |
| 2427 | + "\n", |
| 2428 | + "lines = !grep 'Maximum resident set size' $FILE\n", |
| 2429 | + "print('{:.0f} GB'.format(np.mean([float(x.split(' ')[-1]) / 1e6 for x in lines])))" |
| 2430 | + ] |
| 2431 | + }, |
| 2432 | + { |
| 2433 | + "cell_type": "markdown", |
| 2434 | + "id": "03209b5d", |
| 2435 | + "metadata": {}, |
| 2436 | + "source": [ |
| 2437 | + "## Illumina RNA-Seq reads" |
| 2438 | + ] |
| 2439 | + }, |
| 2440 | + { |
| 2441 | + "cell_type": "code", |
| 2442 | + "execution_count": 22, |
| 2443 | + "id": "3d872be3", |
| 2444 | + "metadata": { |
| 2445 | + "ExecuteTime": { |
| 2446 | + "end_time": "2022-01-17T19:59:44.156468Z", |
| 2447 | + "start_time": "2022-01-17T19:59:43.671793Z" |
| 2448 | + } |
| 2449 | + }, |
| 2450 | + "outputs": [], |
| 2451 | + "source": [ |
| 2452 | + "with open('kingsford_gzip_spring.pkl', 'rb') as f:\n", |
| 2453 | + " gzip, read_length, num_bp, gzip_size, fasta_size, spring_size = pickle.load(f)\n", |
| 2454 | + "\n", |
| 2455 | + "# gzip = !ls ~/metagenome/data/kingsford/compressed/*_no_header.fasta.gz\n", |
| 2456 | + "\n", |
| 2457 | + "# num_bp = []\n", |
| 2458 | + "# read_length = []\n", |
| 2459 | + "# spring_size = []\n", |
| 2460 | + "# gzip_size = []\n", |
| 2461 | + "# fasta_size = []\n", |
| 2462 | + "\n", |
| 2463 | + "# for x in tqdm(gzip, mininterval=1):\n", |
| 2464 | + "# gzip_size.append(get_size(x))\n", |
| 2465 | + "# fasta_size.append(get_size(x[:-len('.gz')]))\n", |
| 2466 | + "# ID = x[:-len('_no_header.fasta.gz')]\n", |
| 2467 | + "# spring_fname = f'{ID}_no_header.spring'\n", |
| 2468 | + "# spring_size.append(get_size(spring_fname))\n", |
| 2469 | + "# SRA = ID.split('/')[-1]\n", |
| 2470 | + "# fname = x[:-len('_no_header.fasta.gz')] + '.num_bp'\n", |
| 2471 | + "# with open(fname, 'r') as f:\n", |
| 2472 | + "# n = int(f.readline())\n", |
| 2473 | + "# num_bp.append(n)\n", |
| 2474 | + "\n", |
| 2475 | + "# with gz.open(x, 'r') as f:\n", |
| 2476 | + "# f.readline().strip().decode()\n", |
| 2477 | + "# seq = f.readline().strip().decode()\n", |
| 2478 | + "# L = len(seq)\n", |
| 2479 | + "# all_same_len = True\n", |
| 2480 | + "# for i in range(1000): # check 1000 first reads\n", |
| 2481 | + "# f.readline().strip().decode()\n", |
| 2482 | + "# if L != len(f.readline().strip().decode()):\n", |
| 2483 | + "# all_same_len = False\n", |
| 2484 | + "# break\n", |
| 2485 | + "# read_length.append(L if all_same_len and num_bp[-1] % L == 0 else np.nan)\n", |
| 2486 | + "\n", |
| 2487 | + "# gzip = np.array(gzip)\n", |
| 2488 | + "# num_bp = np.array(num_bp)\n", |
| 2489 | + "# spring_size = np.array(spring_size)\n", |
| 2490 | + "# gzip_size = np.array(gzip_size)\n", |
| 2491 | + "# fasta_size = np.array(fasta_size)\n", |
| 2492 | + "# read_length = np.array(read_length)\n", |
| 2493 | + "\n", |
| 2494 | + "# with open('kingsford_gzip_spring.pkl', 'wb') as f:\n", |
| 2495 | + "# pickle.dump((gzip, read_length, num_bp, gzip_size, fasta_size, spring_size), f)" |
| 2496 | + ] |
| 2497 | + }, |
| 2498 | + { |
| 2499 | + "cell_type": "code", |
| 2500 | + "execution_count": 23, |
| 2501 | + "id": "4ffcdd65", |
| 2502 | + "metadata": { |
| 2503 | + "ExecuteTime": { |
| 2504 | + "end_time": "2022-01-17T19:59:46.692236Z", |
| 2505 | + "start_time": "2022-01-17T19:59:46.667700Z" |
| 2506 | + }, |
| 2507 | + "scrolled": false |
| 2508 | + }, |
| 2509 | + "outputs": [ |
| 2510 | + { |
| 2511 | + "data": { |
| 2512 | + "text/plain": [ |
| 2513 | + "(2652, 7.973392066664)" |
| 2514 | + ] |
| 2515 | + }, |
| 2516 | + "execution_count": 23, |
| 2517 | + "metadata": {}, |
| 2518 | + "output_type": "execute_result" |
| 2519 | + } |
| 2520 | + ], |
| 2521 | + "source": [ |
| 2522 | + "len(num_bp), sum(num_bp) / 1e12" |
| 2523 | + ] |
| 2524 | + }, |
| 2525 | + { |
| 2526 | + "cell_type": "code", |
| 2527 | + "execution_count": 24, |
| 2528 | + "id": "cae57556", |
| 2529 | + "metadata": { |
| 2530 | + "ExecuteTime": { |
| 2531 | + "end_time": "2022-01-17T19:59:49.473344Z", |
| 2532 | + "start_time": "2022-01-17T19:59:49.448238Z" |
| 2533 | + } |
| 2534 | + }, |
| 2535 | + "outputs": [], |
| 2536 | + "source": [ |
| 2537 | + "with open('kingsford_mtg_size.pkl', 'rb') as f:\n", |
| 2538 | + " mtg_size = pickle.load(f)\n", |
| 2539 | + "\n", |
| 2540 | + "# mtg_size = []\n", |
| 2541 | + "\n", |
| 2542 | + "# DIR = !echo $HOME\n", |
| 2543 | + "# DIR = f'{DIR[0]}/metagenome/data/kingsford_31_coordinates/'\n", |
| 2544 | + "\n", |
| 2545 | + "# for x in tqdm(gzip, mininterval=1):\n", |
| 2546 | + "# ID = x.split('/')[-1][:-len('_no_header.fasta.gz')]\n", |
| 2547 | + "# graph_size = get_size(f'{DIR}/{ID}.fasta.gz/graph_small.dbg')\n", |
| 2548 | + "# graph_size += get_size(f'{DIR}/{ID}.fasta.gz/graph.dbg.rd_succ')\n", |
| 2549 | + "# graph_size += get_size(f'{DIR}/{ID}.fasta.gz/graph.dbg.anchors')\n", |
| 2550 | + "# column_fname = f'{DIR}/{ID}.fasta.gz/rd_columns/annotation.column.annodbg'\n", |
| 2551 | + "# coords_fname = f'{DIR}/{ID}.fasta.gz/rd_columns/annotation.column.annodbg.coords'\n", |
| 2552 | + "# mtg_size.append(graph_size + get_size(column_fname) + get_size(coords_fname))\n", |
| 2553 | + "\n", |
| 2554 | + "# mtg_size = np.array(mtg_size)\n", |
| 2555 | + "\n", |
| 2556 | + "# with open('kingsford_mtg_size.pkl', 'wb') as f:\n", |
| 2557 | + "# pickle.dump(mtg_size, f)" |
| 2558 | + ] |
| 2559 | + }, |
| 2560 | + { |
| 2561 | + "cell_type": "code", |
| 2562 | + "execution_count": 46, |
| 2563 | + "id": "09188d87", |
| 2564 | + "metadata": { |
| 2565 | + "ExecuteTime": { |
| 2566 | + "end_time": "2022-01-17T20:08:08.444781Z", |
| 2567 | + "start_time": "2022-01-17T20:08:08.421148Z" |
| 2568 | + }, |
| 2569 | + "scrolled": true |
| 2570 | + }, |
| 2571 | + "outputs": [], |
| 2572 | + "source": [ |
| 2573 | + "with open('kingsford_mtg_2_size.pkl', 'rb') as f:\n", |
| 2574 | + " mtg2_size = pickle.load(f)\n", |
| 2575 | + "\n", |
| 2576 | + "# mtg2_size = []\n", |
| 2577 | + "\n", |
| 2578 | + "# DIR = !echo $HOME\n", |
| 2579 | + "# DIR = f'{DIR[0]}/metagenome/finished_projects/counting_dbg/kingsford_31_coordinates_fork_opt_new'\n", |
| 2580 | + "\n", |
| 2581 | + "# for x in tqdm(gzip, mininterval=1):\n", |
| 2582 | + "# ID = x.split('/')[-1][:-len('_no_header.fasta.gz')]\n", |
| 2583 | + "# graph_size = get_size(f'{DIR}/../kingsford_31_coordinates/{ID}.fasta.gz/graph_small.dbg')\n", |
| 2584 | + "# graph_size += get_size(f'{DIR}/{ID}.fasta.gz/graph.dbg.rd_succ')\n", |
1777 | 2585 | "# graph_size += get_size(f'{DIR}/{ID}.fasta.gz/graph.dbg.anchors')\n",
|
1778 | 2586 | "# column_fname = f'{DIR}/{ID}.fasta.gz/rd_columns/annotation.column.annodbg'\n",
|
1779 | 2587 | "# coords_fname = f'{DIR}/{ID}.fasta.gz/rd_columns/annotation.column.annodbg.coords'\n",
|
|
2364 | 3172 | "plt.savefig('size_vs_k.pdf', format='pdf', bbox_inches='tight')\n",
|
2365 | 3173 | "plt.show()"
|
2366 | 3174 | ]
|
| 3175 | + }, |
| 3176 | + { |
| 3177 | + "cell_type": "markdown", |
| 3178 | + "id": "e098b928", |
| 3179 | + "metadata": {}, |
| 3180 | + "source": [ |
| 3181 | + "## Construction time" |
| 3182 | + ] |
| 3183 | + }, |
| 3184 | + { |
| 3185 | + "cell_type": "code", |
| 3186 | + "execution_count": 37, |
| 3187 | + "id": "041a2243", |
| 3188 | + "metadata": { |
| 3189 | + "ExecuteTime": { |
| 3190 | + "end_time": "2022-03-07T18:06:34.192082Z", |
| 3191 | + "start_time": "2022-03-07T18:06:33.727756Z" |
| 3192 | + }, |
| 3193 | + "scrolled": false |
| 3194 | + }, |
| 3195 | + "outputs": [ |
| 3196 | + { |
| 3197 | + "name": "stdout", |
| 3198 | + "output_type": "stream", |
| 3199 | + "text": [ |
| 3200 | + "Binary OLD:\n", |
| 3201 | + "KMC counting: 115.96 h, in 2652 runs, each using 1 cores\tNormalized time: 3.221 h \tmem: 41.455688 GB\n", |
| 3202 | + "Build contigs: 24.58 h, in 5304 runs, each using 1 cores\tNormalized time: 0.683 h \tmem: 50.69142 GB\n", |
| 3203 | + "Build joint graph: 1.11 h, in 4 runs, each using 36 cores\tNormalized time: 1.115 h \tmem: 51.482024 GB\n", |
| 3204 | + "Annotate graph: 1.11 h, in 10 runs, each using 18 cores\t\tNormalized time: 0.555 h \tmem: 51.589428 GB\n", |
| 3205 | + "Transform anno: 1.34 h, in 4 runs, each using 36 cores\t\tNormalized time: 1.337 h \tmem: 58.220948 GB\n" |
| 3206 | + ] |
| 3207 | + } |
| 3208 | + ], |
| 3209 | + "source": [ |
| 3210 | + "print('Binary OLD:')\n", |
| 3211 | + "\n", |
| 3212 | + "cores = 1\n", |
| 3213 | + "time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/kmc_count.lsf | cut -d' ' -f8\n", |
| 3214 | + "mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/kmc_count.lsf | tr -s ' ' | cut -f6 -d' '\n", |
| 3215 | + "print(f'KMC counting: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n", |
| 3216 | + "\n", |
| 3217 | + "cores = 1\n", |
| 3218 | + "time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/build_single.lsf | cut -d' ' -f8\n", |
| 3219 | + "mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/build_single.lsf | tr -s ' ' | cut -f6 -d' '\n", |
| 3220 | + "print(f'Build contigs: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n", |
| 3221 | + "\n", |
| 3222 | + "cores = 36\n", |
| 3223 | + "time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/build_graph.lsf | cut -d' ' -f8\n", |
| 3224 | + "mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/build_graph.lsf | tr -s ' ' | cut -f6 -d' '\n", |
| 3225 | + "print(f'Build joint graph: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n", |
| 3226 | + "\n", |
| 3227 | + "cores = 18\n", |
| 3228 | + "time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/annotate_*.lsf | cut -d' ' -f8\n", |
| 3229 | + "mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/annotate_*.lsf | tr -s ' ' | cut -f6 -d' '\n", |
| 3230 | + "print(f'Annotate graph: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n", |
| 3231 | + "\n", |
| 3232 | + "cores = 36\n", |
| 3233 | + "time = !grep Elapsed ~/metagenome/finished_projects/counting_dbg/kingsford_21/logs/old_rd_* | cut -d' ' -f8\n", |
| 3234 | + "mem = !grep 'Maximum resident' ~/metagenome/finished_projects/counting_dbg/kingsford_21/logs/old_rd_* | tr -s ' ' | cut -f6 -d' '\n", |
| 3235 | + "print(f'Transform anno: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')" |
| 3236 | + ] |
| 3237 | + }, |
| 3238 | + { |
| 3239 | + "cell_type": "code", |
| 3240 | + "execution_count": 38, |
| 3241 | + "id": "31ea264e", |
| 3242 | + "metadata": { |
| 3243 | + "ExecuteTime": { |
| 3244 | + "end_time": "2022-03-07T18:06:50.077412Z", |
| 3245 | + "start_time": "2022-03-07T18:06:49.740832Z" |
| 3246 | + } |
| 3247 | + }, |
| 3248 | + "outputs": [ |
| 3249 | + { |
| 3250 | + "name": "stdout", |
| 3251 | + "output_type": "stream", |
| 3252 | + "text": [ |
| 3253 | + "Binary:\n", |
| 3254 | + "KMC counting: 115.96 h, in 2652 runs, each using 1 cores\tNormalized time: 3.221 h \tmem: 41.455688 GB\n", |
| 3255 | + "Build contigs: 24.58 h, in 5304 runs, each using 1 cores\tNormalized time: 0.683 h \tmem: 50.69142 GB\n", |
| 3256 | + "Build joint graph: 1.11 h, in 4 runs, each using 36 cores\tNormalized time: 1.115 h \tmem: 51.482024 GB\n", |
| 3257 | + "Annotate graph: 1.11 h, in 10 runs, each using 18 cores\t\tNormalized time: 0.555 h \tmem: 51.589428 GB\n", |
| 3258 | + "Transform anno: 1.19 h, in 5 runs, each using 36 cores\t\tNormalized time: 1.191 h \tmem: 59.31488 GB\n" |
| 3259 | + ] |
| 3260 | + } |
| 3261 | + ], |
| 3262 | + "source": [ |
| 3263 | + "print('Binary:')\n", |
| 3264 | + "\n", |
| 3265 | + "cores = 1\n", |
| 3266 | + "time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/kmc_count.lsf | cut -d' ' -f8\n", |
| 3267 | + "mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/kmc_count.lsf | tr -s ' ' | cut -f6 -d' '\n", |
| 3268 | + "print(f'KMC counting: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n", |
| 3269 | + "\n", |
| 3270 | + "cores = 1\n", |
| 3271 | + "time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/build_single.lsf | cut -d' ' -f8\n", |
| 3272 | + "mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/build_single.lsf | tr -s ' ' | cut -f6 -d' '\n", |
| 3273 | + "print(f'Build contigs: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n", |
| 3274 | + "\n", |
| 3275 | + "cores = 36\n", |
| 3276 | + "time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/build_graph.lsf | cut -d' ' -f8\n", |
| 3277 | + "mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/build_graph.lsf | tr -s ' ' | cut -f6 -d' '\n", |
| 3278 | + "print(f'Build joint graph: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n", |
| 3279 | + "\n", |
| 3280 | + "cores = 18\n", |
| 3281 | + "time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/annotate_*.lsf | cut -d' ' -f8\n", |
| 3282 | + "mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/annotate_*.lsf | tr -s ' ' | cut -f6 -d' '\n", |
| 3283 | + "print(f'Annotate graph: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n", |
| 3284 | + "\n", |
| 3285 | + "cores = 36\n", |
| 3286 | + "time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/rd_brwt.lsf | cut -d' ' -f8\n", |
| 3287 | + "mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/rd_brwt.lsf | tr -s ' ' | cut -f6 -d' '\n", |
| 3288 | + "print(f'Transform anno: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')" |
| 3289 | + ] |
| 3290 | + }, |
| 3291 | + { |
| 3292 | + "cell_type": "code", |
| 3293 | + "execution_count": 39, |
| 3294 | + "id": "609cb85a", |
| 3295 | + "metadata": { |
| 3296 | + "ExecuteTime": { |
| 3297 | + "end_time": "2022-03-07T18:07:12.041814Z", |
| 3298 | + "start_time": "2022-03-07T18:07:10.691302Z" |
| 3299 | + } |
| 3300 | + }, |
| 3301 | + "outputs": [ |
| 3302 | + { |
| 3303 | + "name": "stdout", |
| 3304 | + "output_type": "stream", |
| 3305 | + "text": [ |
| 3306 | + "Smooth counts:\n", |
| 3307 | + "KMC counting: 115.96 h, in 2652 runs, each using 1 cores\tNormalized time: 3.221 h \tmem: 41.455688 GB\n", |
| 3308 | + "Build contigs: 31.10 h, in 5304 runs, each using 1 cores\tNormalized time: 0.864 h \tmem: 70.273636 GB\n", |
| 3309 | + "Build joint graph: 1.11 h, in 4 runs, each using 36 cores\tNormalized time: 1.115 h \tmem: 51.482024 GB\n", |
| 3310 | + "Annotate graph: 2.16 h, in 10 runs, each using 18 cores\t\tNormalized time: 1.080 h \tmem: 51.294724 GB\n", |
| 3311 | + "Transform anno: 1.26 h, in 5 runs, each using 36 cores\t\tNormalized time: 1.256 h \tmem: 87.648508 GB\n" |
| 3312 | + ] |
| 3313 | + } |
| 3314 | + ], |
| 3315 | + "source": [ |
| 3316 | + "print('Smooth counts:')\n", |
| 3317 | + "\n", |
| 3318 | + "cores = 1\n", |
| 3319 | + "time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/kmc_count.lsf | cut -d' ' -f8\n", |
| 3320 | + "mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/kmc_count.lsf | tr -s ' ' | cut -f6 -d' '\n", |
| 3321 | + "print(f'KMC counting: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n", |
| 3322 | + "\n", |
| 3323 | + "cores = 1\n", |
| 3324 | + "time = !grep Elapsed ~/metagenome/data/kingsford_21/smoothing_1000000000/logs/build_single.lsf | cut -d' ' -f8\n", |
| 3325 | + "mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/smoothing_1000000000/logs/build_single.lsf | tr -s ' ' | cut -f6 -d' '\n", |
| 3326 | + "print(f'Build contigs: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n", |
| 3327 | + "\n", |
| 3328 | + "cores = 36\n", |
| 3329 | + "time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/build_graph.lsf | cut -d' ' -f8\n", |
| 3330 | + "mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/build_graph.lsf | tr -s ' ' | cut -f6 -d' '\n", |
| 3331 | + "print(f'Build joint graph: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n", |
| 3332 | + "\n", |
| 3333 | + "cores = 18\n", |
| 3334 | + "time = !grep Elapsed ~/metagenome/data/kingsford_21/smoothing_1000000000/logs/annotate_*.lsf | cut -d' ' -f8\n", |
| 3335 | + "mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/smoothing_1000000000/logs/annotate_*.lsf | tr -s ' ' | cut -f6 -d' '\n", |
| 3336 | + "print(f'Annotate graph: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n", |
| 3337 | + "\n", |
| 3338 | + "cores = 36\n", |
| 3339 | + "time = !grep Elapsed ~/metagenome/data/kingsford_21/smoothing_1000000000/logs/count_rd_brwt.lsf | cut -d' ' -f8\n", |
| 3340 | + "mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/smoothing_1000000000/logs/count_rd_brwt.lsf | tr -s ' ' | cut -f6 -d' '\n", |
| 3341 | + "print(f'Transform anno: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')" |
| 3342 | + ] |
| 3343 | + }, |
| 3344 | + { |
| 3345 | + "cell_type": "code", |
| 3346 | + "execution_count": 40, |
| 3347 | + "id": "9d9db1da", |
| 3348 | + "metadata": { |
| 3349 | + "ExecuteTime": { |
| 3350 | + "end_time": "2022-03-07T18:07:40.094671Z", |
| 3351 | + "start_time": "2022-03-07T18:07:38.708276Z" |
| 3352 | + } |
| 3353 | + }, |
| 3354 | + "outputs": [ |
| 3355 | + { |
| 3356 | + "name": "stdout", |
| 3357 | + "output_type": "stream", |
| 3358 | + "text": [ |
| 3359 | + "Raw counts:\n", |
| 3360 | + "KMC counting: 115.96 h, in 2652 runs, each using 1 cores\tNormalized time: 3.221 h \tmem: 41.455688 GB\n", |
| 3361 | + "Build contigs: 29.86 h, in 5304 runs, each using 1 cores\tNormalized time: 0.829 h \tmem: 70.3226 GB\n", |
| 3362 | + "Build joint graph: 1.11 h, in 4 runs, each using 36 cores\tNormalized time: 1.115 h \tmem: 51.482024 GB\n", |
| 3363 | + "Annotate graph: 2.36 h, in 10 runs, each using 18 cores\t\tNormalized time: 1.179 h \tmem: 51.318896 GB\n", |
| 3364 | + "Transform anno: 1.40 h, in 5 runs, each using 36 cores\t\tNormalized time: 1.403 h \tmem: 87.64346 GB\n" |
| 3365 | + ] |
| 3366 | + } |
| 3367 | + ], |
| 3368 | + "source": [ |
| 3369 | + "print('Raw counts:')\n", |
| 3370 | + "\n", |
| 3371 | + "cores = 1\n", |
| 3372 | + "time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/kmc_count.lsf | cut -d' ' -f8\n", |
| 3373 | + "mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/kmc_count.lsf | tr -s ' ' | cut -f6 -d' '\n", |
| 3374 | + "print(f'KMC counting: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n", |
| 3375 | + "\n", |
| 3376 | + "cores = 1\n", |
| 3377 | + "time = !grep Elapsed ~/metagenome/data/kingsford_21/smoothing_1/logs/build_single.lsf | cut -d' ' -f8\n", |
| 3378 | + "mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/smoothing_1/logs/build_single.lsf | tr -s ' ' | cut -f6 -d' '\n", |
| 3379 | + "print(f'Build contigs: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n", |
| 3380 | + "\n", |
| 3381 | + "cores = 36\n", |
| 3382 | + "time = !grep Elapsed ~/metagenome/data/kingsford_21/logs/build_graph.lsf | cut -d' ' -f8\n", |
| 3383 | + "mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/logs/build_graph.lsf | tr -s ' ' | cut -f6 -d' '\n", |
| 3384 | + "print(f'Build joint graph: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n", |
| 3385 | + "\n", |
| 3386 | + "cores = 18\n", |
| 3387 | + "time = !grep Elapsed ~/metagenome/data/kingsford_21/smoothing_1/logs/annotate_*.lsf | cut -d' ' -f8\n", |
| 3388 | + "mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/smoothing_1/logs/annotate_*.lsf | tr -s ' ' | cut -f6 -d' '\n", |
| 3389 | + "print(f'Annotate graph: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')\n", |
| 3390 | + "\n", |
| 3391 | + "cores = 36\n", |
| 3392 | + "time = !grep Elapsed ~/metagenome/data/kingsford_21/smoothing_1/logs/count_rd_brwt.lsf | cut -d' ' -f8\n", |
| 3393 | + "mem = !grep 'Maximum resident' ~/metagenome/data/kingsford_21/smoothing_1/logs/count_rd_brwt.lsf | tr -s ' ' | cut -f6 -d' '\n", |
| 3394 | + "print(f'Transform anno: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t\\tNormalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')" |
| 3395 | + ] |
| 3396 | + }, |
| 3397 | + { |
| 3398 | + "cell_type": "code", |
| 3399 | + "execution_count": 107, |
| 3400 | + "id": "c0c15010", |
| 3401 | + "metadata": { |
| 3402 | + "ExecuteTime": { |
| 3403 | + "end_time": "2022-02-01T11:42:34.923564Z", |
| 3404 | + "start_time": "2022-02-01T11:41:36.035222Z" |
| 3405 | + } |
| 3406 | + }, |
| 3407 | + "outputs": [ |
| 3408 | + { |
| 3409 | + "name": "stdout", |
| 3410 | + "output_type": "stream", |
| 3411 | + "text": [ |
| 3412 | + "ntCard estimation (for Mantis): 72.77 h, in 2652 runs, each using 1 cores\tNormalized time: 2.021 h \tmem: 19.002412 GB\n" |
| 3413 | + ] |
| 3414 | + } |
| 3415 | + ], |
| 3416 | + "source": [ |
| 3417 | + "cores = 1\n", |
| 3418 | + "time = !grep Elapsed ~/metagenome/finished_projects/counting_dbg/kingsford_21/mantis/nobackup/logs_reruns/*k21.ntcard.4.log | cut -d' ' -f8\n", |
| 3419 | + "mem = !grep 'Maximum resident' ~/metagenome/finished_projects/counting_dbg/kingsford_21/mantis/nobackup/logs_reruns/*k21.ntcard.4.log | tr -s ' ' | cut -f6 -d' '\n", |
| 3420 | + "print(f'ntCard estimation (for Mantis): {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t'\n", |
| 3421 | + " f'Normalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')" |
| 3422 | + ] |
| 3423 | + }, |
| 3424 | + { |
| 3425 | + "cell_type": "code", |
| 3426 | + "execution_count": 109, |
| 3427 | + "id": "4963e7de", |
| 3428 | + "metadata": { |
| 3429 | + "ExecuteTime": { |
| 3430 | + "end_time": "2022-02-01T12:11:24.741680Z", |
| 3431 | + "start_time": "2022-02-01T12:11:14.463792Z" |
| 3432 | + } |
| 3433 | + }, |
| 3434 | + "outputs": [ |
| 3435 | + { |
| 3436 | + "name": "stdout", |
| 3437 | + "output_type": "stream", |
| 3438 | + "text": [ |
| 3439 | + "CQF counting (for Mantis): 1274.37 h, in 2652 runs, each using 1 cores\tNormalized time: 35.399 h \tmem: 333.179748 GB\n" |
| 3440 | + ] |
| 3441 | + } |
| 3442 | + ], |
| 3443 | + "source": [ |
| 3444 | + "cores = 1\n", |
| 3445 | + "time = !grep Elapsed ~/metagenome/finished_projects/counting_dbg/kingsford_21/mantis/nobackup/logs_reruns/*k21.4.stderr.log | cut -d' ' -f8\n", |
| 3446 | + "mem = !grep 'Maximum resident' ~/metagenome/finished_projects/counting_dbg/kingsford_21/mantis/nobackup/logs_reruns/*k21.4.stderr.log | tr -s ' ' | cut -f6 -d' '\n", |
| 3447 | + "print(f'CQF counting (for Mantis): {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t'\n", |
| 3448 | + " f'Normalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')" |
| 3449 | + ] |
| 3450 | + }, |
| 3451 | + { |
| 3452 | + "cell_type": "code", |
| 3453 | + "execution_count": 110, |
| 3454 | + "id": "bb2dc09c", |
| 3455 | + "metadata": { |
| 3456 | + "ExecuteTime": { |
| 3457 | + "end_time": "2022-02-01T12:11:42.846687Z", |
| 3458 | + "start_time": "2022-02-01T12:11:38.921845Z" |
| 3459 | + } |
| 3460 | + }, |
| 3461 | + "outputs": [ |
| 3462 | + { |
| 3463 | + "name": "stdout", |
| 3464 | + "output_type": "stream", |
| 3465 | + "text": [ |
| 3466 | + "CQF counting (for Mantis) with 1 thread: 695.35 h, in 2652 runs, each using 1 cores\tNormalized time: 19.315 h \tmem: 482.870516 GB\n" |
| 3467 | + ] |
| 3468 | + } |
| 3469 | + ], |
| 3470 | + "source": [ |
| 3471 | + "cores = 1\n", |
| 3472 | + "time = !grep Elapsed ~/metagenome/finished_projects/counting_dbg/kingsford_21/mantis/nobackup/logs_reruns/*k21.2.stderr.log | cut -d' ' -f8\n", |
| 3473 | + "mem = !grep 'Maximum resident' ~/metagenome/finished_projects/counting_dbg/kingsford_21/mantis/nobackup/logs_reruns/*k21.2.stderr.log | tr -s ' ' | cut -f6 -d' '\n", |
| 3474 | + "print(f'CQF counting (for Mantis) with 1 thread: {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t'\n", |
| 3475 | + " f'Normalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')" |
| 3476 | + ] |
| 3477 | + }, |
| 3478 | + { |
| 3479 | + "cell_type": "code", |
| 3480 | + "execution_count": 41, |
| 3481 | + "id": "67dcc514", |
| 3482 | + "metadata": { |
| 3483 | + "ExecuteTime": { |
| 3484 | + "end_time": "2022-03-08T09:13:40.212561Z", |
| 3485 | + "start_time": "2022-03-08T09:13:24.120477Z" |
| 3486 | + } |
| 3487 | + }, |
| 3488 | + "outputs": [ |
| 3489 | + { |
| 3490 | + "name": "stdout", |
| 3491 | + "output_type": "stream", |
| 3492 | + "text": [ |
| 3493 | + "BCALM (for REINDEER): 1462.83 h, in 2652 runs, each using 1 cores\tNormalized time: 40.634 h \tmem: 152.297732 GB\n" |
| 3494 | + ] |
| 3495 | + } |
| 3496 | + ], |
| 3497 | + "source": [ |
| 3498 | + "cores = 1\n", |
| 3499 | + "time = !grep Elapsed ~/metagenome/finished_projects/counting_dbg/kingsford_21/reindeer/nobackup/logs_reruns/*.try4.stderr.log | cut -d' ' -f8\n", |
| 3500 | + "mem = !grep 'Maximum resident' /cluster/work/grlab/projects/metagenome/finished_projects/counting_dbg/kingsford_21/reindeer/nobackup/logs_reruns/*.try4.stderr.log | tr -s ' ' | cut -f6 -d' '\n", |
| 3501 | + "print(f'BCALM (for REINDEER): {sum([to_seconds(s) for s in time]) / 3600:0.2f} h, in {len(time)} runs, each using {cores} cores\\t'\n", |
| 3502 | + " f'Normalized time: {sum([to_seconds(s) for s in time]) / (36/cores) / 3600:0.3f} h \\tmem: {np.sort([float(m) for m in mem])[::-1][:36//cores].sum() / 1e6} GB')" |
| 3503 | + ] |
| 3504 | + }, |
| 3505 | + { |
| 3506 | + "cell_type": "code", |
| 3507 | + "execution_count": 53, |
| 3508 | + "id": "e08d010c", |
| 3509 | + "metadata": { |
| 3510 | + "ExecuteTime": { |
| 3511 | + "end_time": "2022-03-08T09:31:28.410415Z", |
| 3512 | + "start_time": "2022-03-08T09:31:27.312138Z" |
| 3513 | + } |
| 3514 | + }, |
| 3515 | + "outputs": [ |
| 3516 | + { |
| 3517 | + "data": { |
| 3518 | + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXIAAAEUCAYAAAA2ib1OAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAPQUlEQVR4nO3dT1IbWdbG4fd01BwFLqY0ofIKhJh+k5ZHFZ6J9gosdoBrB114B+AVUDBqR41Er8DADqyoHnc3JVZwvkHexOm0/qRkpVIH/Z6ICqRUpvJIBa+uTuZNm7sLABDXX5ouAADwfQhyAAiOIAeA4AhyAAiOIAeA4AhyrIyZ/WlmQzO7MrM7MzubsE4nrXOXfp6WHv88Zbs/zaw1YflnM7uasPzMzFZ+Slap/q9eY5XXX1i3ZWZ36XbbzD4vUMNC6+P5I8ixasfufuzuh1IWqPkDZtaRdJXWOXT3V5KuC4+3JY0k9RfcZ3vCsp6kcdUnSME6KH+wlNbJ6z9J9R9KGpZWm/r6p3H3kbv/VLXW8vrpfcMWI8hRp0t9HbIflIXgOF/g7qPC4yeSzqWn0Ky8HzN7Cv+07c2CtZ5JOp6zTl7/U83uPms/5ddfl2++kWC7EOSoRQrTD5L+UVjcnhN8fXe/VjZKf7PA7srrv1EWopW5+4lmBGJq68yrv7j+pNc/9bkLbZZW3pYp/DzN2zkT1j+TlLd7pn6bwPP2Q9MF4Nm5MjMpG4leu/u99PT1/2HaRoW2ipSF8L8kvauyQ3cfldoLPXd/l+pYlZn1F0x8/QvqSPpbeg2urF3zKoV1R1/eJ6V1eqlNhS3FiByrduzur1IP9+lAZGpH7M7Y7kSp35zCr7VMe2XJtkoVIxXaJGlfwwkHVCe+/kX3VWg/jfTl9cx7D7GlCHLUxt0vlB10zI3MrDdl9b6ko3TGx5Wy0FqmvbJwW6WKFKz3ef3ufp1GweMZ25Rff1VfPWfxmAIwCa0V1CaFXrEd8VZZ6+EwD6c0gh5LGrv7cWHbjpZrr7TcvdI2S/im/lkmvP7amFmLwN9eBDlW7UPqEbfS/aferbvfm9mxsjDMWwSXkl4ona1SWvfBzDqFPvO/zCwPxlE6QFl0XtjvV9J511MD2MzOlUbPZnZU/FCZU/9vpdWmvv4a3Sh7b24nvCfYAsZlbLENzOyckMNzRZBjK5hZu3TOOvBsEOQAEBxnrQBAcAQ5AARHkANAcGs//fDHH3/0g4ODde8WAEK7u7v7r7vvTXps7UF+cHCg29vbde8WAEIzs39Pe4zWCgAER5ADQHAEOQAEt7YgN7PXZnbx+Pi4rl0CwFZYW5C7+0d3H+zs7KxrlwCwFWitAEBwBDkABEeQA0Bw/MMSFR388nsj+/3j158b2S+AOBiRA0BwBDkABEeQA0BwBDkABEeQA0BwBDkABEeQA0BwBDkABEeQA0BwBDkABLeSIDeztpl1zOzUzFqreE4AQDWrGpE/SBql27srek4AQAVzg9zMemY2LC3rp+WnkuTu45rqAwDMMTfI3f2meN/M+oXl4xTogxTmN5L6dRQKAJhsmdbKkb60UUaSOpJuzawjqSfpYkW1AQAqWOZ65K3S/Rfufp9u32sCMxtIGkjS/v7+ErsEAEyzzIh8rAUPaLr7hbt33b27t7e3xC4BANMsE+Sf9GVU3pY0nL4qAKBuVc5a6UvqFg5yXktqm1kv3b+ZtT0AoF5ze+QpuK9Ly94vuiMzey3p9cuXLxfdFAAww9qm6Lv7R3cf7OzsrGuXALAVuNYKAAS3tiA3s9dmdvH4+LiuXQLAVqC1AgDB0VoBgOCWmdnZmINffm+6BADYOPTIASA4euQAEBw9cgAIjiAHgOAIcgAIjoOdABAcBzsBIDhaKwAQHEEOAMER5AAQHEEOAMFx1goABMdZKwAQHK0VAAiOIAeA4AhyAAiOIAeA4AhyAAiO0w8BIDhOPwSA4GitAEBwBDkABEe
QA0BwBDkABEeQA0BwBDkABEeQA0BwTAgCgOCYEAQAwdFaAYDgCHIACI4gB4DgCHIACI4gB4DgCHIACI4gB4DgCHIACI4gB4DgCHIACI4gB4DguGgWAATHRbMAIDhaKwAQ3A9NF4DZDn75vbF9//Hrz43tG0B1jMgBIDiCHACCI8gBIDiCHACCI8gBIDiCHACCI8gBIDiCHACCI8gBIDiCHACCI8gBIDiCHACCW8lFs8ysLaklqSfp2t1Hq3heAMB8qxqRdySNJN1I6q/oOQEAFcwNcjPrmdmwtKyflp9Kkrtfu/tYaUReS6UAgInmBrm73xTvm1m/sHxsZr20PA/xhxrqBABMsUyP/EjSZbo9ktQxM0l6l+4PxagcANZmmSBvle6/SKPzmwnrSpLMbCBpIEn7+/tL7BIAMM0yBzvHknYX2cDdL9y96+7dvb29JXYJAJhmmSD/pC+j8rayVgoAoCFVzlrpS+oWDnJeS2rnBznLB0NnPM9rM7t4fHz8nnoBACVze+QpuK9Ly94vuiN3/yjpY7fbfbvotgCA6ZiiDwDBEeQAENzagpweOQDUY21B7u4f3X2ws7Ozrl0CwFagtQIAwRHkABAcPXIACI4eOQAER2sFAIIjyAEgOIIcAILjYCcABMfBTgAIjtYKAARHkANAcAQ5AARHkANAcJy1AgDBcdYKAARHawUAgiPIASA4ghwAgiPIASA4ghwAguP0QwAIjtMPASA4WisAEBxBDgDBEeQAEBxBDgDBEeQAEBxBDgDBEeQAEBwTggAgOCYEAUBwtFYAIDiCHACCI8gBIDiCHACCI8gBIDiCHACCI8gBIDiCHACCI8gBIDiCHACC41orABAc11oBgOBorQBAcAQ5AARHkANAcAQ5AARHkANAcAQ5AARHkANAcAQ5AARHkANAcAQ5AARHkANAcAQ5AARHkANAcAQ5AARHkANAcCsLcjMbmFlvVc8HAKhmlSPyW0mtFT4fAKCCuUFuZj0zG5aW9dPy0/pKAwBUMTfI3f2meN/M+oXlY9opANCsZVorR5JG6fZIUifd7kk6MrPWCuoCAFT0wxLbtEr3X0iSu7+ftoGZDSQNJGl/f3+JXQIApllmRD6WtLvIBu5+4e5dd+/u7e0tsUsAwDTLBPknfRmVtyUNp68KAKjb3NZKOrjZNbO+u1+7+7WZneYHOcsHQ2c8z2tJr1++fPl9FWNtDn75vZH9/vHrz43sF4hqbpC7+7Wk69Kyqf3wGc/zUdLHbrf7dtFtAQDTMUUfAIIjyAEguLUFuZm9NrOLx8fHde0SALbC2oLc3T+6+2BnZ2dduwSArUBrBQCCI8gBILhlpugvhfPIgc3T1FwBifkCq0SPHACCo7UCAMER5AAQHEEOAMExIQgAguNgJwAER2sFAIIjyAEgOIIcAIIjyAEgOKboY+MwbXw78E8Jrg5nrQBAcLRWACA4ghwAgiPIASA4ghwAguOsFaCAMykQEWetAEBwtFYAIDiCHACCI8gBIDiCHACCI8gBIDiCHACCI8gBIDgmBAEboMlL926b53iZZCYEAUBwtFYAIDiCHACCI8gBIDiCHACCI8gBIDiCHACCI8gBIDiCHACCM3df7w7N/iPp3wts8qOk/9ZUTh2ot17UWy/qrdf31PtXd9+b9MDag3xRZnbr7t2m66iKeutFvfWi3nrVVS+tFQAIjiAHgOAiBPlF0wUsiHrrRb31ot561VLvxvfIAQCzRRiRAwBmIMgBILiNDnIz65tZz8xOm66lilTrsOk6qjCzVnp/+2Z21nQ986T3theh1qIo9ZrZn2Z2F6jeTv7723Qt86RaP6f3t5b3eGODPP8f5O43ksZm1mu4pLlSrVH8XdKuu19LkpkNGq5nKjPrSHqV3t+OmbWbrqmK9DsbolZJx+5+6O7vmi6kopP0u9sO8Puw6+4/ufuhpLeSzle9g40NcklHkkbp9khSp8Fanh13v3D3/Ah6W9LGfgi5+727vzOzlqS
Ru4/mbdO0FC4bX2dBK0AgSnoadNyZWdvd32/670NpgNeuo95NDvJW6f6LJop47tIf78Om/zEkXUnjpouoqJY/2BrtSnows5WPFmvwU/rvwczO0wf8xjOzQf4NeNU2OcjHyn65UK++u580XUQVaWTT2vS+qJn1grXZ8m9oY2VtzI1+f5PPqd47SRvbFix5VdcTb3KQf9KXUXlbUoiDiJGYWd/d36fbG3sMwszOCj38sTb/A/4hHZjtK+vhbnRb0MwGhfD+X6PFVPOpcLulAN/S6v7WsLFBXjiQ0Uv3N36Ek/4YuhFGNOl9PcuPpDddzxznkkap5laht7+RUk//RtkHTqvhcqr4TYUTCur6+r8qqb5Wod6N/n1IdiU91PXkzOwEgOA2dkQOAKiGIAeA4AhyAAiOIMdapangQzO7Sj8nnrecpjR/M5U5Lb+asPzMzEIc8ElTtoeTpmyX3p8wU+bRLIIcTTh292N3f6Vsht5XYV6YFTnt7J9JMxB7inEaWkfSlbIp5odp2nb51Nr8/TlM2xDmmIkgR9NulM3YLDpRuh7FlHOwL4uneKZ1Nv701OSDshB/mvU559TaS8W5XgsaQpCjMWmSxJm+vYhQP50rfC3pzYRNy8vfKAu8ifsonidvZp/Tz3ZqYQzzK1YW7t8VJ3Cktk3+2Hk+OcnMTtO6wyoTPtI67apzItIH1AdJ/6iyPrYXQY4mXKXw/FPSZXFCR+liU5eaMP06jWaLo9Seu98vWMOJpDN3f5VaPCrcPlf2AZPX007L7yVduftF4YqMh/lzVdhnW9UmheTvz5WkmyVeG7YMQY4m5P3xSTMIT5R6xinAWrPaK9/RVhlKOk+j7bb0NEofKLsmRvEyAK3083+F228k7aYDr2f6tj00yVcfQKn+4YSDtMfpQ+UnSRMP7gJFBDma9E7fjmT7ko7SWRtXysJvVntlaltlltTeyC9idFc4CHmrQqsnjf530wj5qDR9/V06KPl0YHLOPseS7otT4dMH2njGNhfKDuQCUxHkaEwKyZtCz7ktaVwIx2NJx5rdXulUaD20CrefRt/uPkr/kMKtpP9T1ua519ej5ray9sarVE/uUtm3h+J6VbxV9k2gNW/F9Lw91XiNDjwPPzRdALbembI2x4UKZ6vk3P3ezB7MbFJgn2vORancfWxmozSivk//SVLPzPJgHkn6p6ShmeUj5Ie0/Sgd1Oyn5Tfu/i7V9XSgNL2G9+lg6mEafU+q5z7t98rM8vbNb6XVPpiZCq+ttsuf4nngolnADOkc7s/pAGdL2Vkkl9OuEGhm51Gu747ng9YKMFtLX0bnY2Wj91nXQ2fyDtaOETkwQ2EU3kqLRoy4sWkIcgAIjtYKAARHkANAcAQ5AARHkANAcAQ5AARHkANAcP8PefBxlxmOJ7EAAAAASUVORK5CYII=\n", |
| 3519 | + "text/plain": [ |
| 3520 | + "<Figure size 432x288 with 1 Axes>" |
| 3521 | + ] |
| 3522 | + }, |
| 3523 | + "metadata": { |
| 3524 | + "needs_background": "light" |
| 3525 | + }, |
| 3526 | + "output_type": "display_data" |
| 3527 | + } |
| 3528 | + ], |
| 3529 | + "source": [ |
| 3530 | + "plt.hist(np.array(mem).astype(int) / 1e6)\n", |
| 3531 | + "plt.yscale('log')\n", |
| 3532 | + "plt.title('BCALM, 1 GB limit')\n", |
| 3533 | + "plt.xlabel('RAM usage, GB')\n", |
| 3534 | + "plt.show()" |
| 3535 | + ] |
| 3536 | + }, |
| 3537 | + { |
| 3538 | + "cell_type": "code", |
| 3539 | + "execution_count": 61, |
| 3540 | + "id": "2be4c3ca", |
| 3541 | + "metadata": { |
| 3542 | + "ExecuteTime": { |
| 3543 | + "end_time": "2022-03-08T11:09:15.458807Z", |
| 3544 | + "start_time": "2022-03-08T11:09:14.562628Z" |
| 3545 | + } |
| 3546 | + }, |
| 3547 | + "outputs": [ |
| 3548 | + { |
| 3549 | + "data": { |
| 3550 | + "text/plain": [ |
| 3551 | + "'/cluster/work/grlab/projects/metagenome/finished_projects/counting_dbg/kingsford_21/reindeer/nobackup/logs_reruns/SRR563547_build_k21.try4.stderr.log'" |
| 3552 | + ] |
| 3553 | + }, |
| 3554 | + "execution_count": 61, |
| 3555 | + "metadata": {}, |
| 3556 | + "output_type": "execute_result" |
| 3557 | + } |
| 3558 | + ], |
| 3559 | + "source": [ |
| 3560 | + "ids = !ls /cluster/work/grlab/projects/metagenome/finished_projects/counting_dbg/kingsford_21/reindeer/nobackup/logs_reruns/*.try4.stderr.log\n", |
| 3561 | + "ids[np.argmax(np.array(mem).astype(int))]" |
| 3562 | + ] |
2367 | 3563 | }
|
2368 | 3564 | ],
|
2369 | 3565 | "metadata": {
|
|
2386 | 3582 | },
|
2387 | 3583 | "toc": {
|
2388 | 3584 | "base_numbering": 1,
|
2389 |
| - "nav_menu": {}, |
| 3585 | + "nav_menu": { |
| 3586 | + "height": "191px", |
| 3587 | + "width": "221px" |
| 3588 | + }, |
2390 | 3589 | "number_sections": true,
|
2391 | 3590 | "sideBar": true,
|
2392 | 3591 | "skip_h1_title": true,
|
|
2397 | 3596 | "height": "calc(100% - 180px)",
|
2398 | 3597 | "left": "10px",
|
2399 | 3598 | "top": "150px",
|
2400 |
| - "width": "219px" |
| 3599 | + "width": "255px" |
2401 | 3600 | },
|
2402 | 3601 | "toc_section_display": true,
|
2403 | 3602 | "toc_window_display": true
|
|
0 commit comments