|
90 | 90 | "metadata": {},
|
91 | 91 | "outputs": [],
|
92 | 92 | "source": [
|
| 93 | + "import java.nio.charset.*;\n", |
93 | 94 | "import java.util.zip.*;\n",
|
94 | 95 | "import java.util.stream.*;"
|
95 | 96 | ]
|
|
132 | 133 | "metadata": {},
|
133 | 134 | "outputs": [],
|
134 | 135 | "source": [
|
135 |
| - "public static StringBuilder readDataNMT() throws IOException {\n", |
136 |
| - " File file = new File(\"./fra-eng.zip\");\n", |
137 |
| - " if (!file.exists()) {\n", |
138 |
| - " InputStream inputStream =\n", |
139 |
| - " new URL(\"http://d2l-data.s3-accelerate.amazonaws.com/fra-eng.zip\").openStream();\n", |
140 |
| - " Files.copy(\n", |
141 |
| - " inputStream, Paths.get(\"./fra-eng.zip\"), StandardCopyOption.REPLACE_EXISTING);\n", |
142 |
| - " }\n", |
143 |
| - "\n", |
144 |
| - " ZipFile zipFile = new ZipFile(file);\n", |
| 136 | + "public static String readDataNMT() throws IOException {\n", |
| 137 | + " DownloadUtils.download(\n", |
| 138 | + " \"http://d2l-data.s3-accelerate.amazonaws.com/fra-eng.zip\", \"fra-eng.zip\");\n", |
| 139 | + " ZipFile zipFile = new ZipFile(new File(\"fra-eng.zip\"));\n", |
145 | 140 | " Enumeration<? extends ZipEntry> entries = zipFile.entries();\n",
|
146 |
| - " InputStream stream = null;\n", |
147 | 141 | " while (entries.hasMoreElements()) {\n",
|
148 | 142 | " ZipEntry entry = entries.nextElement();\n",
|
149 | 143 | " if (entry.getName().contains(\"fra.txt\")) {\n",
|
150 |
| - " stream = zipFile.getInputStream(entry);\n", |
151 |
| - " break;\n", |
| 144 | + " InputStream stream = zipFile.getInputStream(entry);\n", |
| 145 | + " return new String(stream.readAllBytes(), StandardCharsets.UTF_8);\n", |
152 | 146 | " }\n",
|
153 | 147 | " }\n",
|
154 |
| - "\n", |
155 |
| - " String[] lines;\n", |
156 |
| - " try (BufferedReader in = new BufferedReader(new InputStreamReader(stream))) {\n", |
157 |
| - " lines = in.lines().toArray(String[]::new);\n", |
158 |
| - " }\n", |
159 |
| - " StringBuilder output = new StringBuilder();\n", |
160 |
| - " for (int i = 0; i < lines.length; i++) {\n", |
161 |
| - " output.append(lines[i] + \"\\n\");\n", |
162 |
| - " }\n", |
163 |
| - " return output;\n", |
| 148 | + " return null;\n", |
164 | 149 | "}\n",
|
165 | 150 | "\n",
|
166 |
| - "StringBuilder rawText = readDataNMT();\n", |
| 151 | + "String rawText = readDataNMT();\n", |
167 | 152 | "System.out.println(rawText.substring(0, 75));"
|
168 | 153 | ]
|
169 | 154 | },
|
|
188 | 173 | "metadata": {},
|
189 | 174 | "outputs": [],
|
190 | 175 | "source": [
|
191 |
| - "public static StringBuilder preprocessNMT(String text) {\n", |
| 176 | + "public static String preprocessNMT(String text) {\n", |
192 | 177 | " // Replace non-breaking space with space, and convert uppercase letters to\n",
|
193 | 178 | " // lowercase ones\n",
|
194 | 179 | "\n",
|
|
204 | 189 | " }\n",
|
205 | 190 | " out.append(currChar);\n",
|
206 | 191 | " }\n",
|
207 |
| - " return out;\n", |
| 192 | + " return out.toString();\n", |
208 | 193 | "}\n",
|
209 | 194 | "\n",
|
210 | 195 | "public static boolean noSpace(Character currChar, Character prevChar) {\n",
|
|
213 | 198 | " && prevChar != ' ';\n",
|
214 | 199 | "}\n",
|
215 | 200 | "\n",
|
216 |
| - "StringBuilder text = preprocessNMT(rawText.toString());\n", |
| 201 | + "String text = preprocessNMT(rawText);\n", |
217 | 202 | "System.out.println(text.substring(0, 80));"
|
218 | 203 | ]
|
219 | 204 | },
|
|
281 | 266 | "metadata": {},
|
282 | 267 | "outputs": [],
|
283 | 268 | "source": [
|
284 |
| - "for (String[] subArr : target.subList(0, 6)) System.out.println(Arrays.toString(subArr));" |
| 269 | + "for (String[] subArr : target.subList(0, 6)) {\n", |
| 270 | + " System.out.println(Arrays.toString(subArr));\n", |
| 271 | + "}" |
285 | 272 | ]
|
286 | 273 | },
|
287 | 274 | {
|
|
407 | 394 | "public static int[] truncatePad(Integer[] integerLine, int numSteps, int paddingToken) {\n",
|
408 | 395 | " /* Truncate or pad sequences */\n",
|
409 | 396 | " int[] line = Arrays.stream(integerLine).mapToInt(i -> i).toArray();\n",
|
410 |
| - " if (line.length > numSteps) return Arrays.copyOfRange(line, 0, numSteps);\n", |
| 397 | + " if (line.length > numSteps) {\n", |
| 398 | + " return Arrays.copyOfRange(line, 0, numSteps);\n", |
| 399 | + " }\n", |
411 | 400 | " int[] paddingTokenArr = new int[numSteps - line.length]; // Pad\n",
|
412 |
| - " for (int i = 0; i < paddingTokenArr.length; i++) paddingTokenArr[i] = paddingToken;\n", |
| 401 | + " Arrays.fill(paddingTokenArr, paddingToken);\n", |
413 | 402 | "\n",
|
414 | 403 | " return IntStream.concat(Arrays.stream(line), Arrays.stream(paddingTokenArr)).toArray();\n",
|
415 | 404 | "}\n",
|
|
451 | 440 | "outputs": [],
|
452 | 441 | "source": [
|
453 | 442 | "public static Pair<NDArray, NDArray> buildArrayNMT(\n",
|
454 |
| - " ArrayList<String[]> lines, Vocab vocab, int numSteps) {\n", |
| 443 | + " List<String[]> lines, Vocab vocab, int numSteps) {\n", |
455 | 444 | " /* Transform text sequences of machine translation into minibatches. */\n",
|
456 | 445 | " List<Integer[]> linesIntArr = new ArrayList<>();\n",
|
457 |
| - " for (int i = 0; i < lines.size(); i++) {\n", |
458 |
| - " linesIntArr.add(vocab.getIdxs(lines.get(i)));\n", |
| 446 | + " for (String[] strings : lines) {\n", |
| 447 | + " linesIntArr.add(vocab.getIdxs(strings));\n", |
459 | 448 | " }\n",
|
460 | 449 | " for (int i = 0; i < linesIntArr.size(); i++) {\n",
|
461 |
| - " ArrayList<Integer> temp = new ArrayList<>();\n", |
462 |
| - " temp.addAll(Arrays.asList(linesIntArr.get(i)));\n", |
| 450 | + " List<Integer> temp = new ArrayList<>(Arrays.asList(linesIntArr.get(i)));\n", |
463 | 451 | " temp.add(vocab.getIdx(\"<eos>\"));\n",
|
464 |
| - " linesIntArr.set(i, temp.stream().toArray(n -> new Integer[n]));\n", |
| 452 | + " linesIntArr.set(i, temp.toArray(new Integer[0]));\n", |
465 | 453 | " }\n",
|
466 | 454 | "\n",
|
| 455 | + " NDManager manager = NDManager.newBaseManager();\n", |
| 456 | + "\n", |
467 | 457 | " NDArray arr = manager.create(new Shape(linesIntArr.size(), numSteps), DataType.INT32);\n",
|
468 | 458 | " int row = 0;\n",
|
469 | 459 | " for (Integer[] line : linesIntArr) {\n",
|
|
498 | 488 | "public static Pair<ArrayDataset, Pair<Vocab, Vocab>> loadDataNMT(\n",
|
499 | 489 | " int batchSize, int numSteps, int numExamples) throws IOException {\n",
|
500 | 490 | " /* Return the iterator and the vocabularies of the translation dataset. */\n",
|
501 |
| - " StringBuilder text = preprocessNMT(readDataNMT().toString());\n", |
502 |
| - " Pair<ArrayList<String[]>, ArrayList<String[]>> pair =\n", |
503 |
| - " tokenizeNMT(text.toString(), numExamples);\n", |
| 491 | + " String text = preprocessNMT(readDataNMT());\n", |
| 492 | + " Pair<ArrayList<String[]>, ArrayList<String[]>> pair = tokenizeNMT(text, numExamples);\n", |
504 | 493 | " ArrayList<String[]> source = pair.getKey();\n",
|
505 | 494 | " ArrayList<String[]> target = pair.getValue();\n",
|
506 | 495 | " Vocab srcVocab =\n",
|
507 | 496 | " new Vocab(\n",
|
508 |
| - " source.stream().toArray(String[][]::new),\n", |
| 497 | + " source.toArray(String[][]::new),\n", |
509 | 498 | " 2,\n",
|
510 | 499 | " new String[] {\"<pad>\", \"<bos>\", \"<eos>\"});\n",
|
511 | 500 | " Vocab tgtVocab =\n",
|
512 | 501 | " new Vocab(\n",
|
513 |
| - " target.stream().toArray(String[][]::new),\n", |
| 502 | + " target.toArray(String[][]::new),\n", |
514 | 503 | " 2,\n",
|
515 | 504 | " new String[] {\"<pad>\", \"<bos>\", \"<eos>\"});\n",
|
516 | 505 | "\n",
|
|
582 | 571 | "1. Try different values of the `numExamples` argument in the `loadDataNMT` function. How does this affect the vocabulary sizes of the source language and the target language?\n",
|
583 | 572 | "1. Text in some languages such as Chinese and Japanese does not have word boundary indicators (e.g., space). Is word-level tokenization still a good idea for such cases? Why or why not?\n"
|
584 | 573 | ]
|
585 |
| - }, |
586 |
| - { |
587 |
| - "cell_type": "code", |
588 |
| - "execution_count": null, |
589 |
| - "metadata": {}, |
590 |
| - "outputs": [], |
591 |
| - "source": [] |
592 | 574 | }
|
593 | 575 | ],
|
594 | 576 | "metadata": {
|
|
0 commit comments