18 | 18 | },
19 | 19 | {
20 | 20 | "cell_type": "code",
21 | | - "execution_count": 2,
| 21 | + "execution_count": null,
22 | 22 | "metadata": {},
23 | 23 | "outputs": [],
24 | 24 | "source": [
25 | 25 | "from pathlib import Path\n",
26 | 26 | "\n",
27 | | - "text = Path('../data/tiny-shakespeare.txt').read_text()"
| 27 | + "text = Path(\"../data/tiny-shakespeare.txt\").read_text()"
28 | 28 | ]
29 | 29 | },
30 | 30 | {
88 | 88 | },
89 | 89 | {
90 | 90 | "cell_type": "code",
91 | | - "execution_count": 4,
| 91 | + "execution_count": null,
92 | 92 | "metadata": {
93 | 93 | "id": "Ap_Ixr0M-0Yv"
94 | 94 | },
95 | 95 | "outputs": [],
96 | 96 | "source": [
97 | | - "\n",
98 | 97 | "class CharTokenizer:\n",
99 | | - "  def __init__(self, vocabulary):\n",
100 | | - "    self.token_id_for_char = {char: token_id for token_id, char in enumerate(vocabulary)}\n",
101 | | - "    self.char_for_token_id = {token_id: char for token_id, char in enumerate(vocabulary)}\n",
| 98 | + "    def __init__(self, vocabulary):\n",
| 99 | + "        self.token_id_for_char = {\n",
| 100 | + "            char: token_id for token_id, char in enumerate(vocabulary)\n",
| 101 | + "        }\n",
| 102 | + "        self.char_for_token_id = {\n",
| 103 | + "            token_id: char for token_id, char in enumerate(vocabulary)\n",
| 104 | + "        }\n",
102 | 105 | "\n",
103 | | - "  @staticmethod\n",
104 | | - "  def train_from_text(text):\n",
105 | | - "    vocabulary = set(text)\n",
106 | | - "    return CharTokenizer(sorted(list(vocabulary)))\n",
| 106 | + "    @staticmethod\n",
| 107 | + "    def train_from_text(text):\n",
| 108 | + "        vocabulary = set(text)\n",
| 109 | + "        return CharTokenizer(sorted(list(vocabulary)))\n",
107 | 110 | "\n",
108 | | - "  def encode(self, text):\n",
109 | | - "    token_ids = []\n",
110 | | - "    for char in text:\n",
111 | | - "      token_ids.append(self.token_id_for_char[char])\n",
112 | | - "    return torch.tensor(token_ids, dtype=torch.long)\n",
| 111 | + "    def encode(self, text):\n",
| 112 | + "        token_ids = []\n",
| 113 | + "        for char in text:\n",
| 114 | + "            token_ids.append(self.token_id_for_char[char])\n",
| 115 | + "        return torch.tensor(token_ids, dtype=torch.long)\n",
113 | 116 | "\n",
114 | | - "  def decode(self, token_ids):\n",
115 | | - "    chars = []\n",
116 | | - "    for token_id in token_ids.tolist():\n",
117 | | - "      chars.append(self.char_for_token_id[token_id])\n",
118 | | - "    return ''.join(chars)\n",
| 117 | + "    def decode(self, token_ids):\n",
| 118 | + "        chars = []\n",
| 119 | + "        for token_id in token_ids.tolist():\n",
| 120 | + "            chars.append(self.char_for_token_id[token_id])\n",
| 121 | + "        return \"\".join(chars)\n",
119 | 122 | "\n",
120 | | - "\n",
121 | | - "  def vocabulary_size(self):\n",
122 | | - "    return len(self.token_id_for_char)"
| 123 | + "    def vocabulary_size(self):\n",
| 124 | + "        return len(self.token_id_for_char)"
123 | 125 | ]
124 | 126 | },
125 | 127 | {
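A quick round-trip check of the CharTokenizer cell above; a minimal sketch that assumes `text` from the first cell and that `torch` is imported earlier in the notebook:

# Train the tokenizer on the corpus and verify decode inverts encode.
tokenizer = CharTokenizer.train_from_text(text)
print("vocabulary size:", tokenizer.vocabulary_size())

token_ids = tokenizer.encode("hello world")           # 1-D LongTensor of token ids
assert tokenizer.decode(token_ids) == "hello world"   # decode restores the string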
175 | 177 | },
176 | 178 | {
177 | 179 | "cell_type": "code",
178 | | - "execution_count": 8,
| 180 | + "execution_count": null,
179 | 181 | "metadata": {
180 | 182 | "id": "7Qal76ig-94U"
181 | 183 | },
182 | 184 | "outputs": [],
183 | 185 | "source": [
184 | 186 | "from torch.utils.data import Dataset\n",
185 | 187 | "\n",
| 188 | + "\n",
186 | 189 | "class TokenIdsDataset(Dataset):\n",
187 | | - "  def __init__(self, data, block_size):\n",
188 | | - "    self.data = data\n",
189 | | - "    self.block_size = block_size\n",
| 190 | + "    def __init__(self, data, block_size):\n",
| 191 | + "        self.data = data\n",
| 192 | + "        self.block_size = block_size\n",
190 | 193 | "\n",
191 | | - "  def __len__(self):\n",
192 | | - "    return len(self.data) - self.block_size\n",
| 194 | + "    def __len__(self):\n",
| 195 | + "        return len(self.data) - self.block_size\n",
193 | 196 | "\n",
194 | | - "  def __getitem__(self, pos):\n",
195 | | - "    assert pos < len(self.data) - self.block_size\n",
| 197 | + "    def __getitem__(self, pos):\n",
| 198 | + "        assert pos < len(self.data) - self.block_size\n",
196 | 199 | "\n",
197 | | - "    x = self.data[pos:pos + self.block_size]\n",
198 | | - "    y = self.data[pos + 1:pos + 1 + self.block_size]\n",
199 | | - "    return x, y"
| 200 | + "        x = self.data[pos : pos + self.block_size]\n",
| 201 | + "        y = self.data[pos + 1 : pos + 1 + self.block_size]\n",
| 202 | + "        return x, y"
200 | 203 | ]
201 | 204 | },
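Each item in the dataset above pairs a block of token ids with the same block shifted one position, and it is typically consumed through a DataLoader. A minimal sketch, assuming `tokenizer` and `text` from the earlier cells:

from torch.utils.data import DataLoader

token_ids = tokenizer.encode(text)                    # whole corpus as one tensor
dataset = TokenIdsDataset(token_ids, block_size=64)   # any block up to the model's context size (256 here)
loader = DataLoader(dataset, batch_size=8, shuffle=True)

x, y = next(iter(loader))   # x: (8, 64); y is x shifted right by one token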
202 | 205 | {
203 | 206 | "cell_type": "code",
204 | | - "execution_count": 10,
| 207 | + "execution_count": null,
205 | 208 | "metadata": {},
206 | 209 | "outputs": [],
207 | 210 | "source": [
208 | 211 | "config = {\n",
209 | | - "  \"vocabulary_size\": tokenizer.vocabulary_size(),\n",
210 | | - "  \"context_size\": 256,\n",
211 | | - "  \"embedding_dim\": 768,\n",
212 | | - "  \"heads_num\": 12,\n",
213 | | - "  \"layers_num\": 10,\n",
214 | | - "  \"dropout_rate\": 0.1,\n",
215 | | - "  \"use_bias\": False,\n",
| 212 | + "    \"vocabulary_size\": tokenizer.vocabulary_size(),\n",
| 213 | + "    \"context_size\": 256,\n",
| 214 | + "    \"embedding_dim\": 768,\n",
| 215 | + "    \"heads_num\": 12,\n",
| 216 | + "    \"layers_num\": 10,\n",
| 217 | + "    \"dropout_rate\": 0.1,\n",
| 218 | + "    \"use_bias\": False,\n",
216 | 219 | "}\n",
217 | 220 | "\n",
218 | 221 | "config[\"head_size\"] = config[\"embedding_dim\"] // config[\"heads_num\"]"
219 | 222 | ]
220 | 223 | },
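The derived value in the config cell is the per-head width: with embedding_dim 768 and 12 heads, head_size is 768 // 12 = 64. A small sanity check, as a sketch:

# The embedding width must split evenly across heads, since multi-head attention
# concatenates heads_num outputs of head_size back to embedding_dim.
assert config["embedding_dim"] % config["heads_num"] == 0
print(config["head_size"])   # 64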
221 | 224 | {
222 | 225 | "cell_type": "code",
223 | | - "execution_count": 11,
| 226 | + "execution_count": null,
224 | 227 | "metadata": {},
225 | 228 | "outputs": [],
226 | 229 | "source": [
227 | 230 | "class AttentionHead(nn.Module):\n",
228 | | - "  def __init__(self, config):\n",
229 | | - "    super().__init__()\n",
230 | | - "    self.Q_weights = nn.Linear(config[\"embedding_dim\"], config[\"head_size\"], config[\"use_bias\"])\n",
231 | | - "    self.K_weights = nn.Linear(config[\"embedding_dim\"], config[\"head_size\"], config[\"use_bias\"])\n",
232 | | - "    self.V_weights = nn.Linear(config[\"embedding_dim\"], config[\"head_size\"], config[\"use_bias\"])\n",
| 231 | + "    def __init__(self, config):\n",
| 232 | + "        super().__init__()\n",
| 233 | + "        self.Q_weights = nn.Linear(\n",
| 234 | + "            config[\"embedding_dim\"], config[\"head_size\"], config[\"use_bias\"]\n",
| 235 | + "        )\n",
| 236 | + "        self.K_weights = nn.Linear(\n",
| 237 | + "            config[\"embedding_dim\"], config[\"head_size\"], config[\"use_bias\"]\n",
| 238 | + "        )\n",
| 239 | + "        self.V_weights = nn.Linear(\n",
| 240 | + "            config[\"embedding_dim\"], config[\"head_size\"], config[\"use_bias\"]\n",
| 241 | + "        )\n",
233 | 242 | "\n",
234 | | - "    self.dropout = nn.Dropout(config[\"dropout_rate\"])\n",
| 243 | + "        self.dropout = nn.Dropout(config[\"dropout_rate\"])\n",
235 | 244 | "\n",
236 | | - "    casual_attention_mask = torch.tril(torch.ones(config[\"context_size\"], config[\"context_size\"]))\n",
237 | | - "    self.register_buffer('casual_attention_mask', casual_attention_mask)\n",
| 245 | + "        casual_attention_mask = torch.tril(\n",
| 246 | + "            torch.ones(config[\"context_size\"], config[\"context_size\"])\n",
| 247 | + "        )\n",
| 248 | + "        self.register_buffer(\"casual_attention_mask\", casual_attention_mask)\n",
238 | 249 | "\n",
| 250 | + "    def forward(self, input):  # (B, C, embedding_dim)\n",
| 251 | + "        batch_size, tokens_num, embedding_dim = input.shape\n",
| 252 | + "        Q = self.Q_weights(input)  # (B, C, head_size)\n",
| 253 | + "        K = self.K_weights(input)  # (B, C, head_size)\n",
| 254 | + "        V = self.V_weights(input)  # (B, C, head_size)\n",
239 | 255 | "\n",
240 | | - "  def forward(self, input): # (B, C, embedding_dim)\n",
241 | | - "    batch_size, tokens_num, embedding_dim = input.shape\n",
242 | | - "    Q = self.Q_weights(input) # (B, C, head_size)\n",
243 | | - "    K = self.K_weights(input) # (B, C, head_size)\n",
244 | | - "    V = self.V_weights(input) # (B, C, head_size)\n",
| 256 | + "        attention_scores = Q @ K.transpose(1, 2)  # (B, C, C)\n",
| 257 | + "        attention_scores = attention_scores.masked_fill(\n",
| 258 | + "            self.casual_attention_mask[:tokens_num, :tokens_num] == 0, -torch.inf\n",
| 259 | + "        )\n",
| 260 | + "        attention_scores = attention_scores / (K.shape[-1] ** 0.5)\n",
| 261 | + "        attention_scores = torch.softmax(attention_scores, dim=-1)\n",
| 262 | + "        attention_scores = self.dropout(attention_scores)\n",
245 | 263 | "\n",
246 | | - "    attention_scores = Q @ K.transpose(1, 2) # (B, C, C)\n",
247 | | - "    attention_scores = attention_scores.masked_fill(\n",
248 | | - "      self.casual_attention_mask[:tokens_num,:tokens_num] == 0,\n",
249 | | - "      -torch.inf\n",
250 | | - "    )\n",
251 | | - "    attention_scores = attention_scores / ( K.shape[-1] ** 0.5 )\n",
252 | | - "    attention_scores = torch.softmax(attention_scores, dim=-1)\n",
253 | | - "    attention_scores = self.dropout(attention_scores)\n",
254 | | - "\n",
255 | | - "    return attention_scores @ V # (B, C, head_size)"
| 264 | + "        return attention_scores @ V  # (B, C, head_size)"
256 | 265 | ]
257 | 266 | },
258 | 267 | {
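A shape check for the attention head above; a minimal sketch assuming `torch`, `nn`, and `config` are in scope from earlier cells:

head = AttentionHead(config)
x = torch.randn(4, 16, config["embedding_dim"])   # batch of 4 sequences, 16 tokens each

out = head(x)      # causal self-attention for a single head
print(out.shape)   # torch.Size([4, 16, 64]), i.e. (B, C, head_size)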
304 | 313 | },
305 | 314 | {
306 | 315 | "cell_type": "code",
307 | | - "execution_count": 16,
| 316 | + "execution_count": null,
308 | 317 | "metadata": {},
309 | 318 | "outputs": [],
310 | 319 | "source": [
311 | 320 | "class MultiHeadAttention(nn.Module):\n",
312 | | - "  def __init__(self, config):\n",
313 | | - "    super().__init__()\n",
| 321 | + "    def __init__(self, config):\n",
| 322 | + "        super().__init__()\n",
314 | 323 | "\n",
315 | | - "    heads_list = [AttentionHead(config) for _ in range(config[\"heads_num\"])]\n",
316 | | - "    self.heads = nn.ModuleList(heads_list)\n",
| 324 | + "        heads_list = [AttentionHead(config) for _ in range(config[\"heads_num\"])]\n",
| 325 | + "        self.heads = nn.ModuleList(heads_list)\n",
317 | 326 | "\n",
318 | | - "    self.linear = nn.Linear(config[\"embedding_dim\"], config[\"embedding_dim\"])\n",
319 | | - "    self.dropout = nn.Dropout(config[\"dropout_rate\"])\n",
| 327 | + "        self.linear = nn.Linear(config[\"embedding_dim\"], config[\"embedding_dim\"])\n",
| 328 | + "        self.dropout = nn.Dropout(config[\"dropout_rate\"])\n",
320 | 329 | "\n",
321 | | - "  def forward(self, input):\n",
322 | | - "    heads_outputs = [head(input) for head in self.heads]\n",
| 330 | + "    def forward(self, input):\n",
| 331 | + "        heads_outputs = [head(input) for head in self.heads]\n",
323 | 332 | "\n",
324 | | - "    scores_change = torch.cat(heads_outputs, dim=-1)\n",
| 333 | + "        scores_change = torch.cat(heads_outputs, dim=-1)\n",
325 | 334 | "\n",
326 | | - "    scores_change = self.linear(scores_change)\n",
327 | | - "    return self.dropout(scores_change)"
| 335 | + "        scores_change = self.linear(scores_change)\n",
| 336 | + "        return self.dropout(scores_change)"
328 | 337 | ]
329 | 338 | },
330 | 339 | {
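The multi-head wrapper keeps the embedding width, since it concatenates heads_num slices of head_size and projects back. A minimal sketch, again assuming `torch`, `nn`, and `config` from earlier cells:

mha = MultiHeadAttention(config)
x = torch.randn(4, 16, config["embedding_dim"])

out = mha(x)       # concat of 12 head outputs, then the output projection
print(out.shape)   # torch.Size([4, 16, 768]), same shape as the input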