Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into add-export-task
Browse files Browse the repository at this point in the history
# Conflicts:
#	yolo/model/yolo.py
  • Loading branch information
ramonhollands committed Feb 20, 2025
2 parents a5cbf06 + 1d28355 commit bab9d4d
Show file tree
Hide file tree
Showing 7 changed files with 177 additions and 16 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ runs
*/data

# Datasets and model checkpoints
*.ckpt
*.pth
*.pt
*.trt
Expand Down
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
>
> Use of this code is at your own risk and discretion. It is advisable to consult with the project owner before deploying or integrating into any critical systems. -->

Welcome to the official implementation of YOLOv7, YOLOv9, and YOLO-RD. This repository contains the complete codebase, pre-trained models, and detailed instructions for training and deploying YOLOv9.
Welcome to the official implementation of YOLOv7[^1], YOLOv9[^2], and YOLO-RD[^3]. This repository contains the complete codebase, pre-trained models, and detailed instructions for training and deploying YOLOv9.

## TL;DR

Expand Down Expand Up @@ -133,3 +133,9 @@ Contributions to the YOLO project are welcome! See [CONTRIBUTING](docs/CONTRIBUT
}
```

[^1]: [**YOLOv7**: Trainable Bag-of-Freebies Sets New State-of-the-Art for Real-Time Object Detectors](https://arxiv.org/abs/2207.02696)

[^2]: [**YOLOv9**: Learning What You Want to Learn Using Programmable Gradient Information](https://arxiv.org/abs/2402.13616)

[^3]: [**YOLO-RD**: Introducing Relevant and Compact Explicit Knowledge to YOLO by Retriever-Dictionary](https://arxiv.org/abs/2410.15346)
1 change: 1 addition & 0 deletions yolo/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ class YOLOLayer(nn.Module):
tags: str
layer_type: str
usable: bool
external: Optional[dict]


IDX_TO_ID = [
Expand Down
133 changes: 133 additions & 0 deletions yolo/config/model/v9-t.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
name: v9-t

anchor:
reg_max: 16

model:
backbone:
- Conv:
args: {out_channels: 16, kernel_size: 3, stride: 2}
source: 0
- Conv:
args: {out_channels: 32, kernel_size: 3, stride: 2}
- ELAN:
args: {out_channels: 32, part_channels: 32}

- AConv:
args: {out_channels: 64}
- RepNCSPELAN:
args:
out_channels: 64
part_channels: 64
csp_args: {repeat_num: 3}
tags: B3

- AConv:
args: {out_channels: 96}
- RepNCSPELAN:
args:
out_channels: 96
part_channels: 96
csp_args: {repeat_num: 3}
tags: B4

- AConv:
args: {out_channels: 128}
- RepNCSPELAN:
args:
out_channels: 128
part_channels: 128
csp_args: {repeat_num: 3}
tags: B5

neck:
- SPPELAN:
args: {out_channels: 128}
tags: N3

- UpSample:
args: {scale_factor: 2, mode: nearest}
- Concat:
source: [-1, B4]
- RepNCSPELAN:
args:
out_channels: 96
part_channels: 96
csp_args: {repeat_num: 3}
tags: N4

head:
- UpSample:
args: {scale_factor: 2, mode: nearest}
- Concat:
source: [-1, B3]

- RepNCSPELAN:
args:
out_channels: 64
part_channels: 64
csp_args: {repeat_num: 3}
tags: P3
- AConv:
args: {out_channels: 48}
- Concat:
source: [-1, N4]

- RepNCSPELAN:
args:
out_channels: 96
part_channels: 96
csp_args: {repeat_num: 3}
tags: P4
- AConv:
args: {out_channels: 64}
- Concat:
source: [-1, N3]

- RepNCSPELAN:
args:
out_channels: 128
part_channels: 128
csp_args: {repeat_num: 3}
tags: P5

detection:
- MultiheadDetection:
source: [P3, P4, P5]
tags: Main
output: True

auxiliary:
- SPPELAN:
source: B5
args: {out_channels: 128}
tags: A5

- UpSample:
args: {scale_factor: 2, mode: nearest}
- Concat:
source: [-1, B4]

- RepNCSPELAN:
args:
out_channels: 96
part_channels: 96
csp_args: {repeat_num: 3}
tags: A4

- UpSample:
args: {scale_factor: 2, mode: nearest}
- Concat:
source: [-1, B3]

- RepNCSPELAN:
args:
out_channels: 64
part_channels: 64
csp_args: {repeat_num: 3}
tags: A3

- MultiheadDetection:
source: [A3, A4, A5]
tags: AUX
output: True
29 changes: 17 additions & 12 deletions yolo/model/yolo.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from collections import OrderedDict
from pathlib import Path
from typing import Dict, List, Union
from typing import Dict, List, Optional, Union

import torch
from omegaconf import ListConfig, OmegaConf
Expand Down Expand Up @@ -68,30 +68,34 @@ def build_model(self, model_arch: Dict[str, List[Dict[str, Dict[str, Dict]]]]):
setattr(layer, "out_c", out_channels)
layer_idx += 1

def forward(self, x):
y = {0: x}
def forward(self, x, external: Optional[Dict] = None, shortcut: Optional[str] = None):
y = {0: x, **(external or {})}
output = dict()

# Use a simple loop instead of enumerate()
# Needed for torch export compatibility
index = 1
for layer in self.model:
index = 1
for layer in self.model:
if isinstance(layer.source, list):
model_input = [y[idx] for idx in layer.source]
else:
model_input = y[layer.source]

x = layer(model_input)

external_input = {source_name: y[source_name] for source_name in layer.external}

x = layer(model_input, **external_input)
y[-1] = x

if layer.usable:
y[index] = x

if layer.output:
output[layer.tags] = x

if layer.tags == shortcut:
return output

index += 1

return output

def get_out_channels(self, layer_type: str, layer_args: dict, output_dim: list, source: Union[int, list]):
Expand Down Expand Up @@ -123,6 +127,7 @@ def create_layer(self, layer_type: str, source: Union[int, list], layer_info: Di
setattr(layer, "in_c", kwargs.get("in_channels", None))
setattr(layer, "output", layer_info.get("output", False))
setattr(layer, "tags", layer_info.get("tags", None))
setattr(layer, "external", layer_info.get("external", []))
setattr(layer, "usable", 0)
return layer
else:
Expand Down
4 changes: 2 additions & 2 deletions yolo/tools/solver.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,10 @@ def validation_step(self, batch, batch_idx):
batch_size, images, targets, rev_tensor, img_paths = batch
H, W = images.shape[2:]
predicts = self.post_process(self.ema(images), image_size=[W, H])
self.metric.update(
mAP = self.metric(
[to_metrics_format(predict) for predict in predicts], [to_metrics_format(target) for target in targets]
)
return predicts
return predicts, mAP

def on_validation_epoch_end(self):
epoch_metrics = self.metric.compute()
Expand Down
17 changes: 16 additions & 1 deletion yolo/utils/logging_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,20 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch: Any, batch_idx:
self.progress.update(self.train_progress_bar_id, description=batch_descript)
self.refresh()

@override
@rank_zero_only
def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx) -> None:
if self.is_disabled:
return
if trainer.sanity_checking:
self._update(self.val_sanity_progress_bar_id, batch_idx + 1)
elif self.val_progress_bar_id is not None:
self._update(self.val_progress_bar_id, batch_idx + 1)
_, mAP = outputs
mAP_desc = f" mAP :{mAP['map']*100:6.2f} | mAP50 :{mAP['map_50']*100:6.2f} |"
self.progress.update(self.val_progress_bar_id, description=f"[green]Valid [white]|{mAP_desc}")
self.refresh()

@override
@rank_zero_only
def on_train_end(self, trainer: "Trainer", pl_module: "LightningModule") -> None:
Expand Down Expand Up @@ -212,8 +226,9 @@ def on_validation_batch_end(self, trainer: Trainer, pl_module, outputs, batch, b
if batch_idx != 0:
return
batch_size, images, targets, rev_tensor, img_paths = batch
predicts, _ = outputs
gt_boxes = targets[0] if targets.ndim == 3 else targets
pred_boxes = outputs[0] if isinstance(outputs, list) else outputs
pred_boxes = predicts[0] if isinstance(predicts, list) else predicts
images = [images[0]]
step = trainer.current_epoch
for logger in trainer.loggers:
Expand Down

0 comments on commit bab9d4d

Please sign in to comment.