diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 5ba051a6..fad7ac27 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -17,12 +17,11 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os:
-          [
+        os: [
             "ubuntu-latest",
             "ubuntu-22.04-arm",
             "macos-latest",
-            "macos-13",
+            # "macos-13",
             "windows-2022",
           ]
         python-version: ["3.10"]
@@ -50,41 +49,47 @@ jobs:
           conda-remove-defaults: "true"
         if: matrix.os == 'macos-13'

-      - name: Install OS-specific conda dependencies
+      - name: Install OS-specific compilers
         run: |
           if [[ "${{ matrix.os }}" == "ubuntu-22.04-arm" ]]; then
-            conda install --file conda_deps_linux_aarch64.txt --channel conda-forge --override-channels
+            conda install gxx --channel conda-forge --override-channels
           elif [[ "${{ runner.os }}" == "Linux" ]]; then
-            conda install --file conda_deps_linux.txt --channel conda-forge --override-channels
+            conda install gxx --channel conda-forge --override-channels
           elif [[ "${{ runner.os }}" == "macOS" ]]; then
-            conda install --file conda_deps_osx.txt --channel conda-forge --override-channels
+            conda install clangxx llvm-openmp pybind11 --channel conda-forge --override-channels
           elif [[ "${{ runner.os }}" == "Windows" ]]; then
-            conda install --file conda_deps_win.txt --channel conda-forge --override-channels
+            conda install vc vc14_runtime vs2015_runtime --channel conda-forge --override-channels
           fi

-      - name: Install testing packages
-        run: conda install -y -c conda-forge flake8 pytest psutil
-
       - name: List the conda environment
         run: conda list

+      - name: Install testing packages
+        run: conda install -y -c conda-forge flake8 pytest psutil python-build
+
+      - name: Lint with flake8
+        run: |
+          # stop the build if there are Python syntax errors or undefined names
+          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+
       - name: Build and install the package
         run: |
           if [[ "${{ runner.os }}" == "Windows" ]]; then
             export LIB="C:/Miniconda/envs/test/Library/lib"
-            pip -vv install .
-          else
-            pip -vv install .
           fi
+          python -m build
+          pip install dist/*.whl
         env:
           WITH_CUDA: "0"

-      - name: Lint with flake8
-        run: |
-          # stop the build if there are Python syntax errors or undefined names
-          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+      # - name: Install nnpops
+      #   if: matrix.os == 'ubuntu-latest' || matrix.os == 'macos-latest'
+      #   run: conda install nnpops --channel conda-forge --override-channels
+
+      - name: List the conda environment
+        run: conda list

       - name: Run tests
         run: pytest -v -s --durations=10
diff --git a/.github/workflows/docs_build.yaml b/.github/workflows/docs_build.yaml
index 234ef185..3d0fb760 100644
--- a/.github/workflows/docs_build.yaml
+++ b/.github/workflows/docs_build.yaml
@@ -8,29 +8,30 @@ on:
     branches:
       - "main"

-
 jobs:
   build-docs:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
+
+      - uses: conda-incubator/setup-miniconda@v3
+        with:
+          python-version: "3.10"
+          channels: conda-forge
+          conda-remove-defaults: "true"
+
+      - name: Install compiler
+        run: conda install gxx --channel conda-forge --override-channels

-      - name: Set up Env
-        uses: mamba-org/setup-micromamba@v1
-        with:
-          environment-file: environment.yml
-          init-shell: bash
-          generate-run-shell: true
+      - name: Install docs dependencies
+        run: |
+          pip install -vv .
+          pip install -r docs/requirements.txt
+        shell: bash -el {0}

-      - name: Install docs dependencies
-        run: |
-          pip install -vv .
-          pip install -r docs/requirements.txt
-        shell: bash -el {0}
-
-      - name: Build Sphinx Documentation
-        run: |
-          cd docs
-          make html
-        shell: bash -el {0}
+      - name: Build Sphinx Documentation
+        run: |
+          cd docs
+          make html
+        shell: bash -el {0}
diff --git a/conda_deps_linux.txt b/conda_deps_linux.txt
deleted file mode 100644
index daf6246a..00000000
--- a/conda_deps_linux.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-h5py
-nnpops
-pip
-libtorch >=2.5.1
-pytorch-cpu >=2.5.1
-pytorch_geometric
-lightning
-torchmetrics
-tqdm
-gxx
\ No newline at end of file
diff --git a/conda_deps_linux_aarch64.txt b/conda_deps_linux_aarch64.txt
deleted file mode 100644
index 457a495e..00000000
--- a/conda_deps_linux_aarch64.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-h5py
-pip
-libtorch >=2.5.1
-pytorch-cpu >=2.5.1
-pytorch_geometric
-lightning
-torchmetrics
-tqdm
-gxx
\ No newline at end of file
diff --git a/conda_deps_osx.txt b/conda_deps_osx.txt
deleted file mode 100644
index 3a293deb..00000000
--- a/conda_deps_osx.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-h5py
-nnpops
-pip
-libtorch>=2.5.1
-pytorch-cpu>=2.5.1
-pytorch_geometric
-lightning
-torchmetrics
-tqdm
-clangxx
-llvm-openmp
-pybind11
diff --git a/conda_deps_win.txt b/conda_deps_win.txt
deleted file mode 100644
index 1337da8b..00000000
--- a/conda_deps_win.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-h5py
-pip
-libtorch >=2.5.1
-pytorch-cpu >=2.5.1
-pytorch_geometric
-lightning
-torchmetrics
-tqdm
-vc
-vc14_runtime
-vs2015_runtime
-vs2019_win-64
-sleef
\ No newline at end of file
diff --git a/docs/source/models.rst b/docs/source/models.rst
index 5ec760e3..078e7fcb 100644
--- a/docs/source/models.rst
+++ b/docs/source/models.rst
@@ -342,7 +342,7 @@ To implement a new architecture, you need to follow these steps:
            **shared_args,
         )

-4. Add any new parameters required to initialize your module to scripts.train.get_args:
+4. Add any new parameters required to initialize your module to torchmdnet.scripts.train.get_args:

    .. code-block:: python
diff --git a/docs/source/torchmd-train.rst b/docs/source/torchmd-train.rst
index b12e7644..35ac8edb 100644
--- a/docs/source/torchmd-train.rst
+++ b/docs/source/torchmd-train.rst
@@ -89,7 +89,7 @@ Command line interface
 ~~~~~~~~~~~~~~~~~~~~~~

-.. autoprogram:: scripts.train:get_argparse()
+.. autoprogram:: torchmdnet.scripts.train:get_argparse()
    :prog: torchmd-train
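Note on the packaging changes that follow: the version string no longer comes from a `git describe` call in setup.py; it is derived by setuptools-scm at build time (declared in the new pyproject.toml below) and read back at runtime by the new torchmdnet/__init__.py further down. A minimal sketch of the runtime side, not part of the patch, assuming the package was built and installed from a tagged checkout:

    # Minimal sketch: recovering the setuptools-scm version from the installed
    # distribution metadata (mirrors the torchmdnet/__init__.py added below).
    from importlib.metadata import version, PackageNotFoundError

    try:
        print(version("torchmd-net"))  # version derived from the git tag at build time
    except PackageNotFoundError:
        print("torchmd-net is not installed")  # e.g. running from an uninstalled source tree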
""" - if not os.environ.get("TORCH_CUDA_ARCH_LIST"): - if use_cuda: - arch_flags = torch._C._cuda_getArchFlags() - sm_versions = [x[3:] for x in arch_flags.split() if x.startswith("sm_")] - formatted_versions = ";".join([f"{y[0]}.{y[1]}" for y in sm_versions]) - formatted_versions += "+PTX" - os.environ["TORCH_CUDA_ARCH_LIST"] = formatted_versions + if use_cuda and not os.environ.get("TORCH_CUDA_ARCH_LIST"): + arch_flags = torch._C._cuda_getArchFlags() + sm_versions = [x[3:] for x in arch_flags.split() if x.startswith("sm_")] + formatted_versions = ";".join([f"{y[0]}.{y[1]}" for y in sm_versions]) + formatted_versions += "+PTX" + os.environ["TORCH_CUDA_ARCH_LIST"] = formatted_versions set_torch_cuda_arch_list() @@ -61,30 +44,15 @@ def set_torch_cuda_arch_list(): name="torchmdnet.extensions.torchmdnet_extensions", sources=[os.path.join(extension_root, "torchmdnet_extensions.cpp")] + neighbor_sources, - include_dirs=include_paths(), define_macros=[("WITH_CUDA", 1)] if use_cuda else [], ) if __name__ == "__main__": setup( - name="torchmd-net", - version=version, - packages=find_packages(), ext_modules=[extensions], cmdclass={ "build_ext": BuildExtension.with_options( no_python_abi_suffix=True, use_ninja=False ) }, - include_package_data=True, - entry_points={ - "console_scripts": ["torchmd-train = torchmdnet.scripts.train:main"] - }, - package_data={ - "torchmdnet": ( - ["extensions/torchmdnet_extensions.so"] - if not is_windows - else ["extensions/torchmdnet_extensions.dll"] - ) - }, ) diff --git a/tests/test_calculator.py b/tests/test_calculator.py index 96d93d85..3345dd0a 100644 --- a/tests/test_calculator.py +++ b/tests/test_calculator.py @@ -3,7 +3,7 @@ # (See accompanying file README.md file or copy at http://opensource.org/licenses/MIT) import torch -from torch.testing import assert_allclose +from torch.testing import assert_close import pytest from os.path import dirname, join from torchmdnet.calculators import External @@ -15,6 +15,8 @@ @pytest.mark.parametrize("box", [None, torch.eye(3)]) @pytest.mark.parametrize("use_cuda_graphs", [True, False]) def test_compare_forward(box, use_cuda_graphs): + from copy import deepcopy + if use_cuda_graphs and not torch.cuda.is_available(): pytest.skip("CUDA not available") checkpoint = join(dirname(dirname(__file__)), "tests", "example.ckpt") @@ -48,14 +50,19 @@ def test_compare_forward(box, use_cuda_graphs): checkpoint, z.unsqueeze(0), use_cuda_graph=use_cuda_graphs, device=device ) calc.model = model + # Path the model + model = deepcopy(model) + model.representation_model.distance.check_errors = not use_cuda_graphs + model.representation_model.static_shapes = use_cuda_graphs + model.representation_model.distance.resize_to_fit = not use_cuda_graphs calc_graph.model = model if box is not None: box = (box * 2 * args["cutoff_upper"]).unsqueeze(0) for _ in range(10): e_calc, f_calc = calc.calculate(pos, box) e_pred, f_pred = calc_graph.calculate(pos, box) - assert_allclose(e_calc, e_pred) - assert_allclose(f_calc, f_pred) + assert_close(e_calc, e_pred) + assert_close(f_calc, f_pred) def test_compare_forward_multiple(): @@ -72,5 +79,5 @@ def test_compare_forward_multiple(): torch.cat([torch.zeros(len(z1)), torch.ones(len(z2))]).long(), ) - assert_allclose(e_calc, e_pred) - assert_allclose(f_calc, f_pred.view(-1, len(z1), 3)) + assert_close(e_calc, e_pred) + assert_close(f_calc, f_pred.view(-1, len(z1), 3), rtol=1e-4, atol=1e-5) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 22b00249..8cdf0d1a 100644 --- a/tests/test_datasets.py 
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 22b00249..8cdf0d1a 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -10,6 +10,7 @@
 import psutil
 from torchmdnet.datasets import Custom, HDF5, Ace
 from torchmdnet.utils import write_as_hdf5
+from torch_geometric.loader import DataLoader
 import h5py
 import glob

@@ -297,3 +298,36 @@ def test_ace(tmpdir):
         assert len(dataset_v2) == 8
         f2.flush()
         f2.close()
+
+
+@mark.parametrize("num_files", [1, 3])
+@mark.parametrize("tile_embed", [True, False])
+@mark.parametrize("batch_size", [1, 5])
+def test_hdf5_with_and_without_caching(num_files, tile_embed, batch_size, tmpdir):
+    """This test ensures that the HDF5 dataset's get method returns the same output
+    whether the dataset is loaded with or without caching."""
+
+    # set up necessary files
+    _ = write_sample_npy_files(True, True, tmpdir, num_files)
+    files = {}
+    files["pos"] = sorted(glob.glob(join(tmpdir, "coords*")))
+    files["z"] = sorted(glob.glob(join(tmpdir, "embed*")))
+    files["y"] = sorted(glob.glob(join(tmpdir, "energy*")))
+    files["neg_dy"] = sorted(glob.glob(join(tmpdir, "forces*")))
+
+    write_as_hdf5(files, join(tmpdir, "test.hdf5"), tile_embed)
+    # Assert the file is present on disk
+    assert os.path.isfile(join(tmpdir, "test.hdf5")), "HDF5 file was not created"
+
+    data = HDF5(join(tmpdir, "test.hdf5"), dataset_preload_limit=0)  # no caching
+    data_cached = HDF5(join(tmpdir, "test.hdf5"), dataset_preload_limit=256)  # caching
+    assert len(data) == len(data_cached), "Number of samples does not match"
+
+    dl = DataLoader(data, batch_size)
+    dl_cached = DataLoader(data_cached, batch_size)
+
+    for sample_cached, sample in zip(dl_cached, dl):
+        assert np.allclose(sample_cached.pos, sample.pos), "Sample has incorrect coords"
+        assert np.allclose(sample_cached.z, sample.z), "Sample has incorrect atom numbers"
+        assert np.allclose(sample_cached.y, sample.y), "Sample has incorrect energy"
+        assert np.allclose(sample_cached.neg_dy, sample.neg_dy), "Sample has incorrect forces"
\ No newline at end of file
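For context on the caching test added above: the HDF5 dataset takes a dataset_preload_limit argument, with 0 forcing every access to go through the file and a positive value allowing sufficiently small datasets to be preloaded into memory. A hedged usage sketch of the same comparison pattern, with a hypothetical file name:

    from torch_geometric.loader import DataLoader
    from torchmdnet.datasets import HDF5

    # "data.hdf5" is a placeholder path.
    uncached = HDF5("data.hdf5", dataset_preload_limit=0)
    cached = HDF5("data.hdf5", dataset_preload_limit=256)

    # Both instances must yield identical batches.
    for a, b in zip(DataLoader(cached, batch_size=5), DataLoader(uncached, batch_size=5)):
        assert (a.pos == b.pos).all() and (a.z == b.z).all()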
diff --git a/tests/test_equivariance.py b/tests/test_equivariance.py
index 1492a9f0..afad5604 100644
--- a/tests/test_equivariance.py
+++ b/tests/test_equivariance.py
@@ -24,7 +24,7 @@ def test_scalar_invariance():

     y = model(z, pos, batch)[0]
     y_rot = model(z, pos @ rotate, batch)[0]
-    torch.testing.assert_allclose(y, y_rot)
+    torch.testing.assert_close(y, y_rot)


 def test_vector_equivariance():
@@ -50,4 +50,4 @@

     y = model(z, pos, batch)[0]
     y_rot = model(z, pos @ rotate, batch)[0]
-    torch.testing.assert_allclose(y @ rotate, y_rot)
+    torch.testing.assert_close(y @ rotate, y_rot)
diff --git a/tests/test_prior_d2.py b/tests/test_prior_d2.py
index 7c55e734..b5a2aae7 100644
--- a/tests/test_prior_d2.py
+++ b/tests/test_prior_d2.py
@@ -86,4 +86,4 @@ def test_d2(test_case):

     y_init = pt.zeros_like(y_ref)
     y_res = prior.post_reduce(y_init, z, pos, batch)
-    pt.testing.assert_allclose(y_res, y_ref)
+    pt.testing.assert_close(y_res, y_ref)
diff --git a/tests/test_priors.py b/tests/test_priors.py
index 61c2166d..c77d4926 100644
--- a/tests/test_priors.py
+++ b/tests/test_priors.py
@@ -41,7 +41,7 @@ def test_atomref(model_name, enable_atomref):
         expected_offset = scatter(dataset.get_atomref().squeeze()[z], batch).unsqueeze(1)
     else:
         expected_offset = 0
-    torch.testing.assert_allclose(x_atomref, x_no_atomref + expected_offset)
+    torch.testing.assert_close(x_atomref, x_no_atomref + expected_offset)


 @mark.parametrize("trainable", [True, False])
 def test_atomref_trainable(trainable):
@@ -79,7 +79,7 @@ def compute_interaction(pos1, pos2, z1, z2):
     for i in range(len(pos)):
         for j in range(i):
             expected += compute_interaction(pos[i], pos[j], atomic_number[types[i]], atomic_number[types[j]])
-    torch.testing.assert_allclose(expected, energy)
+    torch.testing.assert_close(expected, energy, rtol=1e-4, atol=1e-4)


 @pytest.mark.parametrize("dtype", [torch.float32, torch.float64])
 def test_coulomb(dtype):
@@ -112,7 +112,7 @@ def compute_interaction(pos1, pos2, z1, z2):
     for i in range(len(pos)):
         for j in range(i):
             expected += compute_interaction(pos[i], pos[j], charge[i], charge[j])
-    torch.testing.assert_allclose(expected, energy)
+    torch.testing.assert_close(expected, energy, rtol=1e-4, atol=1e-4)


 @pytest.mark.parametrize("dtype", [torch.float32, torch.float64])
diff --git a/torchmdnet/__init__.py b/torchmdnet/__init__.py
index e69de29b..2abbf142 100644
--- a/torchmdnet/__init__.py
+++ b/torchmdnet/__init__.py
@@ -0,0 +1,7 @@
+from importlib.metadata import version, PackageNotFoundError
+
+try:
+    __version__ = version("torchmd-net")
+except PackageNotFoundError:
+    # package is not installed
+    pass
diff --git a/torchmdnet/datasets/hdf.py b/torchmdnet/datasets/hdf.py
index c647b7d2..179b637e 100644
--- a/torchmdnet/datasets/hdf.py
+++ b/torchmdnet/datasets/hdf.py
@@ -125,8 +125,13 @@ def get(self, idx):
         if self.index is None:
             self._setup_index()
         *fields_data, i = self.index[idx]
+        # Assuming the first element of fields_data is 'pos' based on the definition of self.fields
+        size = len(fields_data[0])
         for (name, _, dtype), d in zip(self.fields, fields_data):
-            tensor_input = [[d[i]]] if d.ndim == 1 else d[i]
+            if d.ndim == 1:
+                tensor_input = [d[i]] if len(d) == size else d[:]
+            else:
+                tensor_input = d[i]
             data[name] = torch.tensor(tensor_input, dtype=dtype)
         return data
diff --git a/torchmdnet/optimize.py b/torchmdnet/optimize.py
index 0c7f5651..ec8b831d 100644
--- a/torchmdnet/optimize.py
+++ b/torchmdnet/optimize.py
@@ -4,9 +4,6 @@
 from typing import Optional, Tuple
 import torch as pt

-from NNPOps.CFConv import CFConv
-from NNPOps.CFConvNeighbors import CFConvNeighbors
-
 from .models.model import TorchMD_Net
 from .models.torchmd_gn import TorchMD_GN

@@ -17,6 +14,8 @@ class TorchMD_GN_optimized(pt.nn.Module):
     """

     def __init__(self, model):
+        from NNPOps.CFConv import CFConv
+        from NNPOps.CFConvNeighbors import CFConvNeighbors

         if model.rbf_type != "gauss":
             raise ValueError('Only rbf_type="gauss" is supproted')
diff --git a/torchmdnet/utils.py b/torchmdnet/utils.py
index f9788855..1a1f7cd5 100644
--- a/torchmdnet/utils.py
+++ b/torchmdnet/utils.py
@@ -346,12 +346,13 @@ class MissingEnergyException(Exception):
     pass


-def write_as_hdf5(files, hdf5_dataset):
+def write_as_hdf5(files, hdf5_dataset, tile_embed=True):
     """Transform the input numpy files to hdf5 format compatible with the HDF5 Dataset class.
     The input files to this function are the same as the ones required by the Custom dataset.

     Args:
         files (dict): Dictionary of numpy input files. Must contain "pos", "z" and at least one of "y" or "neg_dy".
         hdf5_dataset (string): Path to the output HDF5 dataset.
+        tile_embed (bool): Whether to tile the embeddings to match the number of samples. Default: True

     Example:
         >>> files = {}
         >>> files["pos"] = sorted(glob.glob(join(tmpdir, "coords*")))
@@ -370,7 +371,10 @@ def write_as_hdf5(files, hdf5_dataset):
             group = f.create_group(str(i))
             num_samples = coord_data.shape[0]
             group.create_dataset("pos", data=coord_data)
-            group.create_dataset("types", data=np.tile(embed_data, (num_samples, 1)))
+            if tile_embed:
+                group.create_dataset("types", data=np.tile(embed_data, (num_samples, 1)))
+            else:
+                group.create_dataset("types", data=embed_data)
             if "y" in files:
                 energy_data = np.load(files["y"][i], mmap_mode="r")
                 group.create_dataset("energy", data=energy_data)
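Taken together, the utils.py and hdf.py changes let write_as_hdf5 store a single embedding row per group (tile_embed=False) instead of tiling it across all samples, with HDF5.get broadcasting the row on read. A hedged usage sketch, with hypothetical paths, following the docstring example above:

    import glob
    from os.path import join
    from torchmdnet.utils import write_as_hdf5

    tmpdir = "/tmp/npy_inputs"  # hypothetical directory of Custom-style .npy files
    files = {
        "pos": sorted(glob.glob(join(tmpdir, "coords*"))),
        "z": sorted(glob.glob(join(tmpdir, "embed*"))),
        "y": sorted(glob.glob(join(tmpdir, "energy*"))),
    }
    # Store the embedding row once per group instead of tiling it num_samples times.
    write_as_hdf5(files, join(tmpdir, "test.hdf5"), tile_embed=False)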