Skip to content

Commit ef39047

Browse files
authored
Implement skip if file exists (#85)
* Fix skip if exists * Fix link * Update README.md
1 parent 7f14143 commit ef39047

File tree

5 files changed

+7731
-6251
lines changed

5 files changed

+7731
-6251
lines changed

README.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55
[![docs](https://github.com/worldbank/blackmarblepy/actions/workflows/gh-pages.yml/badge.svg)](https://github.com/worldbank/blackmarblepy/actions/workflows/gh-pages.yml)
66
[![tests](https://github.com/worldbank/blackmarblepy/actions/workflows/tests.yml/badge.svg)](https://github.com/worldbank/blackmarblepy/actions/workflows/tests.yml)
77
[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/worldbank/blackmarblepy/main.svg)](https://results.pre-commit.ci/latest/github/worldbank/blackmarblepy/main)
8-
[![downloads](https://static.pepy.tech/badge/blackmarblepy/month)](https://pepy.tech/project/blackmarblepy)
98
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10667907.svg)](https://zenodo.org/doi/10.5281/zenodo.10667907)
9+
[![Downloads](https://static.pepy.tech/badge/blackmarblepy)](https://pepy.tech/project/blackmarblepy)
1010
[![GitHub Repo stars](https://img.shields.io/github/stars/worldbank/blackmarblepy)](https://github.com/worldbank/blackmarblepy)
1111

12-
**BlackMarblePy** is a Python package that provides a simple way to use nighttime lights data from NASA's Black Marble project. [Black Marble](https://blackmarble.gsfc.nasa.gov) is a [NASA Earth Science Data Systems (ESDS)](https://www.earthdata.nasa.gov) project that provides a product suite of daily, monthly and yearly global [nighttime lights](https://www.earthdata.nasa.gov/learn/backgrounders/nighttime-lights). This package automates the process of downloading all relevant tiles from the [NASA LAADS DAAC](https://www.earthdata.nasa.gov/eosdis/daacs/laads) to cover a region of interest, converting the raw files (in HDF5 format), to georeferenced rasters, and mosaicing rasters together when needed.
12+
**BlackMarblePy** is a Python package that provides a simple way to use nighttime lights data from NASA's Black Marble project. [Black Marble](https://blackmarble.gsfc.nasa.gov) is a [NASA Earth Science Data Systems (ESDS)](https://www.earthdata.nasa.gov) project that provides a product suite of daily, monthly and yearly global [nighttime lights](https://www.earthdata.nasa.gov/learn/backgrounders/nighttime-lights). This package automates the process of downloading all relevant tiles from the [NASA LAADS DAAC](https://www.earthdata.nasa.gov/eosdis/daacs/laads) to cover a region of interest, converting the raw files (in HDF5 format) to georeferenced rasters, and mosaicking rasters together when needed.
1313

1414
## Features
1515

@@ -140,7 +140,7 @@ Robert Marty
140140

141141
## Citation
142142

143-
When using **BlackMarblePy**, your support is much appreciated! Please consider using the following citation or download [bibliography.bib](bibliography.bib):
143+
When using **BlackMarblePy**, your support is much appreciated! Please consider using the following citation or download [bibliography.bib](https://raw.githubusercontent.com/worldbank/blackmarblepy/main/docs/bibliography.bib):
144144

145145
```bibtex
146146
@misc{blackmarblepy,

notebooks/blackmarblepy.ipynb

+7,676-6,196
Large diffs are not rendered by default.

src/blackmarble/download.py

+36-24
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ async def get_manifest(
134134
def _download_file(
135135
self,
136136
name: str,
137+
skip_if_exists: bool = True,
137138
):
138139
"""Download NASA Black Marble file
139140
@@ -150,25 +151,25 @@ def _download_file(
150151
url = f"{self.URL}{name}"
151152
name = name.split("/")[-1]
152153

153-
with open(filename := Path(self.directory, name), "wb+") as f:
154-
with httpx.stream(
155-
"GET",
156-
url,
157-
headers={"Authorization": f"Bearer {self.bearer}"},
158-
) as response:
159-
total = int(response.headers["Content-Length"])
160-
with tqdm(
161-
total=total,
162-
unit="B",
163-
unit_scale=True,
164-
leave=None,
165-
) as pbar:
166-
pbar.set_description(f"Retrieving {name}...")
167-
for chunk in response.iter_raw():
168-
f.write(chunk)
169-
pbar.update(len(chunk))
170-
171-
return filename
154+
if not (filename := Path(self.directory, name)).exists() or not skip_if_exists:
155+
with open(filename, "wb+") as f:
156+
with httpx.stream(
157+
"GET",
158+
url,
159+
headers={"Authorization": f"Bearer {self.bearer}"},
160+
) as response:
161+
total = int(response.headers["Content-Length"])
162+
with tqdm(
163+
total=total,
164+
unit="B",
165+
unit_scale=True,
166+
leave=None,
167+
) as pbar:
168+
pbar.set_description(f"Downloading {name}...")
169+
for chunk in response.iter_raw():
170+
f.write(chunk)
171+
pbar.update(len(chunk))
172+
return filename
172173

173174
def download(
174175
self,
@@ -177,12 +178,13 @@ def download(
177178
date_range: List[datetime.date],
178179
skip_if_exists: bool = True,
179180
):
180-
"""Download (in parallel) from NASA Black Marble archive
181+
"""
182+
Downloads files asynchronously from NASA Black Marble archive.
181183
182184
Parameters
183185
----------
184186
gdf: geopandas.GeoDataFrame
185-
Region of Interest
187+
Region of Interest. Converted to EPSG:4326 and intersected with Black Marble tiles
186188
187189
product: Product
188190
NASA Black Marble Product ID (e.g., VNP46A1)
@@ -192,22 +194,32 @@ def download(
192194
193195
skip_if_exists: bool, default=True
194196
Whether to skip downloading data if file already exists
197+
198+
Returns
199+
-------
200+
list: List[pathlib.Path]
201+
List of downloaded H5 filenames.
195202
"""
203+
# Convert to EPSG:4326 and intersect with self.TILES
196204
gdf = geopandas.overlay(
197205
gdf.to_crs("EPSG:4326").dissolve(), self.TILES, how="intersection"
198206
)
199207

208+
# Fetch manifest data asynchronously
200209
bm_files_df = asyncio.run(self.get_manifest(gdf, product_id, date_range))
210+
211+
# Filter files to those intersecting with Black Marble tiles
201212
bm_files_df = bm_files_df[
202213
bm_files_df["name"].str.contains("|".join(gdf["TileID"]))
203214
]
204-
names = bm_files_df["fileURL"].tolist()
205215

206-
args = [(name,) for name in names]
216+
# Prepare arguments for parallel download
217+
names = bm_files_df["fileURL"].tolist()
218+
args = [(name, skip_if_exists) for name in names]
207219
return pqdm(
208220
args,
209221
self._download_file,
210-
n_jobs=16,
222+
n_jobs=4, # os.cpu_count(),
211223
argument_type="args",
212224
desc="Downloading...",
213225
)

src/blackmarble/extract.py

+7-12
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,8 @@ def bm_extract(
2020
variable: Optional[str] = None,
2121
drop_values_by_quality_flag: List[int] = [],
2222
check_all_tiles_exist: bool = True,
23-
file_directory: Optional[Path] = None,
24-
file_prefix: Optional[str] = None,
25-
file_skip_if_exists: bool = True,
23+
output_directory: Optional[Path] = None,
24+
output_skip_if_exists: bool = True,
2625
):
2726
"""Extract and aggregate nighttime lights zonal statistics from `NASA Black Marble <https://blackmarble.gsfc.nasa.gov>`_.
2827
@@ -76,13 +75,10 @@ def bm_extract(
7675
check_all_tiles_exist: bool, default=True
7776
Check whether all Black Marble nighttime light tiles exist for the region of interest. Sometimes not all tiles are available, so the full region of interest may not be covered. By default (True), it skips cases where not all tiles are available.
7877
79-
file_directory: pathlib.Path, optional
80-
Where to produce output. By default, the output will be produced onto a temporary directory.
78+
output_directory: pathlib.Path, optional
79+
Directory to produce output. By default, the output will be produced onto a temporary directory.
8180
82-
file_directory_prefix: str, optional
83-
Prefix
84-
85-
file_skip_if_exists: bool, default=True
81+
output_skip_if_exists: bool, default=True
8682
Whether to skip downloading or extracting data if the data file for that date already exists.
8783
8884
bearer
@@ -102,9 +98,8 @@ def bm_extract(
10298
variable,
10399
drop_values_by_quality_flag,
104100
check_all_tiles_exist,
105-
file_directory,
106-
file_prefix,
107-
file_skip_if_exists,
101+
output_directory,
102+
output_skip_if_exists,
108103
)
109104

110105
results = []

src/blackmarble/raster.py

+9-16
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,6 @@ def h5_to_geotiff(
140140
variable: str = None,
141141
drop_values_by_quality_flag: List[int] = [255],
142142
output_directory: Path = None,
143-
output_prefix: str = None,
144143
):
145144
"""
146145
Convert HDF5 file to GeoTIFF for a selected (or default) variable from NASA Black Marble data
@@ -164,9 +163,6 @@ def h5_to_geotiff(
164163
output_directory : Path, optional
165164
Directory to save the output GeoTIFF file. If None, uses the same directory as input file.
166165
167-
output_prefix : str, optional
168-
Prefix for the output file name. If None, uses the input file name.
169-
170166
Returns
171167
-------
172168
output_path: Path
@@ -265,9 +261,8 @@ def bm_raster(
265261
variable: Optional[str] = None,
266262
drop_values_by_quality_flag: List[int] = [],
267263
check_all_tiles_exist: bool = True,
268-
file_directory: Optional[Path] = None,
269-
file_prefix: Optional[str] = None,
270-
file_skip_if_exists: bool = True,
264+
output_directory: Optional[Path] = None,
265+
output_skip_if_exists: bool = True,
271266
):
272267
"""Create a stack of nighttime lights rasters by retrieving from `NASA Black Marble <https://blackmarble.gsfc.nasa.gov>`_ data.
273268
@@ -318,13 +313,10 @@ def bm_raster(
318313
check_all_tiles_exist: bool, default=True
319314
Check whether all Black Marble nighttime light tiles exist for the region of interest. Sometimes not all tiles are available, so the full region of interest may not be covered. By default (True), it skips cases where not all tiles are available.
320315
321-
file_directory: pathlib.Path, optional
322-
Where to produce output. By default, the output will be produced onto a temporary directory.
323-
324-
file_prefix: str, optional
325-
Prefix
316+
output_directory: pathlib.Path, optional
317+
Directory to produce output. By default, the output will be produced onto a temporary directory.
326318
327-
file_skip_if_exists: bool, default=True
319+
output_skip_if_exists: bool, default=True
328320
Whether to skip downloading or extracting data if the data file for that date already exists.
329321
330322
Returns
@@ -348,9 +340,11 @@ def bm_raster(
348340
date_range = sorted(set([d.replace(day=1, month=1) for d in date_range]))
349341

350342
# Download and construct Dataset
351-
with file_directory if file_directory else tempfile.TemporaryDirectory() as d:
343+
with output_directory if output_directory else tempfile.TemporaryDirectory() as d:
352344
downloader = BlackMarbleDownloader(bearer, d)
353-
pathnames = downloader.download(gdf, product_id, date_range)
345+
pathnames = downloader.download(
346+
gdf, product_id, date_range, output_skip_if_exists
347+
)
354348

355349
datasets = []
356350
for date in tqdm(date_range, desc="COLLATING RESULTS | Processing..."):
@@ -364,7 +358,6 @@ def bm_raster(
364358
f,
365359
variable=variable,
366360
drop_values_by_quality_flag=drop_values_by_quality_flag,
367-
output_prefix=file_prefix,
368361
output_directory=d,
369362
),
370363
)

0 commit comments

Comments
 (0)