.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        Click :ref:`here <sphx_glr_download_examples_apps_datapreproc_datapreproc.py>`
        to download the full example code

.. rst-class:: sphx-glr-example-title

.. _sphx_glr_examples_apps_datapreproc_datapreproc.py:


Data Preprocessing App Example
====================================

This is a simple TorchX app that downloads some data via HTTP, normalizes the
images via torchvision and then reuploads it via fsspec.

.. code-block:: default


    import argparse
    import os
    import sys
    import tarfile
    import tempfile
    import zipfile
    from typing import List

    import fsspec
    from PIL import Image
    from torchvision import transforms
    from torchvision.datasets.folder import is_image_file
    from tqdm import tqdm


    def parse_args(argv: List[str]) -> argparse.Namespace:
        """Parse the command-line arguments of the preprocessing app.

        Args:
            argv: Argument strings, without the program name.

        Returns:
            A namespace with ``input_path`` (defaults to the tiny-imagenet-200
            zip URL) and ``output_path`` (required).
        """
        parser = argparse.ArgumentParser(
            description="example data preprocessing",
        )
        parser.add_argument(
            "--input_path",
            type=str,
            help="dataset to download",
            default="http://cs231n.stanford.edu/tiny-imagenet-200.zip",
        )
        parser.add_argument(
            "--output_path",
            type=str,
            help="remote path to save the .tar.gz data to",
            required=True,
        )
        return parser.parse_args(argv)


    def download_and_extract_zip_archive(url: str, path: str) -> None:
        """Open the zip archive at ``url`` via fsspec and extract it into ``path``.

        Args:
            url: Location of the zip archive; opened with ``fsspec.open``.
            path: Local directory the archive members are extracted into.
        """
        with fsspec.open(url, "rb") as f:
            with zipfile.ZipFile(f, "r") as zip_ref:
                zip_ref.extractall(path)


    def main(argv: List[str]) -> None:
        """Download, normalize, re-pack and upload the image dataset.

        All intermediate files live in a temporary directory that is removed
        when the function returns.

        Args:
            argv: Command-line arguments, see :func:`parse_args`.

        Raises:
            ValueError: If ``--output_path`` does not resolve to exactly one
                remote path.
        """
        args = parse_args(argv)
        with tempfile.TemporaryDirectory() as tmpdir:
            print(f"downloading {args.input_path} to {tmpdir}...")
            download_and_extract_zip_archive(args.input_path, tmpdir)

            # NOTE(review): assumes the archive unpacks into one top-level
            # directory named after the archive file minus its extension
            # (e.g. "tiny-imagenet-200") -- confirm for other inputs.
            img_root = os.path.join(
                tmpdir,
                os.path.splitext(os.path.basename(args.input_path))[0],
            )
            print(f"img_root={img_root}")

            print("transforming images...")
            # NOTE(review): Normalize runs between ToTensor and ToPILImage;
            # confirm the value range handed to ToPILImage is the intended one.
            transform = transforms.Compose(
                [
                    transforms.ToTensor(),
                    transforms.Normalize((0.5,), (0.5,)),
                    transforms.ToPILImage(),
                ]
            )

            image_files = []
            for root, _, fnames in os.walk(img_root):
                for fname in fnames:
                    path = os.path.join(root, fname)
                    if is_image_file(path):
                        image_files.append(path)

            for path in tqdm(image_files, miniters=len(image_files) // 2000):
                # Close the source file before overwriting it in place.
                with Image.open(path) as img:
                    normalized = transform(img)
                normalized.save(path)

            tar_path = os.path.join(tmpdir, "out.tar.gz")
            print(f"packing images into {tar_path}...")
            with tarfile.open(tar_path, mode="w:gz") as tar:
                # arcname="" puts the contents of img_root at the archive root.
                tar.add(img_root, arcname="")

            print(f"uploading dataset to {args.output_path}...")
            fs, _, rpaths = fsspec.get_fs_token_paths(args.output_path)
            # Explicit check instead of ``assert`` so it survives ``python -O``.
            if len(rpaths) != 1:
                raise ValueError("must have single output path")
            if fs.exists(rpaths[0]):
                fs.rm(rpaths[0])
            fs.put(tar_path, rpaths[0])


    # Skip the automatic run when a ``NOTEBOOK`` global has been defined.
    if __name__ == "__main__" and "NOTEBOOK" not in globals():
        main(sys.argv[1:])


    # sphinx_gallery_thumbnail_path = '_static/img/gallery-app.png'


.. rst-class:: sphx-glr-timing

   **Total running time of the script:** ( 0 minutes 0.000 seconds)


.. _sphx_glr_download_examples_apps_datapreproc_datapreproc.py:


.. only :: html

    .. container:: sphx-glr-footer
        :class: sphx-glr-footer-example


        .. container:: sphx-glr-download sphx-glr-download-python

            :download:`Download Python source code: datapreproc.py <datapreproc.py>`


        .. container:: sphx-glr-download sphx-glr-download-jupyter

            :download:`Download Jupyter notebook: datapreproc.ipynb <datapreproc.ipynb>`


.. only:: html

    .. rst-class:: sphx-glr-signature

        `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_