This repository was archived by the owner on Nov 1, 2024. It is now read-only.
inference_tutorial.ipynb (45 changes: 20 additions & 25 deletions)
@@ -50,7 +50,7 @@
"import sys\n",
"\n",
"\n",
-"!{sys.executable} -m pip install einops pytorchvideo timm -q\n",
+"!{sys.executable} -m pip install einops pytorchvideo timm hydra -q\n",
"\n",
"# only needed for the tutorial\n",
"# if the video rendering doesn't work, restart the kernel after installation\n",
@@ -74,8 +74,6 @@
},
"outputs": [],
"source": [
-"import os \n",
-"\n",
"try:\n",
" from omnivore.transforms import SpatialCrop, TemporalCrop, DepthNorm\n",
"except:\n",
@@ -87,10 +85,8 @@
"\n",
"import csv\n",
"import json\n",
-"from typing import List\n",
"\n",
"import torch\n",
-"import torch.nn.functional as F\n",
"import torchvision.transforms as T\n",
"from PIL import Image\n",
"from pytorchvideo.data.encoded_video import EncodedVideo\n",
Expand All @@ -104,7 +100,6 @@
"\n",
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.image as mpimg\n",
"from ipywidgets import Video"
]
},
@@ -134,9 +129,9 @@
"source": [
"# Device on which to run the model\n",
"# Set to cuda to load on GPU\n",
-"device = \"cuda\" if torch.cuda.is_available() else \"cpu\" \n",
+"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"\n",
-"# Pick a pretrained model \n",
+"# Pick a pretrained model\n",
"model_name = \"omnivore_swinB\"\n",
"model = torch.hub.load(\"facebookresearch/omnivore:main\", model=model_name, force_reload=True)\n",
"\n",
@@ -185,7 +180,7 @@
"# Create an id to label name mapping\n",
"imagenet_id_to_classname = {}\n",
"for k, v in imagenet_classnames.items():\n",
-" imagenet_id_to_classname[k] = v[1] "
+" imagenet_id_to_classname[k] = v[1]"
]
},
{
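The two hunks above only strip trailing whitespace, but they sit in the cells that load the pretrained checkpoint and build the ImageNet id-to-name mapping. A minimal sketch of how those cells fit together; the .to(device)/.eval() calls and the imagenet_class_index.json layout are assumptions based on the surrounding tutorial, not lines shown in this diff:

```python
import json
import torch

# Prefer the GPU when available, as in the tutorial
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load a pretrained Omnivore model through the repository's torch.hub entry point
model_name = "omnivore_swinB"
model = torch.hub.load("facebookresearch/omnivore:main", model=model_name, force_reload=True)

# Assumed follow-up (outside these hunks): move to the device and switch to inference mode
model = model.to(device)
model.eval()

# Assumes the usual imagenet_class_index.json layout: {"0": ["n01440764", "tench"], ...}
with open("imagenet_class_index.json") as f:
    imagenet_classnames = json.load(f)
imagenet_id_to_classname = {k: v[1] for k, v in imagenet_classnames.items()}
```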
@@ -348,8 +343,8 @@
" key=\"video\",\n",
" transform=T.Compose(\n",
" [\n",
-" UniformTemporalSubsample(num_frames), \n",
-" T.Lambda(lambda x: x / 255.0), \n",
+" UniformTemporalSubsample(num_frames),\n",
+" T.Lambda(lambda x: x / 255.0),\n",
" ShortSideScale(size=224),\n",
" NormalizeVideo(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n",
" TemporalCrop(frames_per_clip=32, stride=40),\n",
@@ -390,7 +385,7 @@
"outputs": [],
"source": [
"# Load the example video\n",
-"video_path = \"dance.mp4\" \n",
+"video_path = \"dance.mp4\"\n",
"\n",
"Video.from_file(video_path, width=500)"
]
@@ -406,7 +401,7 @@
"# We crop the video to a smaller resolution and duration to save RAM\n",
"!ffmpeg -y -ss 0 -i dance.mp4 -filter:v scale=224:-1 -t 1 -v 0 dance_cropped.mp4\n",
"\n",
-"video_path = \"dance_cropped.mp4\" "
+"video_path = \"dance_cropped.mp4\""
]
},
{
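Between the ffmpeg crop above and the clip-selection hunk below, the tutorial decodes the cropped file and applies the "video"-keyed transform defined earlier. A short sketch of that step; the video_transform name and the one-second clip window are assumptions drawn from the surrounding notebook, not from these hunks:

```python
from pytorchvideo.data.encoded_video import EncodedVideo

# Decode the cropped clip and run it through the ApplyTransformToKey pipeline above
video = EncodedVideo.from_path(video_path)             # "dance_cropped.mp4"
clip = video.get_clip(start_sec=0.0, end_sec=1.0)      # the ffmpeg step trims to ~1 s
video_data = video_transform(clip)                     # video_transform: the transform built above
```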
@@ -430,7 +425,7 @@
"# Move the inputs to the desired device\n",
"video_inputs = video_data[\"video\"]\n",
"\n",
-"# Take the first clip \n",
+"# Take the first clip\n",
"# The model expects inputs of shape: B x C x T x H x W\n",
"video_input = video_inputs[0][None, ...]"
]
@@ -452,11 +447,11 @@
},
"outputs": [],
"source": [
-"# Pass the input clip through the model \n",
+"# Pass the input clip through the model\n",
"with torch.no_grad():\n",
" prediction = model(video_input.to(device), input_type=\"video\")\n",
"\n",
-" # Get the predicted classes \n",
+" # Get the predicted classes\n",
" pred_classes = prediction.topk(k=5).indices\n",
"\n",
"# Map the predicted classes to the label names\n",
@@ -531,7 +526,7 @@
" T.Resize(224),\n",
" T.CenterCrop(224),\n",
" T.Normalize(\n",
-" mean=[0.485, 0.456, 0.406, 0.0418], \n",
+" mean=[0.485, 0.456, 0.406, 0.0418],\n",
" std=[0.229, 0.224, 0.225, 0.0295]\n",
" ),\n",
" ]\n",
@@ -559,7 +554,7 @@
"source": [
"# Download the example image and disparity file\n",
"!wget -O store.png https://upload.wikimedia.org/wikipedia/commons/thumb/f/f4/Interior_of_the_IKEA_B%C4%83neasa_33.jpg/791px-Interior_of_the_IKEA_B%C4%83neasa_33.jpg\n",
-"!wget -O store_disparity.pt https://dl.fbaipublicfiles.com/omnivore/example_data/store_disparity.pt "
+"!wget -O store_disparity.pt https://dl.fbaipublicfiles.com/omnivore/example_data/store_disparity.pt"
]
},
{
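The 4-channel Normalize above and the download cell here belong to the single-view RGBD example: the RGB image and its disparity map are stacked into one 4-channel tensor before normalization and classification. A sketch under stated assumptions; the stacking order, the disparity tensor shape, the rgbd_transform name, and the input_type value come from the broader tutorial rather than from these hunks:

```python
import torch
import torchvision.transforms as T
from PIL import Image

# Build a 4-channel RGB + disparity tensor for the depth-aware model
image = T.ToTensor()(Image.open("store.png").convert("RGB"))  # 3 x H x W, values in [0, 1]
disparity = torch.load("store_disparity.pt").float()          # assumed to be H x W
rgbd = torch.cat([image, disparity[None, ...]], dim=0)        # 4 x H x W

# rgbd_transform: the Resize / CenterCrop / 4-channel Normalize pipeline from the earlier hunk
rgbd_input = rgbd_transform(rgbd)[None, ...]                  # 1 x 4 x H x W

with torch.no_grad():
    prediction = model(rgbd_input.to(device), input_type="rgbd")
pred_classes = prediction.topk(k=5).indices
```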
@@ -731,8 +726,8 @@
" key=\"video\",\n",
" transform=T.Compose(\n",
" [\n",
-" UniformTemporalSubsample(num_frames), \n",
-" T.Lambda(lambda x: x / 255.0), \n",
+" UniformTemporalSubsample(num_frames),\n",
+" T.Lambda(lambda x: x / 255.0),\n",
" ShortSideScale(size=224),\n",
" NormalizeVideo(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n",
" TemporalCrop(frames_per_clip=32, stride=40),\n",
@@ -773,7 +768,7 @@
"outputs": [],
"source": [
"# Load the example video\n",
-"video_path = \"epic.mp4\" \n",
+"video_path = \"epic.mp4\"\n",
"\n",
"Video.from_file(video_path, width=500)"
]
@@ -789,7 +784,7 @@
"# We crop the video to a smaller resolution and duration to save RAM\n",
"!ffmpeg -y -ss 0 -i epic.mp4 -filter:v scale=224:-1 -t 1 -v 0 epic_cropped.mp4\n",
"\n",
-"video_path = \"epic_cropped.mp4\" "
+"video_path = \"epic_cropped.mp4\""
]
},
{
@@ -812,7 +807,7 @@
"# Move the inputs to the desired device\n",
"video_inputs = video_data[\"video\"]\n",
"\n",
-"# Take the first clip \n",
+"# Take the first clip\n",
"# The model expects inputs of shape: B x C x T x H x W\n",
"video_input = video_inputs[0][None, ...]"
]
@@ -834,11 +829,11 @@
},
"outputs": [],
"source": [
-"# Pass the input clip through the model \n",
+"# Pass the input clip through the model\n",
"with torch.no_grad():\n",
" prediction = model(video_input.to(device), input_type=\"video\")\n",
"\n",
-" # Get the predicted classes \n",
+" # Get the predicted classes\n",
" pred_classes = prediction.topk(k=5).indices\n",
"\n",
"# Map the predicted classes to the label names\n",
omnimae/README.md (4 changes: 2 additions & 2 deletions)
@@ -54,8 +54,8 @@ If this work is helpful in your research, please consider starring :star: us and
```

## Contributing
-We welcome your pull requests! Please see [CONTRIBUTING](CONTRIBUTING.md) and [CODE_OF_CONDUCT](CODE_OF_CONDUCT.md) for more information.
+We welcome your pull requests! Please see [CONTRIBUTING](../CONTRIBUTING.md) and [CODE_OF_CONDUCT](../CODE_OF_CONDUCT.md) for more information.

## License
-OmniMAE is released under the CC-BY-NC 4.0 license. See [LICENSE](LICENSE) for additional details. However the Swin Transformer implementation is additionally licensed under the Apache 2.0 license (see [NOTICE](NOTICE) for additional details).
+OmniMAE is released under the CC-BY-NC 4.0 license. See [LICENSE](../LICENSE) for additional details. However the Swin Transformer implementation is additionally licensed under the Apache 2.0 license (see [NOTICE](../NOTICE) for additional details).

omnivore/README.md (4 changes: 2 additions & 2 deletions)
@@ -116,8 +116,8 @@ If this work is helpful in your research, please consider starring :star: us and
```

## Contributing
-We welcome your pull requests! Please see [CONTRIBUTING](CONTRIBUTING.md) and [CODE_OF_CONDUCT](CODE_OF_CONDUCT.md) for more information.
+We welcome your pull requests! Please see [CONTRIBUTING](../CONTRIBUTING.md) and [CODE_OF_CONDUCT](../CODE_OF_CONDUCT.md) for more information.

## License
-Omnivore is released under the CC-BY-NC 4.0 license. See [LICENSE](LICENSE) for additional details. However the Swin Transformer implementation is additionally licensed under the Apache 2.0 license (see [NOTICE](NOTICE) for additional details).
+Omnivore is released under the CC-BY-NC 4.0 license. See [LICENSE](../LICENSE) for additional details. However the Swin Transformer implementation is additionally licensed under the Apache 2.0 license (see [NOTICE](../NOTICE) for additional details).