This repository was archived by the owner on Nov 1, 2024. It is now read-only.
inference_tutorial.ipynb (45 changes: 20 additions & 25 deletions)
@@ -50,7 +50,7 @@
"import sys\n",
"\n",
"\n",
-"!{sys.executable} -m pip install einops pytorchvideo timm -q\n",
+"!{sys.executable} -m pip install einops pytorchvideo timm hydra -q\n",
"\n",
"# only needed for the tutorial\n",
"# if the video rendering doesn't work, restart the kernel after installation\n",
@@ -74,8 +74,6 @@
},
"outputs": [],
"source": [
-"import os \n",
-"\n",
"try:\n",
" from omnivore.transforms import SpatialCrop, TemporalCrop, DepthNorm\n",
"except:\n",
@@ -87,10 +85,8 @@
"\n",
"import csv\n",
"import json\n",
-"from typing import List\n",
"\n",
"import torch\n",
-"import torch.nn.functional as F\n",
"import torchvision.transforms as T\n",
"from PIL import Image\n",
"from pytorchvideo.data.encoded_video import EncodedVideo\n",
Expand All @@ -104,7 +100,6 @@
"\n",
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.image as mpimg\n",
"from ipywidgets import Video"
]
},
@@ -134,9 +129,9 @@
"source": [
"# Device on which to run the model\n",
"# Set to cuda to load on GPU\n",
-"device = \"cuda\" if torch.cuda.is_available() else \"cpu\" \n",
+"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"\n",
-"# Pick a pretrained model \n",
+"# Pick a pretrained model\n",
"model_name = \"omnivore_swinB\"\n",
"model = torch.hub.load(\"facebookresearch/omnivore:main\", model=model_name, force_reload=True)\n",
"\n",
@@ -185,7 +180,7 @@
"# Create an id to label name mapping\n",
"imagenet_id_to_classname = {}\n",
"for k, v in imagenet_classnames.items():\n",
-" imagenet_id_to_classname[k] = v[1] "
+" imagenet_id_to_classname[k] = v[1]"
]
},
{
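The two hunks above only strip trailing whitespace, but they sit in the cells that load the pretrained checkpoint and build the ImageNet id-to-name mapping. A minimal sketch of how those cells fit together; the .to(device)/.eval() calls and the imagenet_class_index.json layout are assumptions based on the surrounding tutorial, not lines shown in this diff:

```python
import json
import torch

# Prefer the GPU when available, as in the tutorial
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load a pretrained Omnivore model through the repository's torch.hub entry point
model_name = "omnivore_swinB"
model = torch.hub.load("facebookresearch/omnivore:main", model=model_name, force_reload=True)

# Assumed follow-up (outside these hunks): move to the device and switch to inference mode
model = model.to(device)
model.eval()

# Assumes the usual imagenet_class_index.json layout: {"0": ["n01440764", "tench"], ...}
with open("imagenet_class_index.json") as f:
    imagenet_classnames = json.load(f)
imagenet_id_to_classname = {k: v[1] for k, v in imagenet_classnames.items()}
```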
@@ -348,8 +343,8 @@
" key=\"video\",\n",
" transform=T.Compose(\n",
" [\n",
-" UniformTemporalSubsample(num_frames), \n",
-" T.Lambda(lambda x: x / 255.0), \n",
+" UniformTemporalSubsample(num_frames),\n",
+" T.Lambda(lambda x: x / 255.0),\n",
" ShortSideScale(size=224),\n",
" NormalizeVideo(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n",
" TemporalCrop(frames_per_clip=32, stride=40),\n",
@@ -390,7 +385,7 @@
"outputs": [],
"source": [
"# Load the example video\n",
-"video_path = \"dance.mp4\" \n",
+"video_path = \"dance.mp4\"\n",
"\n",
"Video.from_file(video_path, width=500)"
]
@@ -406,7 +401,7 @@
"# We crop the video to a smaller resolution and duration to save RAM\n",
"!ffmpeg -y -ss 0 -i dance.mp4 -filter:v scale=224:-1 -t 1 -v 0 dance_cropped.mp4\n",
"\n",
-"video_path = \"dance_cropped.mp4\" "
+"video_path = \"dance_cropped.mp4\""
]
},
{
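Between the ffmpeg crop above and the clip-selection hunk below, the tutorial decodes the cropped file and applies the "video"-keyed transform defined earlier. A short sketch of that step; the video_transform name and the one-second clip window are assumptions drawn from the surrounding notebook, not from these hunks:

```python
from pytorchvideo.data.encoded_video import EncodedVideo

# Decode the cropped clip and run it through the ApplyTransformToKey pipeline above
video = EncodedVideo.from_path(video_path)             # "dance_cropped.mp4"
clip = video.get_clip(start_sec=0.0, end_sec=1.0)      # the ffmpeg step trims to ~1 s
video_data = video_transform(clip)                     # video_transform: the transform built above
```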
@@ -430,7 +425,7 @@
"# Move the inputs to the desired device\n",
"video_inputs = video_data[\"video\"]\n",
"\n",
-"# Take the first clip \n",
+"# Take the first clip\n",
"# The model expects inputs of shape: B x C x T x H x W\n",
"video_input = video_inputs[0][None, ...]"
]
@@ -452,11 +447,11 @@
},
"outputs": [],
"source": [
-"# Pass the input clip through the model \n",
+"# Pass the input clip through the model\n",
"with torch.no_grad():\n",
" prediction = model(video_input.to(device), input_type=\"video\")\n",
"\n",
-" # Get the predicted classes \n",
+" # Get the predicted classes\n",
" pred_classes = prediction.topk(k=5).indices\n",
"\n",
"# Map the predicted classes to the label names\n",
@@ -531,7 +526,7 @@
" T.Resize(224),\n",
" T.CenterCrop(224),\n",
" T.Normalize(\n",
-" mean=[0.485, 0.456, 0.406, 0.0418], \n",
+" mean=[0.485, 0.456, 0.406, 0.0418],\n",
" std=[0.229, 0.224, 0.225, 0.0295]\n",
" ),\n",
" ]\n",
@@ -559,7 +554,7 @@
"source": [
"# Download the example image and disparity file\n",
"!wget -O store.png https://upload.wikimedia.org/wikipedia/commons/thumb/f/f4/Interior_of_the_IKEA_B%C4%83neasa_33.jpg/791px-Interior_of_the_IKEA_B%C4%83neasa_33.jpg\n",
-"!wget -O store_disparity.pt https://dl.fbaipublicfiles.com/omnivore/example_data/store_disparity.pt "
+"!wget -O store_disparity.pt https://dl.fbaipublicfiles.com/omnivore/example_data/store_disparity.pt"
]
},
{
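The 4-channel Normalize above and the download cell here belong to the single-view RGBD example: the RGB image and its disparity map are stacked into one 4-channel tensor before normalization and classification. A sketch under stated assumptions; the stacking order, the disparity tensor shape, the rgbd_transform name, and the input_type value come from the broader tutorial rather than from these hunks:

```python
import torch
import torchvision.transforms as T
from PIL import Image

# Build a 4-channel RGB + disparity tensor for the depth-aware model
image = T.ToTensor()(Image.open("store.png").convert("RGB"))  # 3 x H x W, values in [0, 1]
disparity = torch.load("store_disparity.pt").float()          # assumed to be H x W
rgbd = torch.cat([image, disparity[None, ...]], dim=0)        # 4 x H x W

# rgbd_transform: the Resize / CenterCrop / 4-channel Normalize pipeline from the earlier hunk
rgbd_input = rgbd_transform(rgbd)[None, ...]                  # 1 x 4 x H x W

with torch.no_grad():
    prediction = model(rgbd_input.to(device), input_type="rgbd")
pred_classes = prediction.topk(k=5).indices
```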
@@ -731,8 +726,8 @@
" key=\"video\",\n",
" transform=T.Compose(\n",
" [\n",
-" UniformTemporalSubsample(num_frames), \n",
-" T.Lambda(lambda x: x / 255.0), \n",
+" UniformTemporalSubsample(num_frames),\n",
+" T.Lambda(lambda x: x / 255.0),\n",
" ShortSideScale(size=224),\n",
" NormalizeVideo(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n",
" TemporalCrop(frames_per_clip=32, stride=40),\n",
@@ -773,7 +768,7 @@
"outputs": [],
"source": [
"# Load the example video\n",
-"video_path = \"epic.mp4\" \n",
+"video_path = \"epic.mp4\"\n",
"\n",
"Video.from_file(video_path, width=500)"
]
@@ -789,7 +784,7 @@
"# We crop the video to a smaller resolution and duration to save RAM\n",
"!ffmpeg -y -ss 0 -i epic.mp4 -filter:v scale=224:-1 -t 1 -v 0 epic_cropped.mp4\n",
"\n",
-"video_path = \"epic_cropped.mp4\" "
+"video_path = \"epic_cropped.mp4\""
]
},
{
@@ -812,7 +807,7 @@
"# Move the inputs to the desired device\n",
"video_inputs = video_data[\"video\"]\n",
"\n",
-"# Take the first clip \n",
+"# Take the first clip\n",
"# The model expects inputs of shape: B x C x T x H x W\n",
"video_input = video_inputs[0][None, ...]"
]
@@ -834,11 +829,11 @@
},
"outputs": [],
"source": [
-"# Pass the input clip through the model \n",
+"# Pass the input clip through the model\n",
"with torch.no_grad():\n",
" prediction = model(video_input.to(device), input_type=\"video\")\n",
"\n",
-" # Get the predicted classes \n",
+" # Get the predicted classes\n",
" pred_classes = prediction.topk(k=5).indices\n",
"\n",
"# Map the predicted classes to the label names\n",
omnimae/README.md (4 changes: 2 additions & 2 deletions)
@@ -54,8 +54,8 @@ If this work is helpful in your research, please consider starring :star: us and
```

## Contributing
-We welcome your pull requests! Please see [CONTRIBUTING](CONTRIBUTING.md) and [CODE_OF_CONDUCT](CODE_OF_CONDUCT.md) for more information.
+We welcome your pull requests! Please see [CONTRIBUTING](../CONTRIBUTING.md) and [CODE_OF_CONDUCT](../CODE_OF_CONDUCT.md) for more information.

## License
-OmniMAE is released under the CC-BY-NC 4.0 license. See [LICENSE](LICENSE) for additional details. However the Swin Transformer implementation is additionally licensed under the Apache 2.0 license (see [NOTICE](NOTICE) for additional details).
+OmniMAE is released under the CC-BY-NC 4.0 license. See [LICENSE](../LICENSE) for additional details. However the Swin Transformer implementation is additionally licensed under the Apache 2.0 license (see [NOTICE](../NOTICE) for additional details).

omnivore/README.md (4 changes: 2 additions & 2 deletions)
@@ -116,8 +116,8 @@ If this work is helpful in your research, please consider starring :star: us and
```

## Contributing
-We welcome your pull requests! Please see [CONTRIBUTING](CONTRIBUTING.md) and [CODE_OF_CONDUCT](CODE_OF_CONDUCT.md) for more information.
+We welcome your pull requests! Please see [CONTRIBUTING](../CONTRIBUTING.md) and [CODE_OF_CONDUCT](../CODE_OF_CONDUCT.md) for more information.

## License
-Omnivore is released under the CC-BY-NC 4.0 license. See [LICENSE](LICENSE) for additional details. However the Swin Transformer implementation is additionally licensed under the Apache 2.0 license (see [NOTICE](NOTICE) for additional details).
+Omnivore is released under the CC-BY-NC 4.0 license. See [LICENSE](../LICENSE) for additional details. However the Swin Transformer implementation is additionally licensed under the Apache 2.0 license (see [NOTICE](../NOTICE) for additional details).