# Reddit Stash Scraper #3141
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: Reddit Stash Scraper

on:
  schedule:
    # TIMEZONE INFORMATION:
    # - GitHub Actions evaluates all cron schedules in UTC.
    # - CET is UTC+1 (winter); CEST is UTC+2 (summer).
    # - Local -> UTC: subtract your UTC offset
    #   (e.g. for CEST: 14:00 CEST - 2 hours = 12:00 UTC).
    # - UTC -> local: add your UTC offset
    #   (e.g. for CEST: 12:00 UTC + 2 hours = 14:00 CEST).
    #
    # ADJUSTING FOR YOUR TIMEZONE:
    # 1. Determine your timezone's UTC offset
    #    (examples: US Eastern = UTC-4/-5, Japan = UTC+9, UK = UTC+0/+1).
    # 2. For a desired local time X, set the UTC time to X - (your UTC offset).
    # 3. Update both cron expressions below with your calculated UTC times.

    # Higher frequency during peak hours. The range 6-21 with step 2 fires at
    # 06:00, 08:00, ..., 20:00 UTC, i.e. 08:00-22:00 CEST in summer
    # (07:00-21:00 CET in winter).
    - cron: "0 6-21/2 * * *"
    # Lower frequency during off-peak hours: 23:00 and 03:00 UTC, i.e.
    # 01:00 and 05:00 CEST in summer (00:00 and 04:00 CET in winter).
    - cron: "0 23,3 * * *"
  # Manual trigger option
  workflow_dispatch:
# Prevent multiple runs of this workflow (per ref) executing simultaneously.
# With cancel-in-progress set to true, a newly triggered run cancels the run
# that is still in progress rather than waiting for it to finish.
# NOTE(review): a run cancelled before its upload step never reaches that
# step; confirm this is acceptable, or set cancel-in-progress to false so the
# new run waits instead.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  # Single job: scan the repo for secrets, install dependencies, pull the log
  # file from Dropbox, run the Reddit scraper, then push changed files back.
  run-script:
    runs-on: ubuntu-latest
    # Kill the job if it runs longer than 3 hours (needed for extensive media
    # downloads and the recovery cascade).
    timeout-minutes: 180
    # Least privilege for the automatic GITHUB_TOKEN: the steps below only
    # read the repository; Dropbox and Reddit access use their own secrets.
    permissions:
      contents: read
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        timeout-minutes: 1

      # Secret scan; --only-verified restricts the report to verified findings.
      # NOTE(review): "@main" is a mutable ref, so the code executed here can
      # change between runs without any change to this file. Consider pinning
      # to a release tag or commit SHA.
      - name: TruffleHog OSS
        uses: trufflesecurity/trufflehog@main
        with:
          extra_args: --only-verified

      # Build a label of the form YYYY-H1 / YYYY-H2 and expose it as the step
      # output "date" (consumed by the "Install dependencies" step below).
      - name: Create semi-annual cache key
        id: cache-key
        run: |
          # Get current year and determine which half of the year we're in (1 or 2)
          year=$(date +'%Y')
          month=$(date +'%-m')
          half_year=$((month <= 6 ? 1 : 2))
          echo "date=${year}-H${half_year}" >> "$GITHUB_OUTPUT"
          echo "Using cache key: ${year}-H${half_year}"

      # Set up Python with the action's built-in pip caching, keyed on
      # requirements.txt via cache-dependency-path.
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: 'pip'
          cache-dependency-path: requirements.txt

      # Install dependencies with pip
      - name: Install dependencies
        env:
          PIP_DISABLE_PIP_VERSION_CHECK: "1"
          PIP_NO_WARN_SCRIPT_LOCATION: "1"
          PIP_ROOT_USER_ACTION: ignore
          # Exposes the semi-annual label to this step's environment.
          # NOTE(review): nothing visible in this workflow feeds this value
          # into the setup-python cache key, so on its own it presumably does
          # not invalidate the pip cache every 6 months - confirm, and wire it
          # into the cache key if periodic invalidation is intended.
          SEMI_ANNUAL_REFRESH: ${{ steps.cache-key.outputs.date }}
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # Download only the log file from Dropbox
      - name: Download log file from Dropbox
        timeout-minutes: 2
        env:
          DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }}
          DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }}
          DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }}
        run: |
          python dropbox_utils.py --download # downloads only the log file

      # Process Reddit data
      - name: Run Reddit Stash Script
        # Generous timeout for processing large collections, extensive media
        # downloads, and recovery cascade operations.
        timeout-minutes: 90
        env:
          REDDIT_CLIENT_ID: ${{ secrets.REDDIT_CLIENT_ID }}
          REDDIT_CLIENT_SECRET: ${{ secrets.REDDIT_CLIENT_SECRET }}
          REDDIT_USERNAME: ${{ secrets.REDDIT_USERNAME }}
          REDDIT_PASSWORD: ${{ secrets.REDDIT_PASSWORD }}
        run: |
          python reddit_stash.py # process the files

      # Upload any changed files to Dropbox
      - name: Upload updated data to Dropbox
        # Extended timeout for uploading large volumes of media files and
        # markdown content.
        timeout-minutes: 50
        env:
          DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }}
          DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }}
          DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }}
        run: |
          python dropbox_utils.py --upload # uploads only changed files