Wordlist Collection #1591
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scheduled workflow that rebuilds the "news" wordlist from Google Trends
# titles and commits the result.
name: Wordlist Collection

# Controls when the workflow runs.
on:
  schedule:
    - cron: "15 13 * * *" # Every day at 13:15 UTC.
  workflow_dispatch: # Allow manual trigger.

# A workflow run is made up of one or more jobs that can run sequentially or
# in parallel.
jobs:
  # This workflow contains a single job called "wordlist".
  wordlist:
    # The type of runner that the job will run on.
    runs-on: ubuntu-latest

    # Steps represent a sequence of tasks that will be executed as part of
    # the job.
    steps:
      # Check out the repository under $GITHUB_WORKSPACE so the job can
      # access it. v2 targets a retired Node.js runtime; v4 is the maintained
      # release and keeps the same defaults for this usage.
      - uses: actions/checkout@v4
# Install the XML/HTTP/JSON command-line tools, then log the runner's disk
# and memory for troubleshooting.
- name: Update & install dependencies
  run: |
    echo "Installing required packages..."
    # Use apt-get for both commands: the `apt` front end warns that its
    # command-line interface is not stable for use in scripts.
    sudo apt-get update && sudo apt-get install -y libxml2-utils curl jq
    echo "Available disk space:"
    df -h
    echo "Available memory:"
    free -h
# Download the current trending titles for each target country.
- name: Fetch Google Trends data
  run: |
    printf '%s\n' "Fetching Google Trends data..."
# Turn serialized <title> elements on stdin into plain text, one per line.
_trends_clean_titles() {
  awk '{
    gsub(/<\/title>/, "\n")  # split titles even when nodes arrive unseparated
    gsub(/<[^>]*>/, "")      # drop the remaining markup
    gsub(/&lt;/, "<"); gsub(/&gt;/, ">"); gsub(/&quot;/, "\"")
    gsub(/&apos;/, "\047")
    gsub(/&amp;/, "\\&")     # decoded last so "&amp;lt;" yields "&lt;"
    n = split($0, piece, "\n")
    for (i = 1; i <= n; i++) if (piece[i] ~ /[^ \t\r]/) print piece[i]
  }'
}

#######################################
# Fetch the trending-search titles for one country from the Google Trends
# RSS feed and store them, one per line, in a file.
# Arguments:
#   $1 - country code placed in the feed's `geo` query parameter (e.g. FR)
#   $2 - path of the file to (over)write with the titles
# Outputs:
#   Progress messages on stdout; the final failure warning on stderr.
# Returns:
#   0 in all cases. If every attempt yields no titles, the file is left
#   holding the single placeholder line "No trends available".
#######################################
fetch_trends() {
  local country=$1
  local output_file=$2
  local -r max_retries=3
  local attempt

  for ((attempt = 1; attempt <= max_retries; attempt++)); do
    echo "Fetching trends for $country (attempt $attempt)"

    # --fail keeps HTTP error pages out of the parser and --max-time stops a
    # stalled connection from hanging the job. xmllint errors are silenced on
    # purpose: an unparsable or item-less feed just produces an empty file.
    # `|| true`: the emptiness of the file, not the pipeline status, decides
    # whether to retry, so a failing stage must not abort the caller when it
    # runs with errexit/pipefail.
    curl -s -f -A "Mozilla/5.0" --max-time 30 --retry 3 --retry-delay 2 \
      "https://trends.google.com/trending/rss?geo=$country" \
      | xmllint --xpath "//item/title" - 2>/dev/null \
      | _trends_clean_titles > "$output_file" || true

    # Validate content: any non-empty result counts as success.
    if [[ -s "$output_file" ]]; then
      echo "Successfully fetched trends for $country"
      return 0
    fi

    if ((attempt < max_retries)); then
      echo "Retrying in 5 seconds..."
      sleep 5
    fi
  done

  echo "Warning: Failed to fetch trends for $country after $max_retries attempts" >&2
  # Preserve the existing behavior of leaving a placeholder line so the
  # output file is never empty.
  echo "No trends available" > "$output_file"
}
# Fetch trends for each country, pausing 2 seconds between consecutive
# requests (never before the first one).
needs_pause=false
for geo in FR BE CH; do
  if "$needs_pause"; then
    sleep 2 # Rate limiting
  fi
  fetch_trends "$geo" "news_${geo}"
  needs_pause=true
done

# Log what was produced so the job output shows the fetched files.
printf '%s\n' "Current directory contents:"
ls -la
printf '%s\n' "File sizes:"
wc -l news_*
# Derive case, spacing, punctuation and accent variants of each title list.
- name: Process and transform data
  run: |
    printf '%s\n' "Processing news data..."
#######################################
# Write variant copies of a word list next to it. With <base> being the
# input path minus a trailing ".ext" on its file name, the outputs are:
#   <base>_lower     lower-cased lines
#   <base>_upper     upper-cased lines
#   <base>_nospaces  spaces removed
#   <base>_nodash    "-" removed
#   <base>_quote     ASCII apostrophes removed
#   <base>_noaccent  the input followed by _lower, _quote, _upper, _nodash
#                    and _nospaces, transliterated from UTF-8 to ASCII
# Arguments:
#   $1 - path of the text file to process
# Outputs:
#   A progress line on stdout; warnings on stderr.
# Returns:
#   1 if the input cannot be read (nothing is written), otherwise 0 even
#   when the accent conversion reports a failure.
#######################################
process_file() {
  local input=$1
  local dir="" name base

  # Bail out before any redirection creates empty variant files.
  if [[ ! -r "$input" ]]; then
    echo "Warning: cannot read $input, skipping" >&2
    return 1
  fi

  # Strip the extension from the file name only. Applying ${input%.*} to the
  # whole path would cut at a dot inside a directory component, e.g.
  # "./news_FR" would produce an empty base and "data.d/x" would give "data".
  name=${input##*/}
  if [[ "$input" == */* ]]; then
    dir=${input%/*}/
  fi
  case "$name" in
    ?*.*) name=${name%.*} ;; # a leading dot alone is not an extension
  esac
  base=${dir}${name}

  echo "Processing $input..."

  # Create variations
  awk '{print tolower($0)}' "$input" > "${base}_lower"
  awk '{print toupper($0)}' "$input" > "${base}_upper"
  sed "s/ //g" "$input" > "${base}_nospaces"
  sed "s/-//g" "$input" > "${base}_nodash"
  sed "s/'//g" "$input" > "${base}_quote"

  # Handle accents. iconv diagnostics are discarded on purpose; a failure is
  # reported as a single warning and is not fatal.
  if ! iconv -f UTF-8 -t ASCII//TRANSLIT \
    "$input" "${base}_lower" "${base}_quote" "${base}_upper" \
    "${base}_nodash" "${base}_nospaces" \
    > "${base}_noaccent" 2>/dev/null; then
    echo "Warning: Some accent conversions failed" >&2
  fi
}
# Generate the variant files for every country's title list.
process_file "news_FR"
process_file "news_BE"
process_file "news_CH"

# Log the resulting files and their line counts.
printf '%s\n' "Current directory contents:"
ls -la
printf '%s\n' "File sizes:"
wc -l news_*
# Merge every file matching news* into one bytewise-sorted, duplicate-free
# list named "news", then delete the intermediate files.
- name: Combine and deduplicate
  run: |
    printf '%s\n' "Combining and deduplicating news..."
    # The glob also picks up a "news" file if one already exists in the
    # working directory, so its lines are carried over into the new list.
    cat news* >> pourquoi
    LC_ALL=C sort -u -o news pourquoi
    printf '%s\n' "Final file statistics:"
    ls -la
    wc -l news*
    # Cleanup temporary files
    rm -f *_lower *_upper *_nospaces *_nodash *_quote *_noaccent
    rm -f pourquoi
# Runs a set of commands using the runner's shell: stage the regenerated
# wordlist, commit it with a dated message and push it to master.
- name: Git commit and push
  run: |
    echo "Committing changes..."
    git config user.name "News-bot"
    git config user.email ""
    git add news
    # `git commit` exits non-zero when nothing is staged, which would mark
    # the scheduled run as failed on days where the wordlist did not change.
    if git diff --cached --quiet; then
      echo "No changes to commit"
    else
      git commit -m "[Bot] News from $(date +'%Y_%m_%d')"
      git push origin master
    fi