0% found this document useful (0 votes)
65 views3 pages

Scribd Document Downloader Script

This Python script extracts text and images from Scribd documents by parsing the document HTML, finding JSON API endpoints, and saving the content in text files or image files based on whether the document contains text or images. It takes a Scribd document URL and flag as arguments, gets the document title, sanitizes it for the file system, then loops through the JavaScript to find JSON endpoints, calls functions to save either text or images to files with the title and page number.

Uploaded by

afjfasjjda
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
65 views3 pages

Scribd Document Downloader Script

This Python script extracts text and images from Scribd documents by parsing the document HTML, finding JSON API endpoints, and saving the content in text files or image files based on whether the document contains text or images. It takes a Scribd document URL and flag as arguments, gets the document title, sanitizes it for the file system, then loops through the JavaScript to find JSON endpoints, calls functions to save either text or images to files with the title and page number.

Uploaded by

afjfasjjda
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 3

#!/usr/bin/env python

from bs4 import BeautifulSoup


import requests
import shutil
import sys
import argparse

def get_arguments():
    """Parse command-line arguments.

    Returns:
        argparse.Namespace with ``doc`` (the Scribd document URL, required
        positional) and ``images`` (True when -i/--images was passed,
        meaning the document is made of page images rather than text).
    """
    parser = argparse.ArgumentParser(
        description='A Scribd-Downloader that actually works')

    parser.add_argument(
        'doc',
        metavar='DOC',
        type=str,
        help='scribd document to download')
    parser.add_argument(
        '-i',
        '--images',
        help="download document made up of images",
        action='store_true',
        default=False)

    return parser.parse_args()

# fix encoding issues in python2


def fix_encoding(query):
    """Return *query* as-is on Python 3; UTF-8 encode it on Python 2.

    Python 2's print chokes on non-ASCII unicode without an explicit
    encode; Python 3 strings need no conversion.
    """
    if sys.version_info > (3, 0):
        return query
    return query.encode('utf-8')

def save_image(jsonp, imagename):
    """Download one page image and write it to *imagename*.

    Scribd serves the page's JPEG at the same URL as its jsonp endpoint
    with '/pages/' swapped for '/images/' and the 'jsonp' extension
    swapped for 'jpg'; stream it straight to disk.
    """
    replacement = jsonp.replace('/pages/', '/images/').replace('jsonp', 'jpg')
    response = requests.get(replacement, stream=True)

    with open(imagename, 'wb') as out_file:
        shutil.copyfileobj(response.raw, out_file)

def save_text(jsonp, filename):
    """Fetch one page's jsonp payload and append its text to *filename*.

    The payload looks like ``window.pageN_callback(["<html ...>"]);``.
    Strip that wrapper, parse the remaining HTML, and append each text
    span (``<span class="a">``) as one line, echoing it to stdout.
    """
    response = requests.get(url=jsonp).text

    # Page number sits between 'window.page' (11 chars) and '_callback'.
    # Slicing up to '_callback' handles multi-digit pages; the old
    # response[11:12] grabbed a single character and broke on page 10+.
    page_no = response[11:response.index('_callback')]

    response_head = response.replace(
        'window.page' + page_no + '_callback(["',
        '').replace('\\n', '').replace('\\', '').replace('"]);', '')
    soup_content = BeautifulSoup(response_head, 'html.parser')

    # Open once and append every span, instead of reopening per line.
    with open(filename, 'a') as feed:
        for x in soup_content.find_all('span', {'class': 'a'}):
            xtext = fix_encoding(x.get_text())
            print(xtext)
            feed.write(xtext + '\n')

# detect image and text


def save_content(jsonp, images, train, title):
    """Dispatch one jsonp URL to the image or text saver.

    Empty *jsonp* strings are skipped. Images are numbered with *train*
    (``<title>_<train>.jpg``); text pages all append to ``<title>.txt``.

    Returns:
        The counter for the next page (incremented only when a non-empty
        jsonp was processed).
    """
    if jsonp != '':
        if images:
            imagename = title + '_' + str(train) + '.jpg'
            print('Downloading image to ' + imagename)
            save_image(jsonp, imagename)
        else:
            save_text(jsonp, (title + '.txt'))
        train += 1

    return train

def sanitize_title(title):
    '''Replace characters that would prevent the OS (Windows at least)
    from creating the file with underscores.

    Spaces are also mapped to '_' to preserve the script's historical
    file-naming behavior.
    '''
    # '?' is also forbidden on Windows and was missing from the original
    # list; the literal backslash and '<' come from the original "\<".
    forbidden_chars = ' *"/\\<>:|?'
    # One C-level pass instead of chained .replace() calls.
    table = str.maketrans(forbidden_chars, '_' * len(forbidden_chars))

    return title.translate(table)

# the main function


def get_scribd_document(url, images):
    """Download a whole Scribd document.

    Fetches the document page at *url*, derives a filesystem-safe title,
    then scans every inline <script> for ``https://...jsonp`` page
    endpoints and hands each one to save_content(). With *images* True
    pages are saved as JPEGs; otherwise text is appended to
    ``<title>.txt``.
    """
    response = requests.get(url=url).text
    soup = BeautifulSoup(response, 'html.parser')

    title = soup.find('title').get_text()
    title = sanitize_title(title)  # strip characters the OS forbids

    if not images:
        print('Extracting text to ' + title + '.txt\n')

    print(title + '\n')

    js_text = soup.find_all('script', type='text/javascript')

    train = 1  # image counter, threaded through save_content
    for opening in js_text:
        for inner_opening in opening:
            # Each page is referenced by an https://...jsonp URL
            # embedded in the script text.
            portion1 = inner_opening.find('https://')
            if portion1 != -1:
                portion2 = inner_opening.find('.jsonp')
                jsonp = inner_opening[portion1:portion2 + 6]
                train = save_content(jsonp, images, train, title)


def command_line():
    """Entry point: parse CLI arguments and start the download."""
    args = get_arguments()
    url = args.doc
    images = args.images
    get_scribd_document(url, images)


if __name__ == '__main__':
    command_line()

You might also like