0% found this document useful (0 votes)
65 views3 pages

Scribd Document Downloader Script

This Python script extracts text and images from Scribd documents by parsing the document HTML, finding JSON API endpoints, and saving the content in text files or image files based on whether the document contains text or images. It takes a Scribd document URL and flag as arguments, gets the document title, sanitizes it for the file system, then loops through the JavaScript to find JSON endpoints, calls functions to save either text or images to files with the title and page number.

Uploaded by

afjfasjjda
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
65 views3 pages

Scribd Document Downloader Script

This Python script extracts text and images from Scribd documents by parsing the document HTML, finding JSON API endpoints, and saving the content in text files or image files based on whether the document contains text or images. It takes a Scribd document URL and flag as arguments, gets the document title, sanitizes it for the file system, then loops through the JavaScript to find JSON endpoints, calls functions to save either text or images to files with the title and page number.

Uploaded by

afjfasjjda
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 3

#!/usr/bin/env python

from bs4 import BeautifulSoup


import requests
import shutil
import sys
import argparse

def get_arguments():
    """Parse command-line arguments.

    Returns:
        argparse.Namespace with ``doc`` (the Scribd document URL, required
        positional) and ``images`` (True when -i/--images was passed,
        meaning the document is made of page images rather than text).
    """
    parser = argparse.ArgumentParser(
        description='A Scribd-Downloader that actually works')

    parser.add_argument(
        'doc',
        metavar='DOC',
        type=str,
        help='scribd document to download')
    parser.add_argument(
        '-i',
        '--images',
        help="download document made up of images",
        action='store_true',
        default=False)

    return parser.parse_args()

# fix encoding issues in python2


def fix_encoding(query):
    """Return *query* as-is on Python 3; UTF-8 encode it on Python 2.

    Python 2's print chokes on non-ASCII unicode without an explicit
    encode; Python 3 strings need no conversion.
    """
    if sys.version_info > (3, 0):
        return query
    return query.encode('utf-8')

def save_image(jsonp, imagename):
    """Download one page image and write it to *imagename*.

    Scribd serves the page's JPEG at the same URL as its jsonp endpoint
    with '/pages/' swapped for '/images/' and the 'jsonp' extension
    swapped for 'jpg'; stream it straight to disk.
    """
    replacement = jsonp.replace('/pages/', '/images/').replace('jsonp', 'jpg')
    response = requests.get(replacement, stream=True)

    with open(imagename, 'wb') as out_file:
        shutil.copyfileobj(response.raw, out_file)

def save_text(jsonp, filename):
    """Fetch one page's jsonp payload and append its text to *filename*.

    The payload looks like ``window.pageN_callback(["<html ...>"]);``.
    Strip that wrapper, parse the remaining HTML, and append each text
    span (``<span class="a">``) as one line, echoing it to stdout.
    """
    response = requests.get(url=jsonp).text

    # Page number sits between 'window.page' (11 chars) and '_callback'.
    # Slicing up to '_callback' handles multi-digit pages; the old
    # response[11:12] grabbed a single character and broke on page 10+.
    page_no = response[11:response.index('_callback')]

    response_head = response.replace(
        'window.page' + page_no + '_callback(["',
        '').replace('\\n', '').replace('\\', '').replace('"]);', '')
    soup_content = BeautifulSoup(response_head, 'html.parser')

    # Open once and append every span, instead of reopening per line.
    with open(filename, 'a') as feed:
        for x in soup_content.find_all('span', {'class': 'a'}):
            xtext = fix_encoding(x.get_text())
            print(xtext)
            feed.write(xtext + '\n')

# detect image and text


def save_content(jsonp, images, train, title):
    """Dispatch one jsonp URL to the image or text saver.

    Empty *jsonp* strings are skipped. Images are numbered with *train*
    (``<title>_<train>.jpg``); text pages all append to ``<title>.txt``.

    Returns:
        The counter for the next page (incremented only when a non-empty
        jsonp was processed).
    """
    if jsonp != '':
        if images:
            imagename = title + '_' + str(train) + '.jpg'
            print('Downloading image to ' + imagename)
            save_image(jsonp, imagename)
        else:
            save_text(jsonp, (title + '.txt'))
        train += 1

    return train

def sanitize_title(title):
    '''Replace characters that would prevent the OS (Windows at least)
    from creating the file with underscores.

    Spaces are also mapped to '_' to preserve the script's historical
    file-naming behavior.
    '''
    # '?' is also forbidden on Windows and was missing from the original
    # list; the literal backslash and '<' come from the original "\<".
    forbidden_chars = ' *"/\\<>:|?'
    # One C-level pass instead of chained .replace() calls.
    table = str.maketrans(forbidden_chars, '_' * len(forbidden_chars))

    return title.translate(table)

# the main function


def get_scribd_document(url, images):
    """Download a whole Scribd document.

    Fetches the document page at *url*, derives a filesystem-safe title,
    then scans every inline <script> for ``https://...jsonp`` page
    endpoints and hands each one to save_content(). With *images* True
    pages are saved as JPEGs; otherwise text is appended to
    ``<title>.txt``.
    """
    response = requests.get(url=url).text
    soup = BeautifulSoup(response, 'html.parser')

    title = soup.find('title').get_text()
    title = sanitize_title(title)  # strip characters the OS forbids

    if not images:
        print('Extracting text to ' + title + '.txt\n')

    print(title + '\n')

    js_text = soup.find_all('script', type='text/javascript')

    train = 1  # image counter, threaded through save_content
    for opening in js_text:
        for inner_opening in opening:
            # Each page is referenced by an https://...jsonp URL
            # embedded in the script text.
            portion1 = inner_opening.find('https://')
            if portion1 != -1:
                portion2 = inner_opening.find('.jsonp')
                jsonp = inner_opening[portion1:portion2 + 6]
                train = save_content(jsonp, images, train, title)


def command_line():
    """Entry point: parse CLI arguments and start the download."""
    args = get_arguments()
    url = args.doc
    images = args.images
    get_scribd_document(url, images)


if __name__ == '__main__':
    command_line()

You might also like