An open-source Ruby library for the Reducto REST API. Provides convenient access to document processing features with both synchronous and asynchronous support.
Add this line to your application's Gemfile:
gem 'reducto'

And then execute:
bundle install

Or install it yourself as:
gem install reducto

The library needs to be configured with your account's API key, which is available in your Reducto Dashboard.
Set REDUCTO_API_KEY in your environment:
export REDUCTO_API_KEY='your-api-key-here'

require 'reducto'
# Initialize the client
client = Reducto.new(api_key: ENV['REDUCTO_API_KEY'])
# Parse a document
response = client.parse.run(input: 'https://example.com/document.pdf')
puts response.job_id
puts response.usage.pages

require 'reducto'
client = Reducto.new
# Upload a local file
upload = client.upload(file: File.open('sample.pdf'))
# Parse the uploaded file
result = client.parse.run(input: upload.url)
puts result

Extract structured data from documents using JSON schemas:
# Define extraction schema
schema = {
type: 'object',
properties: {
customer_name: {
type: 'string',
description: 'The full name of the customer'
},
accounts: {
type: 'array',
description: 'List of financial accounts',
items: {
type: 'object',
properties: {
account_type: { type: 'string' },
account_number: { type: 'string' },
ending_value: { type: 'number' }
},
required: ['account_type', 'account_number', 'ending_value']
}
}
},
required: ['customer_name', 'accounts']
}
# Extract data with citations
result = client.extract.run(
input: upload.url,
instructions: {
schema: schema,
system_prompt: 'Be precise and thorough.'
},
settings: {
citations: {
enabled: true
}
}
)
# Access extracted data
puts "Customer: #{result['result']['customer_name']['value']}"
result['result']['accounts']['value'].each do |account|
puts " #{account['account_type']['value']}: $#{account['ending_value']['value']}"
end

# Upload a document
upload = client.upload(file: File.open('contract.docx'))
# Fill in form fields
result = client.edit.run(
document_url: upload.url,
edit_instructions: "Fill in the client name as 'Acme Corporation' and set the contract date to January 15, 2024",
edit_options: {
color: '#0066CC'
}
)
# Download the edited document
edited_url = result['document_url']

For large documents, use async processing:
# Submit async parse job
job_response = client.parse.run_job(
input: 'https://example.com/large-document.pdf',
enhance: {
summarize_figures: true
}
)
job_id = job_response['job_id']
puts "Job ID: #{job_id}"
# Poll for completion
loop do
job = client.job.get(job_id)
status = job['status']
break if status == 'Completed' || status == 'Failed'
puts "Status: #{status}"
sleep 2
end
# Get final result
final_job = client.job.get(job_id)
if final_job['status'] == 'Completed'
result = final_job['result']
puts "Processing complete!"
else
puts "Job failed: #{final_job['error']}"
end

Chain operations efficiently by reusing parsed results:
# Step 1: Parse the document
parse_response = client.parse.run(input: document_url)
job_id = parse_response.job_id
# Step 2: Classify document type (reuses parsed data)
classification = client.extract.run(
input: "jobid://#{job_id}", # Reference the parsed job
instructions: {
schema: {
type: 'object',
properties: {
document_type: {
type: 'string',
enum: ['W2', 'Passport', 'Invoice', 'Other']
}
}
}
}
)
document_type = classification['result']['document_type']['value']
# Step 3: Extract with type-specific schema
schema = case document_type
when 'W2'
{ type: 'object', properties: { total_wages: { type: 'number' }, ... } }
when 'Invoice'
{ type: 'object', properties: { total_amount: { type: 'number' }, ... } }
end
# Extract using the same parsed job (saves processing time and credits)
extract_response = client.extract.run(
input: "jobid://#{job_id}",
instructions: { schema: schema }
)

Configure webhooks to receive notifications when jobs complete:
# Step 1: Configure webhook (one-time setup)
webhook_config = client.webhook.run
puts "Configure webhook at: #{webhook_config['url']}"
# Step 2: Submit jobs with webhook
result = client.parse.run_job(
input: 'https://example.com/document.pdf',
async: {
webhook: {
mode: 'svix', # Managed webhook delivery
channels: [] # Or specify specific channels
}
}
)
# Direct webhook (simpler, no Svix required)
result = client.parse.run_job(
input: 'https://example.com/document.pdf',
async: {
webhook: {
mode: 'direct',
url: 'https://your-app.com/webhooks/reducto'
}
}
)

Process multiple documents concurrently (requires Ruby >= 3.1):
require 'reducto/async_client'
require 'async'
client = Reducto::AsyncClient.new
# Process documents concurrently
documents = Dir.glob('docs/**/*.pdf')[0...100]
max_concurrency = 50
Async do |task|
semaphore = Async::Semaphore.new(max_concurrency)
results = documents.map do |path|
task.async do
semaphore.async do
# Upload and parse
upload = client.upload(file: File.open(path)).wait
result = client.parse.run(input: upload.url).wait
# Save result
output_path = path.sub('.pdf', '.reducto.json')
File.write(output_path, result.to_json)
{ path: path, chunks: result.result.chunks.length }
end
end
end
completed = results.map(&:wait)
puts "Processed #{completed.length} documents"
end
client.close

# Production (default)
client = Reducto.new(environment: 'production')
# EU
client = Reducto.new(environment: 'eu')
# Australia
client = Reducto.new(environment: 'au')
# Custom base URL
client = Reducto.new(base_url: 'https://custom.reducto.ai')

# Default timeout is 1 hour (3600 seconds)
client = Reducto.new(timeout: 120) # 2 minutes

# Default max retries is 2
client = Reducto.new(max_retries: 5)

Access raw HTTP responses including headers:
response = client.parse.run(
input: 'https://example.com/document.pdf',
raw_response: true
)
puts response.status
puts response.headers
parsed_data = response.parse # Get the parsed object

Stream response data for server-sent events:
stream = client.parse.run(
input: 'https://example.com/document.pdf',
streaming: true
)
stream.each do |event|
puts "Event: #{event.data}"
end
stream.close

Response objects are typed using BaseModel (similar to Pydantic):
response = client.parse.run(input: 'document.pdf')
# Access typed fields
response.job_id # String
response.duration # Float
response.usage.pages # Integer
response.result.chunks # Array
# Serialize to JSON
response.to_json
# Convert to hash
response.to_hash
# Check which fields are set
response.model_fields_set # Returns Set of field names
# Access extra/undocumented fields
response.model_extra # Returns Hash of extra fields

Use model_fields_set to distinguish a field that was explicitly null from one that was absent:

if response.pdf_url.nil?
if response.model_fields_set.include?(:pdf_url)
puts 'pdf_url was explicitly null'
else
puts 'pdf_url was not present in the response'
end
end

The library raises exceptions for API errors:
begin
response = client.parse.run(input: 'invalid-url')
rescue Reducto::AuthenticationError => e
puts "Authentication failed: #{e.message}"
rescue Reducto::RateLimitError => e
puts "Rate limit exceeded: #{e.message}"
rescue Reducto::BadRequestError => e
puts "Bad request: #{e.message}"
rescue Reducto::APIConnectionError => e
puts "Connection error: #{e.message}"
rescue Reducto::APITimeoutError => e
puts "Request timed out: #{e.message}"
rescue Reducto::APIStatusError => e
puts "API error (#{e.status_code}): #{e.message}"
end

- Reducto::Error - Base error class
- Reducto::APIConnectionError - Network/connection errors
- Reducto::APITimeoutError - Timeout errors
- Reducto::APIStatusError - HTTP status errors
- Reducto::BadRequestError (400)
- Reducto::AuthenticationError (401)
- Reducto::PermissionDeniedError (403)
- Reducto::NotFoundError (404)
- Reducto::ConflictError (409)
- Reducto::UnprocessableEntityError (422)
- Reducto::RateLimitError (429)
- Reducto::InternalServerError (5xx)
Parse documents into structured chunks:
# Synchronous parsing
response = client.parse.run(
input: 'https://example.com/document.pdf',
enhance: {
summarize_figures: true
},
formatting: {
markdown: true
},
retrieval: {
embedding_optimized: true
}
)
# Asynchronous parsing
job = client.parse.run_job(input: document_url, async: { webhook: {...} })

Extract structured data with schemas:
result = client.extract.run(
input: document_url,
instructions: {
schema: {...},
system_prompt: 'Be precise'
},
settings: {
citations: { enabled: true }
}
)

Fill forms and edit documents:
result = client.edit.run(
document_url: document_url,
edit_instructions: 'Fill in the form fields',
edit_options: { color: '#0000FF' }
)

Split documents into sections:
result = client.split.run(
input: document_url,
split_description: [
{ name: 'section1', description: 'First section' }
],
split_rules: 'Split by major sections'
)

Run custom pipelines:
result = client.pipeline.run(
input: document_url,
pipeline_id: 'your-pipeline-id'
)

# Get job status
job = client.job.get(job_id)
# List all jobs
jobs = client.job.get_all(limit: 10, cursor: 'next_page_token')
# Cancel a job
client.job.cancel(job_id)

# Configure webhook
webhook = client.webhook.run

# Upload a file
upload = client.upload(file: File.open('document.pdf'), extension: 'pdf')
# Get API version
version = client.api_version

See the examples/ directory for complete working examples:
- basic_usage.rb - Basic parsing and uploading
- structured_extraction.rb - JSON schema extraction
- document_editing.rb - Form filling and editing
- multi_step_workflow.rb - Chained operations
- job_polling.rb - Async job polling patterns
- webhook_setup.rb - Webhook configuration
- batch_processing.rb - Concurrent processing
- async_usage.rb - AsyncClient examples
- response_wrappers.rb - Raw and streaming responses
- Ruby >= 2.7.0
- For async functionality (AsyncClient): Ruby >= 3.1.0 and the async and async-http gems
After checking out the repo, run bundle install to install dependencies. Then, run bundle exec rspec to run the tests.
To install this gem onto your local machine, run bundle exec rake install.
Bug reports and pull requests are welcome on GitHub at https://github.com/databodega-io/reducto-ruby-sdk.
The gem is available as open source under the terms of the Apache-2.0 License.
This Ruby SDK is inspired by the official Reducto Python SDK.