bolivar

Fast PDF text and table extraction, written in Rust and drop-in compatible with pdfminer and pdfplumber.

Install

# Python: install with pip
pip install bolivar

// JVM: Gradle dependency
implementation("sa.ingenious:bolivar:1.2.0")

;; Clojure: deps.edn dependency coordinate
sa.ingenious/bolivar {:mvn/version "1.2.0"}

# Rust: Cargo.toml dependency
[dependencies]
bolivar-core = "1.2"

Extract text

Pull all text from a PDF in one call. The pdfplumber interface opens the file and iterates pages; the pdfminer interface returns the full text directly. The JVM and Rust APIs follow the same pattern, adapted to each language's conventions.

# Python, pdfplumber-compatible interface: open the file, then print the text of each page.
import pdfplumber

with pdfplumber.open("doc.pdf") as pdf:
    for page in pdf.pages:
        print(page.extract_text())

# Python, pdfminer-compatible interface: get the text of the whole document in one call.
from pdfminer.high_level import extract_text

text = extract_text("doc.pdf")

// Java: extract text with explicit options.
import sa.ingenious.pdf.Document;
import sa.ingenious.pdf.DocumentOptions;

// Options are assembled with a builder; layout settings are configured through a lambda.
// NOTE(review): maxPages(1) presumably limits extraction to a single page — confirm.
var options = DocumentOptions.builder()
    .maxPages(1)
    .layout(layout -> layout.lineMargin(0.5).wordMargin(0.1))
    .build();

String text = Document.extractText("doc.pdf", options);

// Kotlin: extract text; options are set inside the trailing configuration block.
import sa.ingenious.pdf.extractText

val text = extractText("doc.pdf") {
    maxPages = 1
    layout {
        lineMargin = 0.5
        wordMargin = 0.1
    }
}

;; Clojure: extract the document text; no options map is passed here.
(require '[sa.ingenious.pdf :as pdf])

(def text (pdf/extract-text "doc.pdf"))

use bolivar_core::high_level::extract_text;

/// Reads `doc.pdf` into memory, extracts its text with `extract_text`, and prints it.
///
/// `None` is passed as the second argument of `extract_text`
/// (NOTE(review): presumably "use default options" — confirm against the crate docs).
/// Any I/O or extraction error is propagated to the caller via `?`.
fn main() -> bolivar_core::Result<()> {
    let pdf_bytes = std::fs::read("doc.pdf")?;
    println!("{}", extract_text(&pdf_bytes, None)?);
    Ok(())
}

Extract tables

Detect and extract tabular data from each page. Bolivar returns structured tables with row and column counts, bounding boxes, and cell text so you can inspect or export them without manual parsing.

# Python, pdfplumber-compatible interface: print every table found on every page.
import pdfplumber

with pdfplumber.open("doc.pdf") as pdf:
    for page in pdf.pages:
        for table in page.extract_tables():
            print(table)

// Java: open the document with options, then print each table's size as "<rows>x<columns>".
import sa.ingenious.pdf.Document;
import sa.ingenious.pdf.DocumentOptions;

// NOTE(review): pages(1, 2) presumably restricts processing to pages 1 and 2 — confirm.
var options = DocumentOptions.builder().pages(1, 2).build();
// try-with-resources closes the document when the block exits.
try (Document doc = Document.open("doc.pdf", options)) {
    for (var table : doc.extractTables()) {
        System.out.println(table.rowCount() + "x" + table.columnCount());
    }
}

// Kotlin: open the document with options, then print each table's size as "<rows>x<columns>".
import sa.ingenious.pdf.openDocument

val doc = openDocument("doc.pdf") {
    pages(1, 2)
}
// `use` runs the block with the document as `it`.
// NOTE(review): presumably this also closes the document afterwards — confirm.
doc.use {
    for (table in it.extractTables()) {
        println("${table.rowCount}x${table.columnCount}")
    }
}

;; Clojure: open the document with an options map and print each table's row and column counts.
;; with-open closes the document when the body finishes.
(require '[sa.ingenious.pdf :as pdf])

(with-open [doc (pdf/open "doc.pdf" {:pages [1 2]})]
  (doseq [table (pdf/tables doc)]
    (println (:row-count table) "x" (:column-count table))))

use bolivar_core::high_level::{extract_tables_with_document, ExtractOptions};
use bolivar_core::pdfdocument::PDFDocument;
use bolivar_core::table::TableSettings;

/// Reads `doc.pdf` into memory, opens it as a `PDFDocument`, and extracts its tables
/// using default `ExtractOptions` and default `TableSettings`.
///
/// Any I/O, parsing, or extraction error is propagated to the caller via `?`.
fn main() -> bolivar_core::Result<()> {
    let data = std::fs::read("doc.pdf")?;
    // NOTE(review): the second argument is an empty string — presumably the document
    // password; confirm against the crate docs.
    let doc = PDFDocument::new(&data, "")?;
    let tables = extract_tables_with_document(
        &doc,
        ExtractOptions::default(),
        &TableSettings::default(),
    )?;
    // `tables` is not used further in this example; inspect or export the result here.
    Ok(())
}

Iterate pages

Walk through pages one at a time to read metadata like page number, dimensions, and a text preview. This is useful when you need to locate content across a large document before extracting specific pages.

# Python, pdfplumber-compatible interface: print each page's number, width, and height.
import pdfplumber

with pdfplumber.open("doc.pdf") as pdf:
    for page in pdf.pages:
        print(page.page_number, page.width, page.height)

# Python, pdfminer-compatible interface: print each page's id, width, and height.
from pdfminer.high_level import extract_pages

for page in extract_pages("doc.pdf"):
    print(page.pageid, page.width, page.height)

// Java: print each page's number followed by at most the first 80 characters of its text.
import sa.ingenious.pdf.Document;
import sa.ingenious.pdf.DocumentOptions;

// NOTE(review): maxPages(3) presumably limits processing to three pages — confirm.
var options = DocumentOptions.builder().maxPages(3).build();
// try-with-resources closes the document when the block exits.
try (Document doc = Document.open("doc.pdf", options)) {
    for (var page : doc.extractPageSummaries()) {
        // Math.min guards against pages whose text is shorter than 80 characters.
        System.out.println(page.pageNumber() + ": " + page.text().substring(0, Math.min(80, page.text().length())));
    }
}

// Kotlin: print each page's number followed by at most the first 80 characters of its text.
import sa.ingenious.pdf.openDocument

val doc = openDocument("doc.pdf") {
    maxPages = 3
}
// `use` runs the block with the document as `it`.
// NOTE(review): presumably this also closes the document afterwards — confirm.
doc.use {
    for (page in it.extractPageSummaries()) {
        println("${page.pageNumber}: ${page.text.take(80)}")
    }
}

;; Clojure: print each page's number followed by at most the first 80 characters of its text.
;; with-open closes the document when the body finishes.
(require '[sa.ingenious.pdf :as pdf])

(with-open [doc (pdf/open "doc.pdf" {:max-pages 3})]
  (doseq [page (pdf/page-summaries doc)]
    ;; `min` keeps `subs` in range when the page text is shorter than 80 characters.
    (println (:page-number page) (subs (:text page) 0 (min 80 (count (:text page)))))))

use bolivar_core::high_level::extract_pages;

/// Reads `doc.pdf` into memory and prints the `pageid` of every page yielded by
/// `extract_pages`.
///
/// Each yielded item is itself a `Result`, so a failure on an individual page is
/// propagated with `?` just like a failure to read or open the document.
fn main() -> bolivar_core::Result<()> {
    let pdf_bytes = std::fs::read("doc.pdf")?;
    let pages = extract_pages(&pdf_bytes, None)?;
    for entry in pages {
        println!("{}", entry?.pageid);
    }
    Ok(())
}

Async (Python)

Run extraction off the main thread in Python while keeping the same pdfplumber API.

# Python, async usage of the pdfplumber-compatible interface: print every table on every page.
import asyncio

import pdfplumber


async def main():
    # `async with` is only valid inside a coroutine; at module level in a regular
    # script it is a SyntaxError, so the example is wrapped in `main()`.
    async with pdfplumber.open("doc.pdf") as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                print(table)


# Start an event loop and run the coroutine to completion.
asyncio.run(main())

License

MIT

Name		Name	Last commit message	Last commit date
Latest commit History 354 Commits
.cargo		.cargo
.github		.github
benchmarks		benchmarks
crates		crates
references		references
samples/contrib		samples/contrib
scripts		scripts
tests		tests
.editorconfig		.editorconfig
.gitignore		.gitignore
.gitmodules		.gitmodules
.pre-commit-config.yaml		.pre-commit-config.yaml
.releaserc.yml		.releaserc.yml
CHANGELOG.md		CHANGELOG.md
CONTRIBUTING.md		CONTRIBUTING.md
Cargo.lock		Cargo.lock
Cargo.toml		Cargo.toml
LICENSE		LICENSE
Makefile.toml		Makefile.toml
README.md		README.md
package-lock.json		package-lock.json
package.json		package.json
pyproject.toml		pyproject.toml
rust-toolchain.toml		rust-toolchain.toml
uv.lock		uv.lock

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

bolivar

Install

Extract text

Extract tables

Iterate pages

Async (Python)

License

About

Uh oh!

Releases 12

Packages

Uh oh!

Uh oh!

Contributors

Uh oh!

Languages

Folders and files

Latest commit

History

Repository files navigation

bolivar

Install

Extract text

Extract tables

Iterate pages

Async (Python)

License

About

Topics

Resources

License

Contributing

Uh oh!

Stars

Watchers

Forks

Releases 12

Packages 0

Uh oh!

Uh oh!

Contributors

Uh oh!

Languages

Packages