Skip to content

Commit c863efc

Browse files
authored
pdf_text: add raw argument for page::raw_order_layout (#138)
1 parent 92076d2 commit c863efc

File tree

6 files changed

+16
-11
lines changed

6 files changed

+16
-11
lines changed

R/RcppExports.R

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -17,8 +17,8 @@ poppler_pdf_data <- function(x, get_font_info, opw, upw) {
1717
.Call('_pdftools_poppler_pdf_data', PACKAGE = 'pdftools', x, get_font_info, opw, upw)
1818
}
1919

20-
poppler_pdf_text <- function(x, opw, upw) {
21-
.Call('_pdftools_poppler_pdf_text', PACKAGE = 'pdftools', x, opw, upw)
20+
poppler_pdf_text <- function(x, opw, upw, raw = FALSE) {
21+
.Call('_pdftools_poppler_pdf_text', PACKAGE = 'pdftools', x, opw, upw, raw)
2222
}
2323

2424
poppler_pdf_pagesize <- function(x, opw, upw) {

R/tools.R

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,10 @@ pdf_info <- function(pdf, opw = "", upw = "") {
4141
}
4242

4343
#' @rdname pdftools
44+
#' @param raw if TRUE, text is kept in content stream order (raw order layout) instead of the physical layout. Default: FALSE.
4445
#' @export
45-
pdf_text <- function(pdf, opw = "", upw = "") {
46-
poppler_pdf_text(loadfile(pdf), opw, upw)
46+
pdf_text <- function(pdf, opw = "", upw = "", raw = FALSE) {
47+
poppler_pdf_text(loadfile(pdf), opw, upw, raw)
4748
}
4849

4950
#' @rdname pdftools

man/pdftools.Rd

Lines changed: 3 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/RcppExports.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,15 +59,16 @@ BEGIN_RCPP
5959
END_RCPP
6060
}
6161
// poppler_pdf_text
62-
CharacterVector poppler_pdf_text(RawVector x, std::string opw, std::string upw);
63-
RcppExport SEXP _pdftools_poppler_pdf_text(SEXP xSEXP, SEXP opwSEXP, SEXP upwSEXP) {
62+
CharacterVector poppler_pdf_text(RawVector x, std::string opw, std::string upw, bool raw);
63+
RcppExport SEXP _pdftools_poppler_pdf_text(SEXP xSEXP, SEXP opwSEXP, SEXP upwSEXP, SEXP rawSEXP) {
6464
BEGIN_RCPP
6565
Rcpp::RObject rcpp_result_gen;
6666
Rcpp::RNGScope rcpp_rngScope_gen;
6767
Rcpp::traits::input_parameter< RawVector >::type x(xSEXP);
6868
Rcpp::traits::input_parameter< std::string >::type opw(opwSEXP);
6969
Rcpp::traits::input_parameter< std::string >::type upw(upwSEXP);
70-
rcpp_result_gen = Rcpp::wrap(poppler_pdf_text(x, opw, upw));
70+
Rcpp::traits::input_parameter< bool >::type raw(rawSEXP);
71+
rcpp_result_gen = Rcpp::wrap(poppler_pdf_text(x, opw, upw, raw));
7172
return rcpp_result_gen;
7273
END_RCPP
7374
}
@@ -175,7 +176,7 @@ static const R_CallMethodDef CallEntries[] = {
175176
{"_pdftools_get_poppler_config", (DL_FUNC) &_pdftools_get_poppler_config, 0},
176177
{"_pdftools_poppler_pdf_info", (DL_FUNC) &_pdftools_poppler_pdf_info, 3},
177178
{"_pdftools_poppler_pdf_data", (DL_FUNC) &_pdftools_poppler_pdf_data, 4},
178-
{"_pdftools_poppler_pdf_text", (DL_FUNC) &_pdftools_poppler_pdf_text, 3},
179+
{"_pdftools_poppler_pdf_text", (DL_FUNC) &_pdftools_poppler_pdf_text, 4},
179180
{"_pdftools_poppler_pdf_pagesize", (DL_FUNC) &_pdftools_poppler_pdf_pagesize, 3},
180181
{"_pdftools_poppler_pdf_fonts", (DL_FUNC) &_pdftools_poppler_pdf_fonts, 3},
181182
{"_pdftools_poppler_pdf_files", (DL_FUNC) &_pdftools_poppler_pdf_files, 3},

src/bindings.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,13 +260,13 @@ List poppler_pdf_data (RawVector x, bool get_font_info, std::string opw, std::st
260260
}
261261

262262
// [[Rcpp::export]]
263-
CharacterVector poppler_pdf_text (RawVector x, std::string opw, std::string upw) {
263+
CharacterVector poppler_pdf_text (RawVector x, std::string opw, std::string upw, bool raw = false) {
264264
std::unique_ptr<poppler::document> doc(read_raw_pdf(x, opw, upw));
265265
CharacterVector out(doc->pages());
266266
for(int i = 0; i < doc->pages(); i++){
267267
std::unique_ptr<poppler::page> p(doc->create_page(i));
268268
if(!p) continue; //missing page
269-
page::text_layout_enum show_text_layout = page::physical_layout;
269+
page::text_layout_enum show_text_layout = raw ? page::raw_order_layout : page::physical_layout;
270270

271271
/* media_box includes text in margins: https://github.com/ropensci/pdftools/issues/67 */
272272
rectf target(p->page_rect(media_box));

tests/testthat/test-reading.R

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ test_that("reading password protected pdf", {
1212

1313
# Get text with password
1414
expect_equal(4, length(pdf_text("pdf-example-password.original.pdf", upw = "test")))
15+
expect_equal(4, length(pdf_text("pdf-example-password.original.pdf", upw = "test", raw = TRUE)))
1516
expect_false(pdf_info("pdf-example-password.original.pdf", upw = "test")$locked)
1617

1718
# Reading 'encrypted' file

0 commit comments

Comments
 (0)