PDF to Text / Python Examples

This page contains various examples of using the PDF to Text API in Python. The examples are complete and fully functional. Read more about how to convert PDF to Text in Python.

Basic examples

PDF file to text file
PDF file to in-memory text
PDF file to text stream
PDF url to text file
PDF url to in-memory text
PDF url to text stream
In-memory PDF to text file
In-memory PDF to in-memory text
In-memory PDF to text stream
Get info about the current conversion

Django examples

PDF file to text in Django
PDF url to text in Django
In-memory PDF to text in Django

Flask examples

PDF file to text in Flask
PDF url to text in Flask
In-memory PDF to text in Flask

Basic examples

PDF file to text file

import pdfcrowd
import sys

try:
    # Instantiate the PDF to Text API client.
    client = pdfcrowd.PdfToTextClient('demo', 'demo')

    # Convert the input PDF file and save the result to the output file.
    source_pdf = '/path/to/invoice.pdf'
    target_txt = 'invoice.txt'
    client.convertFileToFile(source_pdf, target_txt)

except pdfcrowd.Error as err:
    # Report the failure on stderr, then let the exception propagate.
    message = 'PDFCrowd Error: {}\n'.format(err)
    sys.stderr.write(message)
    raise

PDF file to in-memory text

import pdfcrowd
import sys

try:
    # Instantiate the PDF to Text API client.
    client = pdfcrowd.PdfToTextClient('demo', 'demo')

    # Convert the input PDF file; the result is kept in the `txt` variable.
    source_pdf = '/path/to/invoice.pdf'
    txt = client.convertFile(source_pdf)

    # From here on `txt` holds the raw TXT data, ready to be sent in an
    # HTTP response, saved to a file, etc.

except pdfcrowd.Error as err:
    # Report the failure on stderr, then let the exception propagate.
    message = 'PDFCrowd Error: {}\n'.format(err)
    sys.stderr.write(message)
    raise

PDF file to text stream

import pdfcrowd
import sys

try:
    # Create an API client instance.
    client = pdfcrowd.PdfToTextClient('demo', 'demo')

    # Open the output stream in a `with` block so that it is closed even
    # when the conversion raises an error.
    with open('invoice.txt', 'wb') as output_stream:
        # Run the conversion and write the result to the output stream.
        client.convertFileToStream('/path/to/invoice.pdf', output_stream)

except pdfcrowd.Error as why:
    # Report the failure on stderr, then let the exception propagate.
    sys.stderr.write('PDFCrowd Error: {}\n'.format(why))
    raise

PDF url to text file

import pdfcrowd
import sys

try:
    # Instantiate the PDF to Text API client.
    client = pdfcrowd.PdfToTextClient('demo', 'demo')

    # Convert the PDF at the given URL and save the result to the output file.
    source_url = 'https://pdfcrowd.com/static/pdf/apisamples/invoice.pdf'
    target_txt = 'invoice.txt'
    client.convertUrlToFile(source_url, target_txt)

except pdfcrowd.Error as err:
    # Report the failure on stderr, then let the exception propagate.
    message = 'PDFCrowd Error: {}\n'.format(err)
    sys.stderr.write(message)
    raise

PDF url to in-memory text

import pdfcrowd
import sys

try:
    # Instantiate the PDF to Text API client.
    client = pdfcrowd.PdfToTextClient('demo', 'demo')

    # Convert the PDF at the given URL; the result is kept in the `txt` variable.
    source_url = 'https://pdfcrowd.com/static/pdf/apisamples/invoice.pdf'
    txt = client.convertUrl(source_url)

    # From here on `txt` holds the raw TXT data, ready to be sent in an
    # HTTP response, saved to a file, etc.

except pdfcrowd.Error as err:
    # Report the failure on stderr, then let the exception propagate.
    message = 'PDFCrowd Error: {}\n'.format(err)
    sys.stderr.write(message)
    raise

PDF url to text stream

import pdfcrowd
import sys

try:
    # Create an API client instance.
    client = pdfcrowd.PdfToTextClient('demo', 'demo')

    # Open the output stream in a `with` block so that it is closed even
    # when the conversion raises an error.
    with open('invoice.txt', 'wb') as output_stream:
        # Run the conversion and write the result to the output stream.
        client.convertUrlToStream(
            'https://pdfcrowd.com/static/pdf/apisamples/invoice.pdf',
            output_stream)

except pdfcrowd.Error as why:
    # Report the failure on stderr, then let the exception propagate.
    sys.stderr.write('PDFCrowd Error: {}\n'.format(why))
    raise

In-memory PDF to text file

import pdfcrowd
import sys

try:
    # Create an API client instance.
    client = pdfcrowd.PdfToTextClient('demo', 'demo')

    # Load the PDF into memory; the `with` block closes the input file
    # as soon as its content has been read.
    with open('/path/to/hello_world.pdf', 'rb') as pdf_file:
        pdf_data = pdf_file.read()

    # Run the conversion and save the result to a file.
    client.convertRawDataToFile(pdf_data, 'invoice.txt')

except pdfcrowd.Error as why:
    # Report the failure on stderr, then let the exception propagate.
    sys.stderr.write('PDFCrowd Error: {}\n'.format(why))
    raise

In-memory PDF to in-memory text

import pdfcrowd
import sys

try:
    # Create an API client instance.
    client = pdfcrowd.PdfToTextClient('demo', 'demo')

    # Load the PDF into memory; the `with` block closes the input file
    # as soon as its content has been read.
    with open('/path/to/hello_world.pdf', 'rb') as pdf_file:
        pdf_data = pdf_file.read()

    # Run the conversion and store the result in the `txt` variable.
    txt = client.convertRawData(pdf_data)

    # At this point the `txt` variable contains TXT raw data and
    # can be sent in an HTTP response, saved to a file, etc.

except pdfcrowd.Error as why:
    # Report the failure on stderr, then let the exception propagate.
    sys.stderr.write('PDFCrowd Error: {}\n'.format(why))
    raise

In-memory PDF to text stream

import pdfcrowd
import sys

try:
    # Create an API client instance.
    client = pdfcrowd.PdfToTextClient('demo', 'demo')

    # Load the PDF into memory first; the `with` block closes the input
    # file as soon as its content has been read, and the output file is
    # not touched if the input cannot be read.
    with open('/path/to/hello_world.pdf', 'rb') as pdf_file:
        pdf_data = pdf_file.read()

    # Open the output stream in a `with` block so that it is closed even
    # when the conversion raises an error.
    with open('invoice.txt', 'wb') as output_stream:
        # Run the conversion and write the result to the output stream.
        client.convertRawDataToStream(pdf_data, output_stream)

except pdfcrowd.Error as why:
    # Report the failure on stderr, then let the exception propagate.
    sys.stderr.write('PDFCrowd Error: {}\n'.format(why))
    raise

Get info about the current conversion

import pdfcrowd
import sys

try:
    # Instantiate the PDF to Text API client.
    client = pdfcrowd.PdfToTextClient('demo', 'demo')

    # Conversion settings: enable the debug log and set the page break mode.
    client.setDebugLog(True)
    client.setPageBreakMode('default')

    # Convert the input PDF file and save the result to the output file.
    client.convertFileToFile('/path/to/invoice.pdf', 'invoice.txt')

    # Each entry pairs an output template with the client getter that
    # supplies its value; the entries are printed in this order.
    conversion_report = (
        # URL pointing to the debug log for this request.
        ('Debug log url: {}', client.getDebugLogUrl),
        # Number of conversion credits remaining in your account.
        ('Remaining credit count: {}', client.getRemainingCreditCount),
        # Number of credits consumed for this conversion.
        ('Consumed credit count: {}', client.getConsumedCreditCount),
        # Unique identifier assigned to this conversion job.
        ('Job id: {}', client.getJobId),
        # Total number of pages in the output document.
        ('Page count: {}', client.getPageCount),
        # Size of the output data in bytes.
        ('Output size: {}', client.getOutputSize),
    )
    for template, getter in conversion_report:
        print(template.format(getter()))

except pdfcrowd.Error as err:
    # Report the failure on stderr, then let the exception propagate.
    message = 'PDFCrowd Error: {}\n'.format(err)
    sys.stderr.write(message)
    raise

Django examples

PDF file to text in Django

import urllib.parse
from django.http import HttpResponse
from django.views.decorators.http import require_POST
import pdfcrowd

# The recommended method is POST.
@require_POST
def convert(request):
    """Convert a PDF file to text and return it as a file attachment.

    The conversion result is written directly into the HTTP response,
    which is served as ``text/plain`` with the file name ``invoice.txt``.
    If the conversion raises ``pdfcrowd.Error``, a plain-text response
    with the error message and the error's status code is returned
    instead.
    """
    try:
        # Create an API client instance.
        client = pdfcrowd.PdfToTextClient('demo', 'demo')

        # Set HTTP response headers.
        response = HttpResponse(content_type='text/plain')
        response['Cache-Control'] = 'max-age=0'
        response['Accept-Ranges'] = 'none'
        response['Content-Disposition'] = "attachment; filename*=UTF-8''" + urllib.parse.quote('invoice.txt', safe='')

        # Run the conversion and write the result to the output stream.
        client.convertFileToStream('/path/to/invoice.pdf', response)
        return response
    except pdfcrowd.Error as why:
        # Send the error in the HTTP response. The message is converted
        # to text explicitly instead of passing the exception object.
        return HttpResponse(
            str(why), status=why.getStatusCode(), content_type='text/plain')

PDF url to text in Django

import urllib.parse
from django.http import HttpResponse
from django.views.decorators.http import require_POST
import pdfcrowd

# The recommended method is POST.
@require_POST
def convert(request):
    """Convert a PDF at a URL to text and return it as a file attachment.

    The conversion result is written directly into the HTTP response,
    which is served as ``text/plain`` with the file name ``invoice.txt``.
    If the conversion raises ``pdfcrowd.Error``, a plain-text response
    with the error message and the error's status code is returned
    instead.
    """
    try:
        # Create an API client instance.
        client = pdfcrowd.PdfToTextClient('demo', 'demo')

        # Set HTTP response headers.
        response = HttpResponse(content_type='text/plain')
        response['Cache-Control'] = 'max-age=0'
        response['Accept-Ranges'] = 'none'
        response['Content-Disposition'] = "attachment; filename*=UTF-8''" + urllib.parse.quote('invoice.txt', safe='')

        # Run the conversion and write the result to the output stream.
        client.convertUrlToStream('https://pdfcrowd.com/static/pdf/apisamples/invoice.pdf', response)
        return response
    except pdfcrowd.Error as why:
        # Send the error in the HTTP response. The message is converted
        # to text explicitly instead of passing the exception object.
        return HttpResponse(
            str(why), status=why.getStatusCode(), content_type='text/plain')

In-memory PDF to text in Django

import urllib.parse
from django.http import HttpResponse
from django.views.decorators.http import require_POST
import pdfcrowd

# The recommended method is POST.
@require_POST
def convert(request):
    """Convert in-memory PDF data to text and return it as a file attachment.

    The PDF is read from disk into memory, converted, and the result is
    written directly into the HTTP response, which is served as
    ``text/plain`` with the file name ``invoice.txt``. If the conversion
    raises ``pdfcrowd.Error``, a plain-text response with the error
    message and the error's status code is returned instead.
    """
    try:
        # Create an API client instance.
        client = pdfcrowd.PdfToTextClient('demo', 'demo')

        # Set HTTP response headers.
        response = HttpResponse(content_type='text/plain')
        response['Cache-Control'] = 'max-age=0'
        response['Accept-Ranges'] = 'none'
        response['Content-Disposition'] = "attachment; filename*=UTF-8''" + urllib.parse.quote('invoice.txt', safe='')

        # Load the PDF into memory; the `with` block closes the input
        # file as soon as its content has been read.
        with open('/path/to/hello_world.pdf', 'rb') as pdf_file:
            pdf_data = pdf_file.read()

        # Run the conversion and write the result to the output stream.
        client.convertRawDataToStream(pdf_data, response)
        return response
    except pdfcrowd.Error as why:
        # Send the error in the HTTP response. The message is converted
        # to text explicitly instead of passing the exception object.
        return HttpResponse(
            str(why), status=why.getStatusCode(), content_type='text/plain')

Flask examples

PDF file to text in Flask

import urllib.parse
from flask import Flask, Response
import pdfcrowd

app = Flask(__name__)

# The recommended method is POST.
@app.route('/', methods=['POST'])
def convert():
    """Convert a PDF file to text and return it as a file attachment.

    The conversion result is served as ``text/plain`` with the file name
    ``invoice.txt``. If the conversion raises ``pdfcrowd.Error``, a
    plain-text response with the error message and the error's status
    code is returned instead.
    """
    try:
        # Create an API client instance.
        client = pdfcrowd.PdfToTextClient('demo', 'demo')

        # Run the conversion and store the result in the `txt` variable.
        txt = client.convertFile('/path/to/invoice.pdf')

        # Send the result and set HTTP response headers.
        response = Response(txt, mimetype='text/plain')
        response.headers['Cache-Control'] = 'max-age=0'
        response.headers['Accept-Ranges'] = 'none'
        response.headers['Content-Disposition'] = "attachment; filename*=UTF-8''" + urllib.parse.quote('invoice.txt', safe='')
        return response
    except pdfcrowd.Error as why:
        # Send the error in the HTTP response.
        return Response(
            str(why), status=why.getStatusCode(), mimetype='text/plain')

PDF url to text in Flask

import urllib.parse
from flask import Flask, Response
import pdfcrowd

app = Flask(__name__)

# The recommended method is POST.
@app.route('/', methods=['POST'])
def convert():
    """Convert a PDF at a URL to text and return it as a file attachment.

    The conversion result is served as ``text/plain`` with the file name
    ``invoice.txt``. If the conversion raises ``pdfcrowd.Error``, a
    plain-text response with the error message and the error's status
    code is returned instead.
    """
    try:
        # Create an API client instance.
        client = pdfcrowd.PdfToTextClient('demo', 'demo')

        # Run the conversion and store the result in the `txt` variable.
        txt = client.convertUrl('https://pdfcrowd.com/static/pdf/apisamples/invoice.pdf')

        # Send the result and set HTTP response headers.
        response = Response(txt, mimetype='text/plain')
        response.headers['Cache-Control'] = 'max-age=0'
        response.headers['Accept-Ranges'] = 'none'
        response.headers['Content-Disposition'] = "attachment; filename*=UTF-8''" + urllib.parse.quote('invoice.txt', safe='')
        return response
    except pdfcrowd.Error as why:
        # Send the error in the HTTP response.
        return Response(
            str(why), status=why.getStatusCode(), mimetype='text/plain')

In-memory PDF to text in Flask

import urllib.parse
from flask import Flask, Response
import pdfcrowd

app = Flask(__name__)

# The recommended method is POST.
@app.route('/', methods=['POST'])
def convert():
    """Convert in-memory PDF data to text and return it as a file attachment.

    The PDF is read from disk into memory and converted; the result is
    served as ``text/plain`` with the file name ``invoice.txt``. If the
    conversion raises ``pdfcrowd.Error``, a plain-text response with the
    error message and the error's status code is returned instead.
    """
    try:
        # Create an API client instance.
        client = pdfcrowd.PdfToTextClient('demo', 'demo')

        # Load the PDF into memory; the `with` block closes the input
        # file as soon as its content has been read.
        with open('/path/to/hello_world.pdf', 'rb') as pdf_file:
            pdf_data = pdf_file.read()

        # Run the conversion and store the result in the `txt` variable.
        txt = client.convertRawData(pdf_data)

        # Send the result and set HTTP response headers.
        response = Response(txt, mimetype='text/plain')
        response.headers['Cache-Control'] = 'max-age=0'
        response.headers['Accept-Ranges'] = 'none'
        response.headers['Content-Disposition'] = "attachment; filename*=UTF-8''" + urllib.parse.quote('invoice.txt', safe='')
        return response
    except pdfcrowd.Error as why:
        # Send the error in the HTTP response.
        return Response(
            str(why), status=why.getStatusCode(), mimetype='text/plain')