PDF to HTML Python Examples

This page contains various examples of using the PDF to HTML API in Python. The examples are complete and fully functional. Read more about how to convert PDF to HTML in Python.

Basic examples
Django examples
Flask examples

Basic examples

PDF file to HTML file

import pdfcrowd
import sys

try:
    # instantiate the converter with the demo credentials
    client = pdfcrowd.PdfToHtmlClient(
        'demo',
        'ce544b6ea52a5621fb9d55f8b542d14d',
    )

    # convert the local PDF and save the generated HTML to a file
    source_pdf = '/path/to/logo.pdf'
    target_html = 'logo.html'
    client.convertFileToFile(source_pdf, target_html)

except pdfcrowd.Error as error:
    # report the failure on stderr, then let the exception propagate
    message = 'Pdfcrowd Error: {}\n'.format(error)
    sys.stderr.write(message)
    raise

PDF file to in-memory HTML

import pdfcrowd
import sys

try:
    # instantiate the converter with the demo credentials
    client = pdfcrowd.PdfToHtmlClient(
        'demo',
        'ce544b6ea52a5621fb9d55f8b542d14d',
    )

    # convert the local PDF; the result is kept in the "html" variable
    source_pdf = '/path/to/logo.pdf'
    html = client.convertFile(source_pdf)

    # "html" now holds the raw HTML data, ready to be sent in an
    # HTTP response, saved to a file, etc.

except pdfcrowd.Error as error:
    # report the failure on stderr, then let the exception propagate
    message = 'Pdfcrowd Error: {}\n'.format(error)
    sys.stderr.write(message)
    raise

PDF file to HTML stream

import pdfcrowd
import sys

try:
    # create the API client instance
    client = pdfcrowd.PdfToHtmlClient('demo', 'ce544b6ea52a5621fb9d55f8b542d14d')

    # create an output stream for the conversion result; the "with" block
    # closes the file even when the conversion raises an error
    with open('logo.html', 'wb') as output_stream:
        # run the conversion and write the result into the output stream
        client.convertFileToStream('/path/to/logo.pdf', output_stream)

except pdfcrowd.Error as why:
    # report the failure on stderr and re-raise it for the caller
    sys.stderr.write('Pdfcrowd Error: {}\n'.format(why))
    raise

PDF URL to HTML file

import pdfcrowd
import sys

try:
    # instantiate the converter with the demo credentials
    client = pdfcrowd.PdfToHtmlClient(
        'demo',
        'ce544b6ea52a5621fb9d55f8b542d14d',
    )

    # convert the remote PDF and save the generated HTML to a file
    source_url = 'https://pdfcrowd.com/static/pdf/apisamples/invoice.pdf'
    target_html = 'invoice.html'
    client.convertUrlToFile(source_url, target_html)

except pdfcrowd.Error as error:
    # report the failure on stderr, then let the exception propagate
    message = 'Pdfcrowd Error: {}\n'.format(error)
    sys.stderr.write(message)
    raise

PDF URL to in-memory HTML

import pdfcrowd
import sys

try:
    # instantiate the converter with the demo credentials
    client = pdfcrowd.PdfToHtmlClient(
        'demo',
        'ce544b6ea52a5621fb9d55f8b542d14d',
    )

    # convert the remote PDF; the result is kept in the "html" variable
    source_url = 'https://pdfcrowd.com/static/pdf/apisamples/invoice.pdf'
    html = client.convertUrl(source_url)

    # "html" now holds the raw HTML data, ready to be sent in an
    # HTTP response, saved to a file, etc.

except pdfcrowd.Error as error:
    # report the failure on stderr, then let the exception propagate
    message = 'Pdfcrowd Error: {}\n'.format(error)
    sys.stderr.write(message)
    raise

PDF URL to HTML stream

import pdfcrowd
import sys

try:
    # create the API client instance
    client = pdfcrowd.PdfToHtmlClient('demo', 'ce544b6ea52a5621fb9d55f8b542d14d')

    # create an output stream for the conversion result; the "with" block
    # closes the file even when the conversion raises an error
    with open('invoice.html', 'wb') as output_stream:
        # run the conversion and write the result into the output stream
        client.convertUrlToStream('https://pdfcrowd.com/static/pdf/apisamples/invoice.pdf', output_stream)

except pdfcrowd.Error as why:
    # report the failure on stderr and re-raise it for the caller
    sys.stderr.write('Pdfcrowd Error: {}\n'.format(why))
    raise

In-memory PDF to HTML file

import pdfcrowd
import sys

try:
    # create the API client instance
    client = pdfcrowd.PdfToHtmlClient('demo', 'ce544b6ea52a5621fb9d55f8b542d14d')

    # load the input PDF into memory; the "with" block closes the file
    # as soon as its content has been read
    with open('/path/to/hello_world.pdf', 'rb') as input_file:
        pdf_data = input_file.read()

    # run the conversion and write the result to a file
    client.convertRawDataToFile(pdf_data, 'logo.html')

except pdfcrowd.Error as why:
    # report the failure on stderr and re-raise it for the caller
    sys.stderr.write('Pdfcrowd Error: {}\n'.format(why))
    raise

In-memory PDF to in-memory HTML

import pdfcrowd
import sys

try:
    # create the API client instance
    client = pdfcrowd.PdfToHtmlClient('demo', 'ce544b6ea52a5621fb9d55f8b542d14d')

    # load the input PDF into memory; the "with" block closes the file
    # as soon as its content has been read
    with open('/path/to/hello_world.pdf', 'rb') as input_file:
        pdf_data = input_file.read()

    # run the conversion and store the result into the "html" variable
    html = client.convertRawData(pdf_data)

    # at this point the "html" variable contains HTML raw data and
    # can be sent in an HTTP response, saved to a file, etc.

except pdfcrowd.Error as why:
    # report the failure on stderr and re-raise it for the caller
    sys.stderr.write('Pdfcrowd Error: {}\n'.format(why))
    raise

In-memory PDF to HTML stream

import pdfcrowd
import sys

try:
    # create the API client instance
    client = pdfcrowd.PdfToHtmlClient('demo', 'ce544b6ea52a5621fb9d55f8b542d14d')

    # load the input PDF into memory; the "with" block closes the file
    # as soon as its content has been read
    with open('/path/to/hello_world.pdf', 'rb') as input_file:
        pdf_data = input_file.read()

    # create an output stream for the conversion result; the "with" block
    # closes the file even when the conversion raises an error
    with open('logo.html', 'wb') as output_stream:
        # run the conversion and write the result into the output stream
        client.convertRawDataToStream(pdf_data, output_stream)

except pdfcrowd.Error as why:
    # report the failure on stderr and re-raise it for the caller
    sys.stderr.write('Pdfcrowd Error: {}\n'.format(why))
    raise

Get info about the current conversion

import pdfcrowd
import sys

try:
    # instantiate the converter with the demo credentials
    client = pdfcrowd.PdfToHtmlClient(
        'demo',
        'ce544b6ea52a5621fb9d55f8b542d14d',
    )

    # turn on the debug log for this conversion
    client.setDebugLog(True)

    # convert the local PDF and save the generated HTML to a file
    source_pdf = '/path/to/logo.pdf'
    target_html = 'logo.html'
    client.convertFileToFile(source_pdf, target_html)

    # URL of the debug log
    debug_log_url = client.getDebugLogUrl()
    print('Debug log url: {}'.format(debug_log_url))

    # number of conversion credits remaining in your account
    remaining_credits = client.getRemainingCreditCount()
    print('Remaining credit count: {}'.format(remaining_credits))

    # number of credits used for the conversion
    consumed_credits = client.getConsumedCreditCount()
    print('Consumed credit count: {}'.format(consumed_credits))

    # unique identifier for the conversion
    job_id = client.getJobId()
    print('Job id: {}'.format(job_id))

    # total number of pages in the output document
    page_count = client.getPageCount()
    print('Page count: {}'.format(page_count))

    # size of the output data in bytes
    output_size = client.getOutputSize()
    print('Output size: {}'.format(output_size))

except pdfcrowd.Error as error:
    # report the failure on stderr, then let the exception propagate
    message = 'Pdfcrowd Error: {}\n'.format(error)
    sys.stderr.write(message)
    raise

Django examples

PDF file to HTML in Django

import urllib.parse
from django.http import HttpResponse
from django.views.decorators.http import require_POST
import pdfcrowd

# the recommended method is POST
@require_POST
def convert(request):
    """Convert a local PDF to HTML and stream it into the HTTP response.

    On success the converted document is returned as a ``text/html``
    attachment. When Pdfcrowd reports an error, a ``text/plain`` response
    is returned that carries the error message and uses the error code as
    the HTTP status.
    """
    try:
        # instantiate the converter with the demo credentials
        client = pdfcrowd.PdfToHtmlClient(
            'demo',
            'ce544b6ea52a5621fb9d55f8b542d14d',
        )

        # prepare the response and its download-related headers
        response = HttpResponse(content_type='text/html')
        response['Cache-Control'] = 'max-age=0'
        response['Accept-Ranges'] = 'none'
        encoded_name = urllib.parse.quote('logo.html', safe='')
        response['Content-Disposition'] = "attachment; filename*=UTF-8''" + encoded_name

        # the response object doubles as the output stream of the conversion
        source_pdf = '/path/to/logo.pdf'
        client.convertFileToStream(source_pdf, response)
        return response
    except pdfcrowd.Error as error:
        # pass the failure on to the HTTP client
        error_text = error.getMessage()
        error_status = error.getCode()
        return HttpResponse(error_text,
                            status=error_status,
                            content_type='text/plain')

PDF URL to HTML in Django

import urllib.parse
from django.http import HttpResponse
from django.views.decorators.http import require_POST
import pdfcrowd

# the recommended method is POST
@require_POST
def convert(request):
    """Convert a remote PDF to HTML and stream it into the HTTP response.

    On success the converted document is returned as a ``text/html``
    attachment. When Pdfcrowd reports an error, a ``text/plain`` response
    is returned that carries the error message and uses the error code as
    the HTTP status.
    """
    try:
        # instantiate the converter with the demo credentials
        client = pdfcrowd.PdfToHtmlClient(
            'demo',
            'ce544b6ea52a5621fb9d55f8b542d14d',
        )

        # prepare the response and its download-related headers
        response = HttpResponse(content_type='text/html')
        response['Cache-Control'] = 'max-age=0'
        response['Accept-Ranges'] = 'none'
        encoded_name = urllib.parse.quote('invoice.html', safe='')
        response['Content-Disposition'] = "attachment; filename*=UTF-8''" + encoded_name

        # the response object doubles as the output stream of the conversion
        source_url = 'https://pdfcrowd.com/static/pdf/apisamples/invoice.pdf'
        client.convertUrlToStream(source_url, response)
        return response
    except pdfcrowd.Error as error:
        # pass the failure on to the HTTP client
        error_text = error.getMessage()
        error_status = error.getCode()
        return HttpResponse(error_text,
                            status=error_status,
                            content_type='text/plain')

In-memory PDF to HTML in Django

import urllib.parse
from django.http import HttpResponse
from django.views.decorators.http import require_POST
import pdfcrowd

# the recommended method is POST
@require_POST
def convert(request):
    """Convert an in-memory PDF to HTML and stream it into the HTTP response.

    The PDF is read from disk into memory, converted, and written into a
    ``text/html`` attachment response. When Pdfcrowd reports an error, a
    ``text/plain`` response is returned that carries the error message and
    uses the error code as the HTTP status.
    """
    try:
        # create the API client instance
        client = pdfcrowd.PdfToHtmlClient('demo', 'ce544b6ea52a5621fb9d55f8b542d14d')

        # set HTTP response headers
        response = HttpResponse(content_type='text/html')
        response['Cache-Control'] = 'max-age=0'
        response['Accept-Ranges'] = 'none'
        response['Content-Disposition'] = "attachment; filename*=UTF-8''" + urllib.parse.quote('logo.html', safe='')

        # load the input PDF into memory; the "with" block closes the file
        # as soon as its content has been read
        with open('/path/to/hello_world.pdf', 'rb') as input_file:
            pdf_data = input_file.read()

        # run the conversion and write the result into the output stream
        client.convertRawDataToStream(pdf_data, response)
        return response
    except pdfcrowd.Error as why:
        # send the error in the HTTP response
        return HttpResponse(why.getMessage(),
                            status=why.getCode(),
                            content_type='text/plain')

Flask examples

PDF file to HTML in Flask

import urllib.parse
from flask import Flask, Response
import pdfcrowd

app = Flask(__name__)

# the recommended method is POST
@app.route('/', methods=['POST'])
def convert():
    """Convert a local PDF to HTML and return it as a file download.

    On success the converted document is returned as a ``text/html``
    attachment. When Pdfcrowd reports an error, a ``text/plain`` response
    is returned that carries the error message and uses the error code as
    the HTTP status.
    """
    try:
        # create the API client instance
        client = pdfcrowd.PdfToHtmlClient('demo', 'ce544b6ea52a5621fb9d55f8b542d14d')

        # run the conversion and store the result into the "html" variable
        html = client.convertFile('/path/to/logo.pdf')

        # send the result and set HTTP response headers
        response = Response(html, mimetype='text/html')
        response.headers['Cache-Control'] = 'max-age=0'
        response.headers['Accept-Ranges'] = 'none'
        response.headers['Content-Disposition'] = "attachment; filename*=UTF-8''" + urllib.parse.quote('logo.html', safe='')
        return response
    except pdfcrowd.Error as why:
        # send the error in the HTTP response
        return Response(why.getMessage(),
                        status=why.getCode(),
                        mimetype='text/plain')

PDF URL to HTML in Flask

import urllib.parse
from flask import Flask, Response
import pdfcrowd

app = Flask(__name__)

# the recommended method is POST
@app.route('/', methods=['POST'])
def convert():
    """Convert a remote PDF to HTML and return it as a file download.

    On success the converted document is returned as a ``text/html``
    attachment. When Pdfcrowd reports an error, a ``text/plain`` response
    is returned that carries the error message and uses the error code as
    the HTTP status.
    """
    try:
        # create the API client instance
        client = pdfcrowd.PdfToHtmlClient('demo', 'ce544b6ea52a5621fb9d55f8b542d14d')

        # run the conversion and store the result into the "html" variable
        html = client.convertUrl('https://pdfcrowd.com/static/pdf/apisamples/invoice.pdf')

        # send the result and set HTTP response headers
        response = Response(html, mimetype='text/html')
        response.headers['Cache-Control'] = 'max-age=0'
        response.headers['Accept-Ranges'] = 'none'
        response.headers['Content-Disposition'] = "attachment; filename*=UTF-8''" + urllib.parse.quote('invoice.html', safe='')
        return response
    except pdfcrowd.Error as why:
        # send the error in the HTTP response
        return Response(why.getMessage(),
                        status=why.getCode(),
                        mimetype='text/plain')

In-memory PDF to HTML in Flask

import urllib.parse
from flask import Flask, Response
import pdfcrowd

app = Flask(__name__)

# the recommended method is POST
@app.route('/', methods=['POST'])
def convert():
    """Convert an in-memory PDF to HTML and return it as a file download.

    The PDF is read from disk into memory and converted. On success the
    result is returned as a ``text/html`` attachment. When Pdfcrowd reports
    an error, a ``text/plain`` response is returned that carries the error
    message and uses the error code as the HTTP status.
    """
    try:
        # create the API client instance
        client = pdfcrowd.PdfToHtmlClient('demo', 'ce544b6ea52a5621fb9d55f8b542d14d')

        # load the input PDF into memory; the "with" block closes the file
        # as soon as its content has been read
        with open('/path/to/hello_world.pdf', 'rb') as input_file:
            pdf_data = input_file.read()

        # run the conversion and store the result into the "html" variable
        html = client.convertRawData(pdf_data)

        # send the result and set HTTP response headers
        response = Response(html, mimetype='text/html')
        response.headers['Cache-Control'] = 'max-age=0'
        response.headers['Accept-Ranges'] = 'none'
        response.headers['Content-Disposition'] = "attachment; filename*=UTF-8''" + urllib.parse.quote('logo.html', safe='')
        return response
    except pdfcrowd.Error as why:
        # send the error in the HTTP response
        return Response(why.getMessage(),
                        status=why.getCode(),
                        mimetype='text/plain')