# convert_html_to_pdf.py

#! /usr/bin/python3
# ===================================================================
# create combined pdf file from html files
#
# Based on:      www.geeksforgeeks.org/python-convert-html-pdf/
# Documentation: pdfkit.org/docs/guide.pdf
#
# Based on:      stackoverflow.com/questions/3444645/merge-pdf-files
#
# look at...
#     www.programcreek.com/python/example/100586/pdfkit.from_file
# ===================================================================

import pdfkit
from PyPDF2 import PdfFileMerger
import os, sys

# ---- options passed to pdfkit.from_file() for every conversion
# ---- (per the pdfkit documentation a value of None marks an option
# ----  that takes no argument, and a list of tuples repeats the
# ----  option once per tuple)
options = {
    # ---- page layout
    'page-size':     'Letter',
    'margin-top':    '0.5in',
    'margin-right':  '0.5in',
    'margin-bottom': '0.5in',
    'margin-left':   '0.5in',

    # ---- text encoding
    'encoding': 'UTF-8',

    # ---- repeatable options
    'custom-header': [('Accept-Encoding', 'gzip')],
    'cookie': [
        ('cookie-name1', 'cookie-value1'),
        ('cookie-name2', 'cookie-value2'),
    ],

    # ---- option without a value
    'no-outline': None,
}

# ---- html input files, listed in the order they are converted
# ---- and therefore the order they appear in the merged pdf
htmlfls = [
    './test/index.html',
    './test/project_001.html',
    './test/project_002.html',
    './test/project_003.html',
    './test/project_004.html',
    './test/project_005.html',
    './test/project_006.html',
    './test/project_008.html',
    './test/project_010.html',
    './test/project_011.html',
    './test/project_015.html',
    './test/project_016.html',
    './test/project_017.html',
    './test/project_018.html',
    './test/project_019a.html',
    './test/project_019b.html',
    './test/project_019.html',
    './test/project_020.html',
    './test/project_021.html',
    './test/project_022.html',
    './test/project_100.html',
    './test/project_507.html',
]


# -------------------------------------------------------------------
# ---- function: file exists
# -------------------------------------------------------------------

def file_exists(f):
    """Report whether the path f exists.

    Prints a 'does not exist' message when the path is missing.

    Returns True when os.path.exists(f) is true, otherwise False.
    """
    found = os.path.exists(f)
    if not found:
        print(f'The file {f} does not exist')
    return found

# -------------------------------------------------------------------
# ---- function: delete files
# -------------------------------------------------------------------

def delete_files(lst):
    """Delete the files named in lst, stopping at the first missing one.

    lst -- iterable of file names to remove

    Returns (ok, cnt):
      ok  -- True when every file was removed, False when a missing
             file was encountered (the remaining files are not touched)
      cnt -- number of files removed
    """
    cnt = 0                    # file count
    for f in lst:
        # ---- try the removal and handle the failure instead of
        # ---- testing for the file first; this reports a missing
        # ---- file exactly once and leaves no gap between the
        # ---- existence test and the removal
        try:
            os.remove(f)
        except FileNotFoundError:
            print(f'The file {f} does not exist')
            return (False,cnt)
        cnt += 1
    return (True,cnt)

# -------------------------------------------------------------------
# ---- function: merge pdf files into a single file
# -------------------------------------------------------------------

def merge_pdf_files(pdfs,outfile='merged.pdf'):
    """Merge pdf files into a single pdf file.

    pdfs    -- iterable of pdf file names, appended in the order given
    outfile -- name of the merged pdf file to write

    Returns the number of pdf files appended.
    """

    cnt = 0                    # pdf file count (stays 0 for no input)

    merger = PdfFileMerger()

    # ---- always close the merger, even when append or write
    # ---- raises, so whatever it holds open is released
    try:
        for cnt, f in enumerate(pdfs, 1):
            merger.append(f)

        merger.write(outfile)
    finally:
        merger.close()

    return cnt

# -------------------------------------------------------------------
# ---- function: create pdf files from html files
# -------------------------------------------------------------------

def create_pdf_from_html(lst):
    """Convert each html file in lst to its own pdf file.

    The output files are named ./file_NNN.pdf, where NNN is the
    1-based position of the input file in lst.

    lst -- sequence of html file names

    Returns (pdfs, cnt):
      pdfs -- names of the pdf files available for merging, in order
      cnt  -- number of input files processed, or False when an input
              file does not exist (processing stops at that file and
              pdfs holds the output files collected so far)
    """

    cnt  = 0                   # file count (stays 0 for empty input)
    pdfs = []                  # pdf file names

    for cnt, f in enumerate(lst, 1):

        print(f'[{cnt:03}] {f}')     # display count and input file

        if not file_exists(f):       # file exists?
            return (pdfs,False)

        ff = f'./file_{cnt:03}.pdf'  # output file

        # ---- convert html file to pdf file
        # ---- pdfkit can raise an error even though the output
        # ----   file was written, so a failure is reported and
        # ----   the run continues instead of crashing
        # ---- Exception (not a bare except) keeps Ctrl-C and
        # ----   sys.exit() working

        try:
            pdfkit.from_file(f, ff, options=options)
        except Exception as err:
            print(f'warning: pdfkit reported an error for {f}: {err}')
            if not os.path.exists(ff):
                # ---- nothing was written; do not hand a missing
                # ---- file to the merge step
                print(f'skipping {f}: {ff} was not created')
                continue

        pdfs.append(ff)       # save output file names for
                              #   later processing

    return (pdfs,cnt)


# -------------------------------------------------------------------
# ---- function: get list files in directory
# -------------------------------------------------------------------

def get_list_files_in_dir(directory):
    """Return the entry names in directory as given by os.listdir()."""
    return os.listdir(directory)

# -------------------------------------------------------------------
# ---- function: display list files in directory
# -------------------------------------------------------------------

def display_files_in_dir(directory):
    """Print the entry names in directory between two ruler lines.

    A blank line is printed first, then the header ruler, one entry
    name per line, and the closing ruler.
    """
    header = '---files-in-dir----------------------------------'
    footer = '-------------------------------------------------'

    print()
    print(header)
    for name in get_list_files_in_dir(directory):
        print(name)
    print(footer)

# -------------------------------------------------------------------
# ---- main
# -------------------------------------------------------------------

def main():
    """Convert the html files to pdf, merge them and clean up.

    Returns the process exit status: 0 when the merged pdf was
    written, 1 when an input file was missing or no pdf file was
    created (no merged file is written in either case).
    """

    print()
    print('---- create initial pdf files from html files')

    (pdfs,cnt) = create_pdf_from_html(htmlfls)

    # ---- create_pdf_from_html() returns False in place of the count
    # ---- when an input file is missing; do not merge a partial set

    if cnt is False:
        print('error: an input html file is missing, nothing merged')
        delete_files(pdfs)           # remove the partial output
        return 1

    print(f'created {cnt} pdf files')

    if not pdfs:
        print('error: no pdf files were created, nothing merged')
        return 1


    print()
    print('---- merge pdf files into a single pdf file')

    cnt = merge_pdf_files(pdfs)

    print(f'merged {cnt} pdf files')


    print()
    print('---- delete initial pdf files')

    (tf,cnt) = delete_files(pdfs)

    print(f'deleted {cnt} initial pdf files')


    # ---- exit program

    print()

    return 0


if __name__ == '__main__':
    sys.exit(main())