PDF/Office Converter Python Tutorial

11 min

PDF/Office Conversion Tool Python Tutorial

1. Environment Setup

First, install the necessary libraries:

# PDF operation libraries
pip install PyPDF2==3.0.0
pip install pdf2image==1.16.0
pip install pdfplumber==0.10.0

# Office operation libraries
pip install python-docx==1.1.0
pip install openpyxl==3.1.2
pip install python-pptx==0.6.21

# Image processing
pip install pillow==10.0.0

# PDF generation (required by the Word/Excel/PPT → PDF converters below)
pip install reportlab

# For PDF to image conversion and OCR
pip install pytesseract==0.3.10

2. Complete Conversion Tool Code

"""
PDF/Office Conversion Tool
Supported formats: PDF, Word, Excel, PPT
Implemented using free open-source libraries
"""

import os
import sys
from pathlib import Path
import logging
from typing import Union, Optional

# PDF processing
import PyPDF2
from pdf2image import convert_from_path
import pdfplumber

# Office processing
from docx import Document
from docx.shared import Inches
import openpyxl
from openpyxl import load_workbook
from pptx import Presentation
from PIL import Image

# Setup logging: configure the root logger once (INFO and above, timestamped)
# and create the module-level logger used by every converter class below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class PDFToOffice:
    """Convert PDF files to Word, Excel and PowerPoint documents.

    Every converter is a static method that extracts page text and tables
    with pdfplumber, logs progress through the module-level ``logger`` and
    reports failure by returning ``False`` rather than raising.
    """

    # Code points that XML 1.0 forbids: every C0 control character except
    # tab, line feed and carriage return. Text extracted from PDFs regularly
    # contains them (form feeds, NULs) and the Office writers refuse strings
    # that do, which would otherwise abort the whole conversion.
    _XML_ILLEGAL = {code: None for code in (*range(0, 9), 11, 12, *range(14, 32))}

    @staticmethod
    def _clean_text(value) -> str:
        """Return ``value`` as a string without XML-illegal control characters.

        ``None`` (what pdfplumber returns for empty pages/cells) becomes ``""``.
        """
        if value is None:
            return ""
        return str(value).translate(PDFToOffice._XML_ILLEGAL)

    @staticmethod
    def _normalize_table(table) -> list:
        """Return ``table`` as a rectangular list of cleaned string rows.

        Short rows are padded with ``""`` up to the widest row so the result
        can be copied into a fixed-size grid. A table without any cell yields
        an empty list.
        """
        rows = [list(row or []) for row in (table or [])]
        width = max((len(row) for row in rows), default=0)
        if width == 0:
            return []
        return [
            [PDFToOffice._clean_text(cell) for cell in row] + [""] * (width - len(row))
            for row in rows
        ]

    @staticmethod
    def pdf_to_word(pdf_path: str, word_path: str) -> bool:
        """
        Convert PDF to Word document

        Each PDF page becomes a "Page N" heading followed by the page text
        and one Word table per extracted table; pages are separated by page
        breaks. Layout, fonts and images are not preserved.

        Args:
            pdf_path: Path to PDF file
            word_path: Path to output Word file

        Returns:
            bool: Whether conversion was successful
        """
        try:
            logger.info(f"Starting PDF to Word conversion: {pdf_path}")

            doc = Document()

            with pdfplumber.open(pdf_path) as pdf:
                page_count = len(pdf.pages)
                for page_num, page in enumerate(pdf.pages, 1):
                    doc.add_heading(f'Page {page_num}', level=2)

                    text = PDFToOffice._clean_text(page.extract_text())
                    if text:
                        doc.add_paragraph(text)

                    for table in page.extract_tables():
                        rows = PDFToOffice._normalize_table(table)
                        if not rows:
                            continue
                        # Rows are padded to a common width, so a row longer
                        # than the first one can no longer index past the grid.
                        word_table = doc.add_table(rows=len(rows), cols=len(rows[0]))
                        for i, row in enumerate(rows):
                            for j, cell in enumerate(row):
                                if cell:
                                    word_table.cell(i, j).text = cell
                        doc.add_paragraph()  # Empty line after each table

                    # Page break between pages, but not after the last one
                    if page_num < page_count:
                        doc.add_page_break()

            doc.save(word_path)
            logger.info(f"PDF to Word conversion complete: {word_path}")
            return True

        except Exception as e:
            logger.error(f"PDF to Word conversion failed: {str(e)}")
            return False

    @staticmethod
    def pdf_to_excel(pdf_path: str, excel_path: str) -> bool:
        """
        Convert PDF to Excel (mainly extract tables)

        One worksheet ("Page1", "Page2", ...) is written per PDF page. The
        page's tables are stacked top to bottom with one empty row between
        them, followed by an "Extracted text:" label and the page text.

        Args:
            pdf_path: Path to PDF file
            excel_path: Path to output Excel file

        Returns:
            bool: Whether conversion was successful
        """
        try:
            logger.info(f"Starting PDF to Excel conversion: {pdf_path}")

            wb = openpyxl.Workbook()

            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    sheet_title = f"Page{page_num}"
                    if page_num == 1:
                        # Reuse the workbook's default sheet instead of
                        # deleting it afterwards: a workbook must keep at
                        # least one sheet to be saved, even for a PDF
                        # without pages.
                        ws = wb.active
                        ws.title = sheet_title
                    else:
                        ws = wb.create_sheet(title=sheet_title)

                    # First free row; advanced after each table so tables do
                    # not overwrite one another.
                    next_row = 1
                    for table in page.extract_tables():
                        rows = PDFToOffice._normalize_table(table)
                        for offset, row in enumerate(rows):
                            for col_idx, cell in enumerate(row, 1):
                                if cell:
                                    ws.cell(row=next_row + offset, column=col_idx, value=cell)
                        if rows:
                            next_row += len(rows) + 1  # +1: blank separator row

                    text = PDFToOffice._clean_text(page.extract_text())
                    if text:
                        # With no tables this keeps the label on row 2 and
                        # the text on row 3.
                        label_row = max(next_row, 2)
                        ws.cell(row=label_row, column=1, value="Extracted text:")
                        ws.cell(row=label_row + 1, column=1, value=text)

            wb.save(excel_path)
            logger.info(f"PDF to Excel conversion complete: {excel_path}")
            return True

        except Exception as e:
            logger.error(f"PDF to Excel conversion failed: {str(e)}")
            return False

    @staticmethod
    def pdf_to_ppt(pdf_path: str, ppt_path: str, max_chars: Optional[int] = 500) -> bool:
        """
        Convert PDF to PPT (each page as a slide)

        Every slide gets the title "Page N" and, when the page has text, one
        text box holding that text.

        Args:
            pdf_path: Path to PDF file
            ppt_path: Path to output PPT file
            max_chars: Maximum number of characters of page text placed on a
                slide; ``None`` keeps the full text. Defaults to 500.

        Returns:
            bool: Whether conversion was successful
        """
        try:
            logger.info(f"Starting PDF to PPT conversion: {pdf_path}")

            # Use python-pptx's own length helper for slide geometry rather
            # than the python-docx ``Inches`` imported at module level.
            from pptx.util import Inches as SlideInches

            prs = Presentation()
            # Layout 5 of the default template is "Title Only" (not blank),
            # which is why the slide has a title placeholder to fill in.
            slide_layout = prs.slide_layouts[5]

            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    slide = prs.slides.add_slide(slide_layout)

                    title = slide.shapes.title
                    if title:
                        title.text = f"Page {page_num}"

                    text = PDFToOffice._clean_text(page.extract_text())
                    if not text:
                        continue

                    textbox = slide.shapes.add_textbox(
                        left=SlideInches(1), top=SlideInches(1.5),
                        width=SlideInches(8), height=SlideInches(5)
                    )
                    text_frame = textbox.text_frame
                    # Wrap inside the box instead of running off the slide.
                    text_frame.word_wrap = True
                    text_frame.text = text if max_chars is None else text[:max_chars]

            prs.save(ppt_path)
            logger.info(f"PDF to PPT conversion complete: {ppt_path}")
            return True

        except Exception as e:
            logger.error(f"PDF to PPT conversion failed: {str(e)}")
            return False


class OfficeToPDF:
    """Render the text content of Office documents into simple PDFs.

    Every converter is a static method that draws plain text (and, for
    Excel, a basic cell grid) with reportlab, logs progress through the
    module-level ``logger`` and reports failure by returning ``False``
    rather than raising. Original formatting, images and fonts are not
    reproduced. reportlab is imported inside each method because it is not
    part of the file-level imports.
    """

    @staticmethod
    def _fit_on_page(y: float, needed: float, top: float, bottom: float) -> tuple:
        """Decide where to draw an item that extends ``needed`` points below ``y``.

        Returns ``(draw_y, new_page)``: ``(y, False)`` when the item stays at
        or above ``bottom``, otherwise ``(top, True)`` to signal that the
        caller must start a new page and draw at ``top``.
        """
        if y - needed < bottom:
            return top, True
        return y, False

    @staticmethod
    def word_to_pdf(word_path: str, pdf_path: str) -> bool:
        """
        Convert Word to PDF (text of the body paragraphs only)

        Paragraph text is written line by line on portrait letter pages and
        wrapped to the printable width. Tables, images and styling of the
        Word document are not carried over.

        Args:
            word_path: Path to Word file
            pdf_path: Path to output PDF file

        Returns:
            bool: Whether conversion was successful
        """
        try:
            logger.info(f"Starting Word to PDF conversion: {word_path}")

            from reportlab.lib.pagesizes import letter
            from reportlab.lib.utils import simpleSplit
            from reportlab.pdfgen import canvas

            doc = Document(word_path)

            font_name, font_size = "Helvetica", 12
            margin = 50
            top_y, bottom_y = 750, 50
            line_height = 15
            max_width = letter[0] - 2 * margin

            # Draw straight into the destination file. A fixed-name temporary
            # PDF in the working directory would collide when several
            # conversions run in parallel and would be left behind on errors.
            pdf = canvas.Canvas(pdf_path, pagesize=letter)
            pdf.setFont(font_name, font_size)
            y = top_y

            for paragraph in doc.paragraphs:
                for raw_line in paragraph.text.split('\n'):
                    if not raw_line.strip():
                        continue
                    # Wrap by measured width instead of cutting the line off.
                    for line in simpleSplit(raw_line, font_name, font_size, max_width):
                        y, new_page = OfficeToPDF._fit_on_page(y, 0, top_y, bottom_y)
                        if new_page:
                            pdf.showPage()
                            # showPage resets the graphics state
                            pdf.setFont(font_name, font_size)
                        pdf.drawString(margin, y, line)
                        y -= line_height

            pdf.save()

            logger.info(f"Word to PDF conversion complete: {pdf_path}")
            return True

        except Exception as e:
            logger.error(f"Word to PDF conversion failed: {str(e)}")
            return False

    @staticmethod
    def excel_to_pdf(excel_path: str, pdf_path: str) -> bool:
        """
        Convert Excel to PDF

        Each worksheet is rendered on landscape letter pages as a title
        followed by a bordered grid of its cells. Only the first 8 columns
        are drawn and each cell shows at most 15 characters.

        Args:
            excel_path: Path to Excel file
            pdf_path: Path to output PDF file

        Returns:
            bool: Whether conversion was successful
        """
        try:
            logger.info(f"Starting Excel to PDF conversion: {excel_path}")

            from reportlab.lib.pagesizes import landscape, letter
            from reportlab.pdfgen import canvas

            wb = load_workbook(excel_path)
            pdf = canvas.Canvas(pdf_path, pagesize=landscape(letter))

            margin = 50
            # Landscape letter is 612pt high, so the top of the drawing area
            # must stay at 550 on every page, including after a page break.
            top_y, bottom_y = 550, 50
            title_height = 30
            row_height = 15
            col_width = 80
            max_cols = 8
            max_cell_chars = 15
            sheet_gap = 30

            y = top_y

            # ``worksheets`` skips chartsheets, which have no cells to iterate.
            for ws in wb.worksheets:
                # Keep the title together with at least one table row.
                y, new_page = OfficeToPDF._fit_on_page(
                    y, title_height + row_height, top_y, bottom_y
                )
                if new_page:
                    pdf.showPage()

                pdf.setFont("Helvetica-Bold", 14)
                pdf.drawString(margin, y, f"Worksheet: {ws.title}")
                y -= title_height

                pdf.setFont("Helvetica", 10)

                for row in ws.iter_rows(values_only=True):
                    # ``y`` is a running cursor, so rows continue from the top
                    # of the new page after a break instead of reusing their
                    # pre-break offsets.
                    y, new_page = OfficeToPDF._fit_on_page(y, row_height, top_y, bottom_y)
                    if new_page:
                        pdf.showPage()
                        pdf.setFont("Helvetica", 10)

                    for col_idx, value in enumerate(row[:max_cols]):
                        text = "" if value is None else str(value)
                        x = margin + col_idx * col_width
                        pdf.rect(x, y - row_height, col_width, row_height)
                        pdf.drawString(x + 5, y - row_height + 5, text[:max_cell_chars])

                    y -= row_height

                y -= sheet_gap

            pdf.save()

            logger.info(f"Excel to PDF conversion complete: {pdf_path}")
            return True

        except Exception as e:
            logger.error(f"Excel to PDF conversion failed: {str(e)}")
            return False

    @staticmethod
    def ppt_to_pdf(ppt_path: str, pdf_path: str) -> bool:
        """
        Convert PPT to PDF

        Every slide starts a new landscape letter page headed "Slide N",
        followed by the text of each shape that exposes text. Text that does
        not fit continues on an additional page. Graphics are not rendered.

        Args:
            ppt_path: Path to PPT file
            pdf_path: Path to output PDF file

        Returns:
            bool: Whether conversion was successful
        """
        try:
            logger.info(f"Starting PPT to PDF conversion: {ppt_path}")

            from reportlab.lib.pagesizes import landscape, letter
            from reportlab.lib.utils import simpleSplit
            from reportlab.pdfgen import canvas

            prs = Presentation(ppt_path)

            page_size = landscape(letter)
            pdf = canvas.Canvas(pdf_path, pagesize=page_size)

            text_x = 70
            top_y, bottom_y = 550, 50
            line_height = 20
            max_width = page_size[0] - text_x - 50

            for slide_num, slide in enumerate(prs.slides, 1):
                # Close the previous slide's page before starting this one.
                if slide_num > 1:
                    pdf.showPage()

                pdf.setFont("Helvetica-Bold", 16)
                pdf.drawString(50, top_y, f"Slide {slide_num}")

                pdf.setFont("Helvetica", 12)
                y = 500

                for shape in slide.shapes:
                    if not (hasattr(shape, "text") and shape.text):
                        continue
                    # python-pptx reports in-paragraph line breaks as vertical
                    # tabs; treat them like newlines.
                    for raw_line in shape.text.replace('\v', '\n').split('\n'):
                        if not raw_line.strip():
                            continue
                        for line in simpleSplit(raw_line, "Helvetica", 12, max_width):
                            y, new_page = OfficeToPDF._fit_on_page(y, 0, top_y, bottom_y)
                            if new_page:
                                pdf.showPage()
                                pdf.setFont("Helvetica", 12)
                            pdf.drawString(text_x, y, line)
                            y -= line_height

            pdf.save()

            logger.info(f"PPT to PDF conversion complete: {pdf_path}")
            return True

        except Exception as e:
            logger.error(f"PPT to PDF conversion failed: {str(e)}")
            return False


class FileConverter:
    """Main file converter class.

    Dispatches a conversion request to the matching ``PDFToOffice`` or
    ``OfficeToPDF`` method, selected by a short conversion-type string.
    """

    def __init__(self):
        self.pdf_to_office = PDFToOffice()
        self.office_to_pdf = OfficeToPDF()

    def convert(self, input_path: str, output_path: str, conversion_type: str) -> bool:
        """
        Perform file conversion

        Args:
            input_path: Path to input file
            output_path: Path to output file; missing parent directories
                are created
            conversion_type: Conversion type, supports:
                'pdf2word', 'pdf2excel', 'pdf2ppt',
                'word2pdf', 'excel2pdf', 'ppt2pdf'

        Returns:
            bool: Whether conversion was successful
        """
        # A directory "exists" too, but cannot be converted
        if not os.path.isfile(input_path):
            logger.error(f"Input file does not exist: {input_path}")
            return False

        converters = {
            'pdf2word': self.pdf_to_office.pdf_to_word,
            'pdf2excel': self.pdf_to_office.pdf_to_excel,
            'pdf2ppt': self.pdf_to_office.pdf_to_ppt,
            'word2pdf': self.office_to_pdf.word_to_pdf,
            'excel2pdf': self.office_to_pdf.excel_to_pdf,
            'ppt2pdf': self.office_to_pdf.ppt_to_pdf,
        }

        # Validate the request before touching the file system
        handler = converters.get(conversion_type)
        if handler is None:
            logger.error(f"Unsupported conversion type: {conversion_type}")
            return False

        output_dir = os.path.dirname(output_path)
        if output_dir:
            try:
                # exist_ok avoids the check-then-create race when several
                # conversions targeting the same directory run concurrently.
                os.makedirs(output_dir, exist_ok=True)
            except OSError as e:
                logger.error(f"Cannot create output directory {output_dir}: {e}")
                return False

        return handler(input_path, output_path)


def main():
    """Interactive command-line front end for ``FileConverter``.

    Prints the supported conversions, prompts for the input path, the
    output path and a menu number, runs the selected conversion and reports
    the outcome on stdout.
    """
    converter = FileConverter()

    # Single source of truth for the menu: (source, target, conversion type).
    # The banner, the numbered prompt and the number -> type lookup are all
    # derived from it so they cannot drift apart.
    conversions = [
        ("PDF", "Word", "pdf2word"),
        ("PDF", "Excel", "pdf2excel"),
        ("PDF", "PPT", "pdf2ppt"),
        ("Word", "PDF", "word2pdf"),
        ("Excel", "PDF", "excel2pdf"),
        ("PPT", "PDF", "ppt2pdf"),
    ]

    # Print usage instructions
    print("=" * 60)
    print("PDF/Office Conversion Tool")
    print("=" * 60)
    print("\nSupported conversion types:")
    for number, (source, target, conversion_type) in enumerate(conversions, 1):
        arrow = f"{source} → {target}"
        print(f"  {number}. {arrow:<13}({conversion_type})")
    print("\nUsage examples:")
    print("  converter.convert('input.pdf', 'output.docx', 'pdf2word')")
    print("  converter.convert('input.docx', 'output.pdf', 'word2pdf')")
    print("=" * 60)

    # Prompt user for input
    input_file = input("\nEnter input file path: ").strip()
    output_file = input("Enter output file path: ").strip()
    print("Select conversion type:")
    for number, (source, target, _) in enumerate(conversions, 1):
        print(f"{number}. {source} to {target}")

    choice = input("Enter number (1-6): ").strip()

    type_map = {
        str(number): conversion_type
        for number, (_, _, conversion_type) in enumerate(conversions, 1)
    }

    selected = type_map.get(choice)
    if selected is None:
        print("Invalid choice!")
        return

    if converter.convert(input_file, output_file, selected):
        print(f"\n✅ Conversion successful! Output file: {output_file}")
    else:
        print("\n❌ Conversion failed, please check the logs.")


# Start the interactive prompt only when this file is executed as a script,
# so importing FileConverter from another module does not trigger it.
if __name__ == "__main__":
    main()

3. Enhanced Converter (Using More Libraries)

If you need higher quality conversion, install these enhanced libraries:

# Better PDF processing
pip install pdf2docx==0.5.6  # PDF to Word
pip install camelot-py==0.11.0  # PDF table extraction
pip install tabula-py==2.8.0  # PDF table extraction

# Better Office to PDF
pip install comtypes==1.1.14  # Windows Office COM component usage
pip install pywin32  # Windows automation (provides the win32com module)

Enhanced code example:

"""
Enhanced Converter - Using More Specialized Libraries
"""

import os
import sys
from pathlib import Path

# Optional dependencies. Each guarded import only prints an installation hint
# when the package is missing; the imported name then stays undefined, so the
# feature that needs it fails at call time rather than at import time.

# Enhanced PDF to Word
try:
    from pdf2docx import Converter
except ImportError:
    print("Please install pdf2docx: pip install pdf2docx")

# Enhanced PDF table extraction
# NOTE(review): camelot is imported here but not used by the code in this section.
try:
    import camelot
except ImportError:
    print("Please install camelot-py: pip install camelot-py")

# Windows Office COM component usage (attempted on Windows only)
if sys.platform == 'win32':
    try:
        import win32com.client
    except ImportError:
        print("Please install pywin32: pip install pywin32")


class EnhancedConverter:
    """Higher-fidelity converters built on optional third-party back ends.

    Both methods print a message and return ``False`` on any failure,
    including a missing optional dependency.
    """

    @staticmethod
    def pdf_to_word_enhanced(pdf_path: str, word_path: str) -> bool:
        """
        High-quality PDF to Word conversion using pdf2docx

        Args:
            pdf_path: Path to PDF file
            word_path: Path to output Word file

        Returns:
            bool: Whether conversion was successful
        """
        try:
            # Imported here so a missing package is reported as a clear
            # conversion failure instead of an undefined name.
            from pdf2docx import Converter

            cv = Converter(pdf_path)
            try:
                cv.convert(word_path, start=0, end=None)
            finally:
                # Release the PDF even when the conversion raises
                cv.close()
            return True
        except Exception as e:
            print(f"Conversion failed: {e}")
            return False

    @staticmethod
    def word_to_pdf_enhanced(word_path: str, pdf_path: str) -> bool:
        """
        High-quality conversion using Word COM component on Windows

        Requires Microsoft Word and pywin32; on any other platform the
        method prints a notice and returns ``False``.

        Args:
            word_path: Path to Word file
            pdf_path: Path to output PDF file

        Returns:
            bool: Whether conversion was successful
        """
        if sys.platform != 'win32':
            print("This feature is only supported on Windows systems")
            return False

        try:
            import win32com.client

            word = win32com.client.Dispatch("Word.Application")
            try:
                word.Visible = False
                # COM needs absolute paths; Word does not share our cwd
                doc = word.Documents.Open(os.path.abspath(word_path))
                try:
                    doc.SaveAs(os.path.abspath(pdf_path), FileFormat=17)  # 17 = PDF format
                finally:
                    # False = close without saving changes to the source
                    doc.Close(False)
            finally:
                # Always shut Word down, otherwise a failed conversion leaves
                # an invisible Word process running.
                word.Quit()
            return True
        except Exception as e:
            print(f"Conversion failed: {e}")
            return False


# Usage example
# Both methods return a bool that is ignored here; on failure they only
# print a "Conversion failed" message.
if __name__ == "__main__":
    converter = EnhancedConverter()
    
    # PDF to Word
    converter.pdf_to_word_enhanced("input.pdf", "output.docx")
    
    # Word to PDF (Windows)
    converter.word_to_pdf_enhanced("input.docx", "output.pdf")

4. Batch Conversion Tool

"""
Batch File Conversion Tool
"""

import os
import glob
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed


class BatchConverter:
    """Batch converter class.

    Wraps any object exposing ``convert(input_path, output_path,
    conversion_type) -> bool`` and applies it to every matching file of a
    folder using a thread pool.
    """

    def __init__(self, converter):
        self.converter = converter

    def batch_convert(self, input_dir: str, output_dir: str,
                      input_ext: str, output_ext: str, conversion_type: str,
                      max_workers: int = 4) -> dict:
        """
        Batch convert all files in a folder

        Args:
            input_dir: Input folder
            output_dir: Output folder (created when missing)
            input_ext: Input file extension (e.g., '.pdf')
            output_ext: Output file extension (e.g., '.docx')
            conversion_type: Conversion type
            max_workers: Number of conversions run in parallel

        Returns:
            dict: Maps each input file path to ``True``/``False`` depending
            on whether its conversion succeeded; empty when no file matched.
        """
        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)

        # Escape the literal parts so folder names containing glob
        # metacharacters such as "[" or "*" are matched verbatim.
        pattern = os.path.join(glob.escape(input_dir), f"*{glob.escape(input_ext)}")
        # Sorted for a stable submission order; directories that happen to
        # match the pattern are skipped.
        input_files = sorted(path for path in glob.glob(pattern) if os.path.isfile(path))

        if not input_files:
            print(f"No {input_ext} files found in {input_dir}")
            return {}

        print(f"Found {len(input_files)} files, starting conversion...")

        results = {}

        # Use thread pool for parallel conversion
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            jobs = {}
            for input_file in input_files:
                output_file = os.path.join(output_dir, f"{Path(input_file).stem}{output_ext}")
                future = executor.submit(
                    self.converter.convert, input_file, output_file, conversion_type
                )
                jobs[future] = (input_file, output_file)

            # Report each file as soon as it finishes instead of waiting
            # behind a slow earlier file.
            for future in as_completed(jobs):
                input_file, output_file = jobs[future]
                try:
                    success = bool(future.result())
                    status = "✓" if success else "✗"
                    print(f"[{status}] {Path(input_file).name} -> {Path(output_file).name}")
                except Exception as e:
                    success = False
                    print(f"[✗] {Path(input_file).name} conversion failed: {e}")
                results[input_file] = success

        return results


# Usage example
# NOTE(review): the import below assumes the conversion tool from section 2
# was saved as main.py next to this script — adjust the module name otherwise.
if __name__ == "__main__":
    from main import FileConverter
    
    converter = FileConverter()
    batch = BatchConverter(converter)
    
    # Batch convert PDF to Word
    batch.batch_convert(
        input_dir="./pdf_files",
        output_dir="./word_files",
        input_ext=".pdf",
        output_ext=".docx",
        conversion_type="pdf2word"
    )
    
    # Batch convert Word to PDF: reads the .docx files written to
    # ./word_files by the previous step
    batch.batch_convert(
        input_dir="./word_files",
        output_dir="./pdf_output",
        input_ext=".docx",
        output_ext=".pdf",
        conversion_type="word2pdf"
    )

5. Usage Instructions

  1. Install Dependencies:

    pip install PyPDF2 pdf2image pdfplumber python-docx openpyxl python-pptx pillow reportlab
  2. Basic Usage:

    from main import FileConverter  # assumes the section 2 code is saved as main.py, as in the batch example
    
    converter = FileConverter()
    
    # PDF to Word
    converter.convert("input.pdf", "output.docx", "pdf2word")
    
    # Word to PDF
    converter.convert("input.docx", "output.pdf", "word2pdf")
  3. Important Notes:

    • These libraries are all free and open-source
    • Conversion quality may not match commercial software
    • For complex document formats, consider using the enhanced version or commercial solutions
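    • The Office → PDF converters draw text with reportlab's built-in Helvetica font, which has no Chinese glyphs; to render Chinese text, register a Chinese TrueType font (reportlab's TTFont with pdfmetrics.registerFont) and select it with setFont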