PDF/Office Converter Python Tutorial
11 min
| note
PDF/Office Conversion Tool Python Tutorial
1. Environment Setup
First, install the necessary libraries:
# PDF operation libraries
pip install PyPDF2==3.0.0
pip install pdf2image==1.16.0
pip install pdfplumber==0.10.0
# Office operation libraries
pip install python-docx==1.1.0
pip install openpyxl==3.1.2
pip install python-pptx==0.6.21
# Image processing
pip install pillow==10.0.0
# For PDF to image conversion and OCR
pip install pytesseract==0.3.10

2. Complete Conversion Tool Code
"""
PDF/Office Conversion Tool
Supported formats: PDF, Word, Excel, PPT
Implemented using free open-source libraries
"""
import os
import sys
from pathlib import Path
import logging
from typing import Union, Optional
# PDF processing
import PyPDF2
from pdf2image import convert_from_path
import pdfplumber
# Office processing
from docx import Document
from docx.shared import Inches
import openpyxl
from openpyxl import load_workbook
from pptx import Presentation
from PIL import Image
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class PDFToOffice:
    """PDF to Office conversion utility class.

    All converters are static methods. Each one reads the PDF with
    pdfplumber, writes the target Office file, and returns ``True`` on
    success. Any exception is logged through the module ``logger`` and
    reported as ``False`` instead of being raised.
    """

    @staticmethod
    def pdf_to_word(pdf_path: str, word_path: str) -> bool:
        """
        Convert PDF to Word document

        Every PDF page becomes a level-2 "Page N" heading followed by the
        page's extracted text and one Word table per extracted table. Pages
        are separated by page breaks.

        Args:
            pdf_path: Path to PDF file
            word_path: Path to output Word file
        Returns:
            bool: Whether conversion was successful
        """
        try:
            logger.info(f"Starting PDF to Word conversion: {pdf_path}")
            # Create Word document
            doc = Document()
            # Extract text using pdfplumber
            with pdfplumber.open(pdf_path) as pdf:
                total_pages = len(pdf.pages)
                for page_num, page in enumerate(pdf.pages, 1):
                    # Add page number heading
                    doc.add_heading(f'Page {page_num}', level=2)
                    # Extract text
                    text = page.extract_text()
                    if text:
                        doc.add_paragraph(text)
                    # Extract tables
                    for table in page.extract_tables():
                        # Size the grid from the widest row: sizing it from
                        # table[0] made cell(i, j) fail on any longer row.
                        col_count = max((len(row) for row in table), default=0)
                        if not col_count:
                            continue
                        word_table = doc.add_table(rows=len(table), cols=col_count)
                        for i, row in enumerate(table):
                            for j, cell in enumerate(row):
                                if cell:
                                    word_table.cell(i, j).text = str(cell)
                        doc.add_paragraph()  # Add empty line
                    # Page break (not after the last page)
                    if page_num < total_pages:
                        doc.add_page_break()
            # Save Word document
            doc.save(word_path)
            logger.info(f"PDF to Word conversion complete: {word_path}")
            return True
        except Exception as e:
            logger.error(f"PDF to Word conversion failed: {str(e)}")
            return False

    @staticmethod
    def pdf_to_excel(pdf_path: str, excel_path: str) -> bool:
        """
        Convert PDF to Excel (mainly extract tables)

        One worksheet named "PageN" is created per PDF page. The page's
        tables are written one below the other (separated by a blank row),
        followed by the page's extracted text.

        Args:
            pdf_path: Path to PDF file
            excel_path: Path to output Excel file
        Returns:
            bool: Whether conversion was successful
        """
        try:
            logger.info(f"Starting PDF to Excel conversion: {pdf_path}")
            # Create Excel workbook
            wb = openpyxl.Workbook()
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    # Create a worksheet for each page
                    ws = wb.create_sheet(title=f"Page{page_num}")
                    # Running row cursor: without it every table started at
                    # row 1 again and overwrote the previous one.
                    next_row = 1
                    for table in page.extract_tables():
                        for row in table:
                            for col_idx, cell in enumerate(row, 1):
                                if cell:
                                    ws.cell(row=next_row, column=col_idx, value=str(cell))
                            next_row += 1
                        next_row += 1  # blank spacer row after each table
                    # Extract text and place it below everything written so far
                    text = page.extract_text()
                    if text:
                        ws.cell(row=next_row + 1, column=1, value="Extracted text:")
                        ws.cell(row=next_row + 2, column=1, value=text)
            # Delete the default worksheet, but only when page sheets exist so
            # the workbook never ends up with no sheet at all.
            if "Sheet" in wb.sheetnames and len(wb.sheetnames) > 1:
                wb.remove(wb["Sheet"])
            wb.save(excel_path)
            logger.info(f"PDF to Excel conversion complete: {excel_path}")
            return True
        except Exception as e:
            logger.error(f"PDF to Excel conversion failed: {str(e)}")
            return False

    @staticmethod
    def pdf_to_ppt(pdf_path: str, ppt_path: str) -> bool:
        """
        Convert PDF to PPT (each page as a slide)

        Each slide gets the title "Page N" and a text box holding at most
        the first 500 characters of the page's extracted text.

        Args:
            pdf_path: Path to PDF file
            ppt_path: Path to output PPT file
        Returns:
            bool: Whether conversion was successful
        """
        max_chars = 500  # Limit text length per slide
        try:
            logger.info(f"Starting PDF to PPT conversion: {pdf_path}")
            # Create PPT
            prs = Presentation()
            # Index 5 is the "Title Only" layout of the default template (the
            # blank layout has no title placeholder to fill).
            slide_layout = prs.slide_layouts[5]
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    # Add slide
                    slide = prs.slides.add_slide(slide_layout)
                    # Add title
                    title = slide.shapes.title
                    if title is not None:
                        title.text = f"Page {page_num}"
                    # Add text content
                    text = page.extract_text()
                    if text:
                        textbox = slide.shapes.add_textbox(
                            left=Inches(1), top=Inches(1.5),
                            width=Inches(8), height=Inches(5)
                        )
                        text_frame = textbox.text_frame
                        # Wrap inside the box instead of running off the slide
                        text_frame.word_wrap = True
                        if len(text) > max_chars:
                            logger.warning(
                                f"Page {page_num}: text truncated to {max_chars} characters"
                            )
                        text_frame.text = text[:max_chars]
            prs.save(ppt_path)
            logger.info(f"PDF to PPT conversion complete: {ppt_path}")
            return True
        except Exception as e:
            logger.error(f"PDF to PPT conversion failed: {str(e)}")
            return False
class OfficeToPDF:
    """Office to PDF conversion utility class.

    All converters are static methods that draw the document's plain text
    onto PDF pages with reportlab (imported lazily inside each method).
    Each returns ``True`` on success; any exception is logged through the
    module ``logger`` and reported as ``False``.
    """

    @staticmethod
    def word_to_pdf(word_path: str, pdf_path: str) -> bool:
        """
        Convert Word to PDF (via print method)

        Paragraph text is drawn line by line on letter-size pages (each line
        cut to 100 characters); the intermediate PDF is then copied into the
        output file through PyPDF2.

        Args:
            word_path: Path to Word file
            pdf_path: Path to output PDF file
        Returns:
            bool: Whether conversion was successful
        """
        import tempfile

        try:
            logger.info(f"Starting Word to PDF conversion: {word_path}")
            # Read Word document
            doc = Document(word_path)
            # Create PDF writer
            pdf_writer = PyPDF2.PdfWriter()
            # Since PyPDF2 doesn't support direct text writing, create a simple PDF
            # In practice, consider using libraries like reportlab for more complex PDFs
            from reportlab.pdfgen import canvas
            from reportlab.lib.pagesizes import letter
            # Unique temporary PDF: a fixed file name in the working directory
            # collides when several conversions run at the same time.
            fd, temp_pdf = tempfile.mkstemp(suffix=".pdf")
            os.close(fd)
            try:
                c = canvas.Canvas(temp_pdf, pagesize=letter)
                y_position = 750  # Starting Y position
                line_height = 15  # Line height
                # Add document content
                for paragraph in doc.paragraphs:
                    if not paragraph.text:
                        continue
                    # Handle line breaks
                    for line in paragraph.text.split('\n'):
                        if line.strip():
                            c.drawString(50, y_position, line[:100])  # Limit line length
                            y_position -= line_height
                            # If page is full, create new page
                            if y_position < 50:
                                c.showPage()
                                y_position = 750
                c.save()
                # Read temporary PDF and add to final PDF; the output is written
                # while the source stream is still open.
                with open(temp_pdf, 'rb') as src:
                    for page in PyPDF2.PdfReader(src).pages:
                        pdf_writer.add_page(page)
                    # Save final PDF
                    with open(pdf_path, 'wb') as dst:
                        pdf_writer.write(dst)
            finally:
                # Delete temporary file even when the conversion failed
                if os.path.exists(temp_pdf):
                    os.remove(temp_pdf)
            logger.info(f"Word to PDF conversion complete: {pdf_path}")
            return True
        except Exception as e:
            logger.error(f"Word to PDF conversion failed: {str(e)}")
            return False

    @staticmethod
    def excel_to_pdf(excel_path: str, pdf_path: str) -> bool:
        """
        Convert Excel to PDF

        Each worksheet is rendered as a title line followed by a bordered
        grid on landscape letter pages. Only the first 8 columns are drawn
        and each cell shows at most 15 characters.

        Args:
            excel_path: Path to Excel file
            pdf_path: Path to output PDF file
        Returns:
            bool: Whether conversion was successful
        """
        try:
            logger.info(f"Starting Excel to PDF conversion: {excel_path}")
            # Read Excel file
            wb = load_workbook(excel_path)
            # Create PDF using reportlab
            from reportlab.pdfgen import canvas
            from reportlab.lib.pagesizes import landscape, letter
            c = canvas.Canvas(pdf_path, pagesize=landscape(letter))
            # Landscape letter is 792 x 612 pt, so the top of the content area
            # must stay below 612 (resetting to 750 drew off the page).
            top_y = 550
            bottom_margin = 50
            line_height = 20
            col_width = 80
            row_height = 15
            y_position = top_y
            # Iterate through each worksheet
            for sheet_name in wb.sheetnames:
                ws = wb[sheet_name]
                # Start a new page when title plus one row no longer fit
                if y_position < 100:
                    c.showPage()
                    y_position = top_y
                # Add worksheet title
                c.setFont("Helvetica-Bold", 14)
                c.drawString(50, y_position, f"Worksheet: {sheet_name}")
                y_position -= line_height + 10
                # Set table font
                c.setFont("Helvetica", 10)
                # Draw table row by row with a running y cursor so rows keep
                # flowing from the top after every page break.
                for row in ws.iter_rows(values_only=True):
                    if y_position - row_height < bottom_margin:
                        c.showPage()
                        c.setFont("Helvetica", 10)  # re-apply font on the new page
                        y_position = top_y
                    for col_idx, value in enumerate(row[:8]):  # Limit columns
                        cell = str(value) if value is not None else ""
                        x = 50 + col_idx * col_width
                        # Draw cell border
                        c.rect(x, y_position - row_height, col_width, row_height)
                        # Write cell content
                        c.drawString(x + 5, y_position - row_height + 5, cell[:15])  # Limit text length
                    y_position -= row_height
                # Gap before the next worksheet
                y_position -= 30
            c.save()
            logger.info(f"Excel to PDF conversion complete: {pdf_path}")
            return True
        except Exception as e:
            logger.error(f"Excel to PDF conversion failed: {str(e)}")
            return False

    @staticmethod
    def ppt_to_pdf(ppt_path: str, pdf_path: str) -> bool:
        """
        Convert PPT to PDF

        Each slide starts a landscape letter page headed "Slide N" followed
        by the text of its shapes (each line cut to 100 characters). Text
        that does not fit continues on an extra page.

        Args:
            ppt_path: Path to PPT file
            pdf_path: Path to output PDF file
        Returns:
            bool: Whether conversion was successful
        """
        try:
            logger.info(f"Starting PPT to PDF conversion: {ppt_path}")
            # Read PPT file
            prs = Presentation(ppt_path)
            # Create PDF using reportlab
            from reportlab.pdfgen import canvas
            from reportlab.lib.pagesizes import landscape, letter
            c = canvas.Canvas(pdf_path, pagesize=landscape(letter))
            slide_count = len(prs.slides)
            for slide_num, slide in enumerate(prs.slides, 1):
                # Add slide title
                c.setFont("Helvetica-Bold", 16)
                c.drawString(50, 550, f"Slide {slide_num}")
                # Add slide content
                c.setFont("Helvetica", 12)
                y_position = 500
                # Extract text from slide
                for shape in slide.shapes:
                    if not (hasattr(shape, "text") and shape.text):
                        continue
                    for line in shape.text.split('\n'):
                        if not line.strip():
                            continue
                        # Continue on a fresh page instead of drawing below
                        # the bottom margin.
                        if y_position < 50:
                            c.showPage()
                            c.setFont("Helvetica", 12)
                            y_position = 550
                        c.drawString(70, y_position, line[:100])
                        y_position -= 20
                # Add new page (except for last page)
                if slide_num < slide_count:
                    c.showPage()
            c.save()
            logger.info(f"PPT to PDF conversion complete: {pdf_path}")
            return True
        except Exception as e:
            logger.error(f"PPT to PDF conversion failed: {str(e)}")
            return False
class FileConverter:
    """Main file converter class

    Dispatches a conversion-type string to the matching PDFToOffice or
    OfficeToPDF method.
    """

    def __init__(self):
        self.pdf_to_office = PDFToOffice()
        self.office_to_pdf = OfficeToPDF()

    def convert(self, input_path: str, output_path: str, conversion_type: str) -> bool:
        """
        Perform file conversion

        The input path and conversion type are validated before anything is
        created on disk; the output directory is created only for a request
        that will actually run.

        Args:
            input_path: Path to input file
            output_path: Path to output file
            conversion_type: Conversion type, supports:
                'pdf2word', 'pdf2excel', 'pdf2ppt',
                'word2pdf', 'excel2pdf', 'ppt2pdf'
        Returns:
            bool: Whether conversion was successful
        """
        # Check if input file exists
        if not os.path.exists(input_path):
            logger.error(f"Input file does not exist: {input_path}")
            return False
        # A directory passes the existence check but cannot be converted
        if not os.path.isfile(input_path):
            logger.error(f"Input path is not a file: {input_path}")
            return False
        # Execute conversion based on type
        converters = {
            'pdf2word': self.pdf_to_office.pdf_to_word,
            'pdf2excel': self.pdf_to_office.pdf_to_excel,
            'pdf2ppt': self.pdf_to_office.pdf_to_ppt,
            'word2pdf': self.office_to_pdf.word_to_pdf,
            'excel2pdf': self.office_to_pdf.excel_to_pdf,
            'ppt2pdf': self.office_to_pdf.ppt_to_pdf,
        }
        handler = converters.get(conversion_type)
        if handler is None:
            logger.error(f"Unsupported conversion type: {conversion_type}")
            return False
        # Create output directory. exist_ok avoids the check-then-create race
        # when several conversions target the same new directory in parallel.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            try:
                os.makedirs(output_dir, exist_ok=True)
            except OSError as e:
                logger.error(f"Cannot create output directory {output_dir}: {e}")
                return False
        return handler(input_path, output_path)
def main():
    """Main function

    Prints the supported conversion types, prompts on stdin for the input
    path, output path and a menu number (1-6), runs the selected conversion
    through FileConverter and prints whether it succeeded.
    """
    converter = FileConverter()
    # Menu number -> conversion type understood by FileConverter.convert
    type_map = {
        '1': 'pdf2word',
        '2': 'pdf2excel',
        '3': 'pdf2ppt',
        '4': 'word2pdf',
        '5': 'excel2pdf',
        '6': 'ppt2pdf',
    }
    # Print usage instructions
    print("=" * 60)
    print("PDF/Office Conversion Tool")
    print("=" * 60)
    print("\nSupported conversion types:")
    print(" 1. PDF → Word (pdf2word)")
    print(" 2. PDF → Excel (pdf2excel)")
    print(" 3. PDF → PPT (pdf2ppt)")
    print(" 4. Word → PDF (word2pdf)")
    print(" 5. Excel → PDF (excel2pdf)")
    print(" 6. PPT → PDF (ppt2pdf)")
    print("\nUsage examples:")
    print(" converter.convert('input.pdf', 'output.docx', 'pdf2word')")
    print(" converter.convert('input.docx', 'output.pdf', 'word2pdf')")
    print("=" * 60)
    # Prompt user for input
    input_file = input("\nEnter input file path: ").strip()
    output_file = input("Enter output file path: ").strip()
    print("Select conversion type:")
    print("1. PDF to Word")
    print("2. PDF to Excel")
    print("3. PDF to PPT")
    print("4. Word to PDF")
    print("5. Excel to PDF")
    print("6. PPT to PDF")
    choice = input("Enter number (1-6): ").strip()
    if choice not in type_map:
        print("Invalid choice!")
        return
    success = converter.convert(input_file, output_file, type_map[choice])
    if success:
        print(f"\n✅ Conversion successful! Output file: {output_file}")
    else:
        print("\n❌ Conversion failed, please check the logs.")
if __name__ == "__main__":
main()

3. Enhanced Converter (Using More Libraries)
If you need higher quality conversion, install these enhanced libraries:
# Better PDF processing
pip install pdf2docx==0.5.6 # PDF to Word
pip install camelot-py==0.11.0 # PDF table extraction
pip install tabula-py==2.8.0 # PDF table extraction
# Better Office to PDF
pip install comtypes==1.1.14 # Windows Office COM component usage
pip install pywin32 # Windows automation (provides the win32com module)

Enhanced code example:
"""
Enhanced Converter - Using More Specialized Libraries
"""
import os
import sys
from pathlib import Path
# Enhanced PDF to Word
try:
from pdf2docx import Converter
except ImportError:
print("Please install pdf2docx: pip install pdf2docx")
# Enhanced PDF table extraction
try:
import camelot
except ImportError:
print("Please install camelot-py: pip install camelot-py")
# Windows Office COM component usage
if sys.platform == 'win32':
try:
import win32com.client
except ImportError:
print("Please install pywin32: pip install pywin32")
class EnhancedConverter:
    """Enhanced converter class

    Static helpers built on pdf2docx and, on Windows, the Word COM
    component. Both return ``True`` on success and print the error and
    return ``False`` on failure.
    """

    @staticmethod
    def pdf_to_word_enhanced(pdf_path: str, word_path: str) -> bool:
        """
        High-quality PDF to Word conversion using pdf2docx

        Args:
            pdf_path: Path to PDF file
            word_path: Path to output Word file
        Returns:
            bool: Whether conversion was successful
        """
        try:
            cv = Converter(pdf_path)
            try:
                cv.convert(word_path, start=0, end=None)
            finally:
                # Close the converter even when convert() raises
                cv.close()
            return True
        except Exception as e:
            print(f"Conversion failed: {e}")
            return False

    @staticmethod
    def word_to_pdf_enhanced(word_path: str, pdf_path: str) -> bool:
        """
        High-quality conversion using Word COM component on Windows

        Args:
            word_path: Path to Word file
            pdf_path: Path to output PDF file
        Returns:
            bool: Whether conversion was successful (always False on
            non-Windows platforms)
        """
        if sys.platform != 'win32':
            print("This feature is only supported on Windows systems")
            return False
        try:
            word = win32com.client.Dispatch("Word.Application")
            try:
                word.Visible = False
                doc = word.Documents.Open(os.path.abspath(word_path))
                try:
                    doc.SaveAs(os.path.abspath(pdf_path), FileFormat=17)  # 17 = PDF format
                finally:
                    doc.Close()
            finally:
                # Always quit so a failed conversion does not leave the
                # hidden Word instance running.
                word.Quit()
            return True
        except Exception as e:
            print(f"Conversion failed: {e}")
            return False
# Usage example
if __name__ == "__main__":
converter = EnhancedConverter()
# PDF to Word
converter.pdf_to_word_enhanced("input.pdf", "output.docx")
# Word to PDF (Windows)
converter.word_to_pdf_enhanced("input.docx", "output.pdf")

4. Batch Conversion Tool
"""
Batch File Conversion Tool
"""
import os
import glob
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
class BatchConverter:
    """Batch converter class

    Wraps any object that exposes
    ``convert(input_path, output_path, conversion_type)`` and applies it to
    every matching file of a directory through a thread pool.
    """

    def __init__(self, converter):
        self.converter = converter

    def batch_convert(self, input_dir: str, output_dir: str,
                      input_ext: str, output_ext: str, conversion_type: str):
        """
        Batch convert all files in a folder

        Prints one status line per file; returns nothing.

        Args:
            input_dir: Input folder
            output_dir: Output folder
            input_ext: Input file extension (e.g., '.pdf')
            output_ext: Output file extension (e.g., '.docx')
            conversion_type: Conversion type
        """
        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)
        # Get all input files. The directory part is escaped so that folder
        # names containing glob metacharacters ('[', '*', '?') still match;
        # sorting makes the processing/report order deterministic.
        pattern = os.path.join(glob.escape(input_dir), f"*{input_ext}")
        input_files = sorted(glob.glob(pattern))
        if not input_files:
            print(f"No {input_ext} files found in {input_dir}")
            return
        print(f"Found {len(input_files)} files, starting conversion...")
        # Use thread pool for parallel conversion
        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = []
            for input_file in input_files:
                # Generate output filename
                base_name = Path(input_file).stem
                output_file = os.path.join(output_dir, f"{base_name}{output_ext}")
                # Submit conversion task
                future = executor.submit(
                    self.converter.convert,
                    input_file,
                    output_file,
                    conversion_type,
                )
                futures.append((future, input_file, output_file))
            # Report results in submission order, waiting for each task
            for future, input_file, output_file in futures:
                try:
                    success = future.result()
                    status = "✓" if success else "✗"
                    print(f"[{status}] {Path(input_file).name} -> {Path(output_file).name}")
                except Exception as e:
                    print(f"[✗] {Path(input_file).name} conversion failed: {e}")
# Usage example
if __name__ == "__main__":
from main import FileConverter
converter = FileConverter()
batch = BatchConverter(converter)
# Batch convert PDF to Word
batch.batch_convert(
input_dir="./pdf_files",
output_dir="./word_files",
input_ext=".pdf",
output_ext=".docx",
conversion_type="pdf2word"
)
# Batch convert Word to PDF
batch.batch_convert(
input_dir="./word_files",
output_dir="./pdf_output",
input_ext=".docx",
output_ext=".pdf",
conversion_type="word2pdf"
)

5. Usage Instructions
Install Dependencies:
pip install PyPDF2 pdf2image pdfplumber python-docx openpyxl python-pptx pillow reportlab

Basic Usage:
from converter import FileConverter

converter = FileConverter()
# PDF to Word
converter.convert("input.pdf", "output.docx", "pdf2word")
# Word to PDF
converter.convert("input.docx", "output.pdf", "word2pdf")

Important Notes:
- These libraries are all free and open-source
- Conversion quality may not match commercial software
- For complex document formats, consider using the enhanced version or commercial solutions
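- When processing Chinese text, the Office-to-PDF converters draw with the built-in Helvetica font, which cannot render Chinese characters; install a Chinese TrueType font and register it with reportlab before drawing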