PDF/Office互转工具 Python教程
13 min
| 笔记
PDF/Office互转工具 Python教程
1. 环境准备
首先安装必要的库:
# PDF操作相关库
pip install PyPDF2==3.0.0
pip install pdf2image==1.16.0
pip install pdfplumber==0.10.0
# Office操作相关库
pip install python-docx==1.1.0
pip install openpyxl==3.1.2
pip install python-pptx==0.6.21
# 图像处理
pip install pillow==10.0.0
# 用于PDF转图片和OCR
pip install pytesseract==0.3.10
2. 完整的转换工具代码
"""
PDF/Office 互转换工具
支持格式:PDF, Word, Excel, PPT
使用免费开源库实现
"""
import os
import sys
from pathlib import Path
import logging
from typing import Union, Optional
# PDF处理
import PyPDF2
from pdf2image import convert_from_path
import pdfplumber
# Office处理
from docx import Document
from docx.shared import Inches
import openpyxl
from openpyxl import load_workbook
from pptx import Presentation
from PIL import Image
# 设置日志
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class PDFToOffice:
    """Convert PDF files into Word, Excel and PowerPoint documents.

    Every converter is a static method that returns ``True`` on success and
    ``False`` on failure; exceptions are logged instead of propagated.
    """

    @staticmethod
    def _table_cells(tables: list, start_row: int = 1) -> tuple:
        """Lay extracted tables out one below another on a worksheet grid.

        Args:
            tables: A list of tables, each a list of rows, each a list of
                cell values (the structure ``pdf_to_excel`` receives from
                ``page.extract_tables()``).
            start_row: 1-based row at which the first table row is placed.

        Returns:
            tuple: ``(cells, next_row)``. ``cells`` is a list of
            ``(row, column, text)`` triples for the non-empty cells, and
            ``next_row`` is the first row after the last table and its
            trailing blank separator row.
        """
        cells = []
        row_cursor = start_row
        for table in tables:
            if not table:
                continue
            for row in table:
                for col_idx, cell in enumerate(row, 1):
                    if cell:
                        cells.append((row_cursor, col_idx, str(cell)))
                row_cursor += 1
            # Leave one empty row so consecutive tables stay distinguishable.
            row_cursor += 1
        return cells, row_cursor

    @staticmethod
    def pdf_to_word(pdf_path: str, word_path: str) -> bool:
        """Convert a PDF into a Word document.

        Each PDF page becomes a level-2 heading followed by the page text and
        any tables found on the page; pages are separated by page breaks.

        Args:
            pdf_path: Path of the PDF file to read.
            word_path: Path of the Word file to write.

        Returns:
            bool: ``True`` if the document was saved, ``False`` otherwise.
        """
        try:
            logger.info(f"开始转换PDF到Word: {pdf_path}")
            doc = Document()
            with pdfplumber.open(pdf_path) as pdf:
                total_pages = len(pdf.pages)
                for page_num, page in enumerate(pdf.pages, 1):
                    # Page-number heading.
                    doc.add_heading(f'第 {page_num} 页', level=2)

                    text = page.extract_text()
                    if text:
                        doc.add_paragraph(text)

                    for table in page.extract_tables():
                        if not table:
                            continue
                        # Size the Word table by the widest row so a ragged
                        # table cannot index past the last column.
                        col_count = max(len(row) for row in table)
                        if col_count == 0:
                            continue
                        word_table = doc.add_table(rows=len(table), cols=col_count)
                        for i, row in enumerate(table):
                            for j, cell in enumerate(row):
                                if cell:
                                    word_table.cell(i, j).text = str(cell)
                        doc.add_paragraph()  # blank line after each table

                    # Page break between pages, but not after the last one.
                    if page_num < total_pages:
                        doc.add_page_break()

            doc.save(word_path)
            logger.info(f"PDF转换Word完成: {word_path}")
            return True
        except Exception as e:
            logger.error(f"PDF转Word失败: {str(e)}")
            return False

    @staticmethod
    def pdf_to_excel(pdf_path: str, excel_path: str) -> bool:
        """Convert a PDF into an Excel workbook (mainly its tables).

        One worksheet is created per PDF page. Tables are stacked vertically
        with a blank row between them, and the page text is written below
        the tables.

        Args:
            pdf_path: Path of the PDF file to read.
            excel_path: Path of the Excel file to write.

        Returns:
            bool: ``True`` if the workbook was saved, ``False`` otherwise.
        """
        try:
            logger.info(f"开始转换PDF到Excel: {pdf_path}")
            wb = openpyxl.Workbook()
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    ws = wb.create_sheet(title=f"第{page_num}页")

                    # Use a running row cursor so a second table does not
                    # overwrite the first one.
                    cells, next_row = PDFToOffice._table_cells(
                        page.extract_tables() or []
                    )
                    for row_idx, col_idx, value in cells:
                        ws.cell(row=row_idx, column=col_idx, value=value)

                    # Text goes below all table rows. With no tables this
                    # is rows 2 and 3, as before.
                    text = page.extract_text()
                    if text:
                        ws.cell(row=next_row + 1, column=1, value="提取的文本:")
                        ws.cell(row=next_row + 2, column=1, value=text)

            # Drop the default sheet, but only if a page sheet exists;
            # a workbook with no sheets cannot be saved.
            if "Sheet" in wb.sheetnames and len(wb.sheetnames) > 1:
                wb.remove(wb["Sheet"])

            wb.save(excel_path)
            logger.info(f"PDF转换Excel完成: {excel_path}")
            return True
        except Exception as e:
            logger.error(f"PDF转Excel失败: {str(e)}")
            return False

    @staticmethod
    def pdf_to_ppt(pdf_path: str, ppt_path: str) -> bool:
        """Convert a PDF into a presentation, one slide per page.

        Each slide gets a page-number title (when the layout has a title
        placeholder) and a text box with the first 500 characters of the
        page text.

        Args:
            pdf_path: Path of the PDF file to read.
            ppt_path: Path of the PPT file to write.

        Returns:
            bool: ``True`` if the presentation was saved, ``False`` otherwise.
        """
        try:
            logger.info(f"开始转换PDF到PPT: {pdf_path}")
            prs = Presentation()
            # Layout 5 is used because the code below fills its title.
            # NOTE(review): in python-pptx's default template this is the
            # "Title Only" layout, not a blank one - confirm.
            slide_layout = prs.slide_layouts[5]
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    slide = prs.slides.add_slide(slide_layout)

                    title = slide.shapes.title
                    if title:
                        title.text = f"第 {page_num} 页"

                    text = page.extract_text()
                    if text:
                        # NOTE(review): Inches is imported from docx.shared
                        # in this file; it is assumed to use the same length
                        # unit as python-pptx - confirm.
                        textbox = slide.shapes.add_textbox(
                            left=Inches(1), top=Inches(1.5),
                            width=Inches(8), height=Inches(5)
                        )
                        text_frame = textbox.text_frame
                        text_frame.text = text[:500]  # cap the text length

            prs.save(ppt_path)
            logger.info(f"PDF转换PPT完成: {ppt_path}")
            return True
        except Exception as e:
            logger.error(f"PDF转PPT失败: {str(e)}")
            return False
class OfficeToPDF:
    """Convert Word, Excel and PowerPoint documents into simple text PDFs.

    Every converter is a static method that returns ``True`` on success and
    ``False`` on failure; exceptions are logged instead of propagated.
    The PDFs are drawn with reportlab and contain plain text only.

    NOTE(review): all text is drawn with the Helvetica fonts, which
    presumably cannot render the Chinese headings written below - confirm
    and register a CJK font if needed.
    """

    @staticmethod
    def word_to_pdf(word_path: str, pdf_path: str) -> bool:
        """Convert a Word document into a PDF.

        Paragraph text is drawn line by line on letter-size pages; each line
        is truncated to 100 characters.

        Args:
            word_path: Path of the Word file to read.
            pdf_path: Path of the PDF file to write.

        Returns:
            bool: ``True`` if the PDF was written, ``False`` otherwise.
        """
        import tempfile

        temp_pdf = None
        try:
            logger.info(f"开始转换Word到PDF: {word_path}")
            doc = Document(word_path)

            # Imported inside the try block so a missing reportlab is
            # reported as a failed conversion.
            from reportlab.pdfgen import canvas
            from reportlab.lib.pagesizes import letter

            # A unique temp file per call: a fixed name in the working
            # directory would be overwritten by concurrent conversions.
            fd, temp_pdf = tempfile.mkstemp(suffix=".pdf")
            os.close(fd)

            c = canvas.Canvas(temp_pdf, pagesize=letter)
            y_position = 750  # starting Y position
            line_height = 15

            for paragraph in doc.paragraphs:
                if not paragraph.text:
                    continue
                for line in paragraph.text.split('\n'):
                    if not line.strip():
                        continue
                    c.drawString(50, y_position, line[:100])  # cap line length
                    y_position -= line_height
                    # Start a new page once the current one is full.
                    if y_position < 50:
                        c.showPage()
                        y_position = 750
            c.save()

            # Copy the temporary PDF into the final file. The output is
            # written while the source is still open, so the reader can
            # still load page data on demand.
            pdf_writer = PyPDF2.PdfWriter()
            with open(temp_pdf, 'rb') as src:
                pdf = PyPDF2.PdfReader(src)
                for page in pdf.pages:
                    pdf_writer.add_page(page)
                with open(pdf_path, 'wb') as dst:
                    pdf_writer.write(dst)

            logger.info(f"Word转换PDF完成: {pdf_path}")
            return True
        except Exception as e:
            logger.error(f"Word转PDF失败: {str(e)}")
            return False
        finally:
            # Remove the temp file on both success and failure.
            if temp_pdf is not None:
                try:
                    os.remove(temp_pdf)
                except OSError:
                    logger.warning(f"无法删除临时文件: {temp_pdf}")

    @staticmethod
    def excel_to_pdf(excel_path: str, pdf_path: str) -> bool:
        """Convert an Excel workbook into a PDF.

        Each worksheet is rendered as a titled grid on landscape letter
        pages. At most 8 columns are drawn and each cell is truncated to
        15 characters.

        Args:
            excel_path: Path of the Excel file to read.
            pdf_path: Path of the PDF file to write.

        Returns:
            bool: ``True`` if the PDF was written, ``False`` otherwise.
        """
        try:
            logger.info(f"开始转换Excel到PDF: {excel_path}")
            wb = load_workbook(excel_path)

            from reportlab.pdfgen import canvas
            from reportlab.lib.pagesizes import landscape, letter

            # Landscape letter is 612pt tall, so the top must stay at 550;
            # resetting to 750 would draw above the page.
            page_top = 550
            bottom_margin = 50
            line_height = 20
            col_width = 80
            row_height = 15

            c = canvas.Canvas(pdf_path, pagesize=landscape(letter))
            y_position = page_top

            for sheet_name in wb.sheetnames:
                ws = wb[sheet_name]

                # Worksheet title.
                c.setFont("Helvetica-Bold", 14)
                c.drawString(50, y_position, f"工作表: {sheet_name}")
                y_position -= line_height + 10

                c.setFont("Helvetica", 10)
                for row in ws.iter_rows(values_only=True):
                    # Break before a row that would cross the bottom margin.
                    # The font is set again after every page break.
                    if y_position - row_height < bottom_margin:
                        c.showPage()
                        c.setFont("Helvetica", 10)
                        y_position = page_top

                    for col_idx, cell in enumerate(row[:8]):  # cap columns
                        cell_text = str(cell) if cell is not None else ""
                        x = 50 + col_idx * col_width
                        # Cell border, then cell content.
                        c.rect(x, y_position - row_height, col_width, row_height)
                        c.drawString(
                            x + 5, y_position - row_height + 5, cell_text[:15]
                        )
                    y_position -= row_height

                # Gap before the next worksheet; new page if space is low.
                y_position -= 30
                if y_position < 100:
                    c.showPage()
                    y_position = page_top

            c.save()
            logger.info(f"Excel转换PDF完成: {pdf_path}")
            return True
        except Exception as e:
            logger.error(f"Excel转PDF失败: {str(e)}")
            return False

    @staticmethod
    def ppt_to_pdf(ppt_path: str, pdf_path: str) -> bool:
        """Convert a presentation into a PDF.

        Each slide starts a new landscape letter page with a slide-number
        heading followed by the text of every shape that has text. Lines are
        truncated to 100 characters, and text that does not fit continues on
        a new page.

        Args:
            ppt_path: Path of the PPT file to read.
            pdf_path: Path of the PDF file to write.

        Returns:
            bool: ``True`` if the PDF was written, ``False`` otherwise.
        """
        try:
            logger.info(f"开始转换PPT到PDF: {ppt_path}")
            prs = Presentation(ppt_path)

            from reportlab.pdfgen import canvas
            from reportlab.lib.pagesizes import landscape, letter

            c = canvas.Canvas(pdf_path, pagesize=landscape(letter))
            slide_count = len(prs.slides)

            for slide_num, slide in enumerate(prs.slides, 1):
                # Slide heading.
                c.setFont("Helvetica-Bold", 16)
                c.drawString(50, 550, f"幻灯片 {slide_num}")

                c.setFont("Helvetica", 12)
                y_position = 500

                for shape in slide.shapes:
                    if not (hasattr(shape, "text") and shape.text):
                        continue
                    for line in shape.text.split('\n'):
                        if not line.strip():
                            continue
                        # Continue on a new page rather than drawing below
                        # the bottom margin.
                        if y_position < 50:
                            c.showPage()
                            c.setFont("Helvetica", 12)
                            y_position = 550
                        c.drawString(70, y_position, line[:100])
                        y_position -= 20

                # New page between slides, but not after the last one.
                if slide_num < slide_count:
                    c.showPage()

            c.save()
            logger.info(f"PPT转换PDF完成: {pdf_path}")
            return True
        except Exception as e:
            logger.error(f"PPT转PDF失败: {str(e)}")
            return False
class FileConverter:
    """Entry point that dispatches a conversion-type string to a converter."""

    def __init__(self):
        self.pdf_to_office = PDFToOffice()
        self.office_to_pdf = OfficeToPDF()

    def convert(self, input_path: str, output_path: str, conversion_type: str) -> bool:
        """Run one file conversion.

        Args:
            input_path: Path of the file to convert.
            output_path: Path of the file to create; missing parent
                directories are created.
            conversion_type: One of ``'pdf2word'``, ``'pdf2excel'``,
                ``'pdf2ppt'``, ``'word2pdf'``, ``'excel2pdf'``, ``'ppt2pdf'``.

        Returns:
            bool: ``True`` if the conversion succeeded. ``False`` if the
            input is missing, the type is unsupported, or the converter
            reported a failure.
        """
        if not os.path.exists(input_path):
            logger.error(f"输入文件不存在: {input_path}")
            return False

        converters = {
            'pdf2word': self.pdf_to_office.pdf_to_word,
            'pdf2excel': self.pdf_to_office.pdf_to_excel,
            'pdf2ppt': self.pdf_to_office.pdf_to_ppt,
            'word2pdf': self.office_to_pdf.word_to_pdf,
            'excel2pdf': self.office_to_pdf.excel_to_pdf,
            'ppt2pdf': self.office_to_pdf.ppt_to_pdf,
        }
        # Reject unknown types before creating any directory.
        handler = converters.get(conversion_type)
        if handler is None:
            logger.error(f"不支持的转换类型: {conversion_type}")
            return False

        # exist_ok avoids the race between an exists() check and makedirs()
        # when several conversions target the same new directory.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        return handler(input_path, output_path)
def main():
    """Run the interactive command-line front end.

    Prints the usage banner, asks for the input path, the output path and a
    conversion choice (1-6), runs the conversion and reports the result.
    """
    converter = FileConverter()

    # Single source of truth for the menu:
    # (choice key, conversion type, banner label, prompt label).
    menu = [
        ("1", "pdf2word", "PDF → Word", "PDF转Word"),
        ("2", "pdf2excel", "PDF → Excel", "PDF转Excel"),
        ("3", "pdf2ppt", "PDF → PPT", "PDF转PPT"),
        ("4", "word2pdf", "Word → PDF", "Word转PDF"),
        ("5", "excel2pdf", "Excel → PDF", "Excel转PDF"),
        ("6", "ppt2pdf", "PPT → PDF", "PPT转PDF"),
    ]
    type_map = {key: kind for key, kind, _, _ in menu}

    # Usage banner.
    print("=" * 60)
    print("PDF/Office 互转换工具")
    print("=" * 60)
    print("\n支持的转换类型:")
    for key, kind, banner_label, _ in menu:
        print(f" {key}. {banner_label} ({kind})")
    print("\n使用示例:")
    print(" converter.convert('input.pdf', 'output.docx', 'pdf2word')")
    print(" converter.convert('input.docx', 'output.pdf', 'word2pdf')")
    print("=" * 60)

    # Collect the user's input.
    input_file = input("\n请输入输入文件路径: ").strip()
    output_file = input("请输入输出文件路径: ").strip()
    print("请选择转换类型:")
    for key, _, _, prompt_label in menu:
        print(f"{key}. {prompt_label}")
    choice = input("请输入数字(1-6): ").strip()

    if choice not in type_map:
        print("无效的选择!")
        return

    if converter.convert(input_file, output_file, type_map[choice]):
        print(f"\n✅ 转换成功!输出文件: {output_file}")
    else:
        print("\n❌ 转换失败,请检查日志。")
if __name__ == "__main__":
    main()
3. 增强版转换器(使用更多库)
如果你需要更高质量的转换,可以安装以下增强库:
# 更好的PDF处理
pip install pdf2docx==0.5.6 # PDF转Word
pip install camelot-py==0.11.0 # PDF表格提取
pip install tabula-py==2.8.0 # PDF表格提取
# 更好的Office转PDF
pip install comtypes==1.1.14 # Windows下使用Office COM组件
pip install pywin32 # Windows自动化
增强版代码示例:
"""
增强版转换器 - 使用更多专业库
"""
import os
import sys
from pathlib import Path
# 增强版PDF转Word
try:
from pdf2docx import Converter
except ImportError:
print("请安装pdf2docx: pip install pdf2docx")
# 增强版PDF表格提取
try:
import camelot
except ImportError:
print("请安装camelot-py: pip install camelot-py")
# Windows下使用Office COM组件
if sys.platform == 'win32':
try:
import win32com.client
except ImportError:
print("请安装pywin32: pip install pywin32")
class EnhancedConverter:
    """Higher-fidelity converters built on pdf2docx and the Word COM API.

    Both methods return ``True`` on success and ``False`` on failure; the
    error is printed instead of raised.
    """

    @staticmethod
    def pdf_to_word_enhanced(pdf_path: str, word_path: str) -> bool:
        """Convert a PDF into a Word document using pdf2docx.

        Args:
            pdf_path: Path of the PDF file to read.
            word_path: Path of the Word file to write.

        Returns:
            bool: ``True`` if the conversion completed, ``False`` otherwise.
        """
        try:
            cv = Converter(pdf_path)
            try:
                cv.convert(word_path, start=0, end=None)
            finally:
                # Close the converter even when convert() fails.
                cv.close()
            return True
        except Exception as e:
            print(f"转换失败: {e}")
            return False

    @staticmethod
    def word_to_pdf_enhanced(word_path: str, pdf_path: str) -> bool:
        """Convert a Word document into a PDF through Word's COM interface.

        Only available on Windows; on other platforms a message is printed
        and ``False`` is returned.

        Args:
            word_path: Path of the Word file to read.
            pdf_path: Path of the PDF file to write.

        Returns:
            bool: ``True`` if the PDF was saved, ``False`` otherwise.
        """
        if sys.platform != 'win32':
            print("此功能仅支持Windows系统")
            return False
        try:
            word = win32com.client.Dispatch("Word.Application")
            try:
                word.Visible = False
                doc = word.Documents.Open(os.path.abspath(word_path))
                try:
                    doc.SaveAs(os.path.abspath(pdf_path), FileFormat=17)  # 17 = PDF
                finally:
                    doc.Close()
            finally:
                # Always quit, so a failed conversion does not leave a
                # hidden Word process running.
                word.Quit()
            return True
        except Exception as e:
            print(f"转换失败: {e}")
            return False
# 使用示例
if __name__ == "__main__":
converter = EnhancedConverter()
# PDF转Word
converter.pdf_to_word_enhanced("input.pdf", "output.docx")
# Word转PDF (Windows)
    converter.word_to_pdf_enhanced("input.docx", "output.pdf")
4. 批量转换工具
"""
批量文件转换工具
"""
import os
import glob
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
class BatchConverter:
    """Convert every matching file of a folder with a shared converter."""

    def __init__(self, converter):
        # Any object with a ``convert(input_path, output_path,
        # conversion_type)`` method that returns a truthy value on success.
        self.converter = converter

    def batch_convert(self, input_dir: str, output_dir: str,
                      input_ext: str, output_ext: str, conversion_type: str,
                      max_workers: int = 4):
        """Convert all files with a given extension in a folder.

        Conversions run in a thread pool, so ``converter.convert`` must be
        safe to call from several threads at once.

        Args:
            input_dir: Folder containing the files to convert.
            output_dir: Folder receiving the results; created if missing.
            input_ext: Extension of the input files (e.g. ``'.pdf'``).
            output_ext: Extension of the output files (e.g. ``'.docx'``).
            conversion_type: Conversion type passed to the converter.
            max_workers: Maximum number of concurrent conversions.

        Returns:
            list: One ``(input_file, output_file, success)`` tuple per
            input file, in sorted input order; empty if nothing matched.
        """
        os.makedirs(output_dir, exist_ok=True)

        # Escape the directory so characters such as "[" in its name are
        # not treated as glob wildcards. Directories whose name happens to
        # match the pattern are skipped.
        pattern = os.path.join(glob.escape(input_dir), f"*{input_ext}")
        input_files = sorted(
            path for path in glob.glob(pattern) if os.path.isfile(path)
        )
        if not input_files:
            print(f"在 {input_dir} 中没有找到 {input_ext} 文件")
            return []

        print(f"找到 {len(input_files)} 个文件,开始转换...")

        results = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            jobs = []
            for input_file in input_files:
                # Output name: same stem, new extension.
                output_file = os.path.join(
                    output_dir, f"{Path(input_file).stem}{output_ext}"
                )
                future = executor.submit(
                    self.converter.convert,
                    input_file,
                    output_file,
                    conversion_type,
                )
                jobs.append((future, input_file, output_file))

            # Report in submission order once each job has finished.
            for future, input_file, output_file in jobs:
                try:
                    success = bool(future.result())
                except Exception as e:
                    print(f"[✗] {Path(input_file).name} 转换失败: {e}")
                    success = False
                else:
                    status = "✓" if success else "✗"
                    print(f"[{status}] {Path(input_file).name} -> {Path(output_file).name}")
                results.append((input_file, output_file, success))
        return results
# 使用示例
if __name__ == "__main__":
from main import FileConverter
converter = FileConverter()
batch = BatchConverter(converter)
# 批量转换PDF到Word
batch.batch_convert(
input_dir="./pdf_files",
output_dir="./word_files",
input_ext=".pdf",
output_ext=".docx",
conversion_type="pdf2word"
)
# 批量转换Word到PDF
batch.batch_convert(
input_dir="./word_files",
output_dir="./pdf_output",
input_ext=".docx",
output_ext=".pdf",
conversion_type="word2pdf"
    )
5. 使用说明
安装依赖:
pip install PyPDF2 pdf2image pdfplumber python-docx openpyxl python-pptx pillow reportlab
基本使用:
from converter import FileConverter
converter = FileConverter()
# PDF转Word
converter.convert("input.pdf", "output.docx", "pdf2word")
# Word转PDF
converter.convert("input.docx", "output.pdf", "word2pdf")
注意事项:
- 这些库都是免费开源的
- 转换质量可能不如商业软件
- 对于复杂格式的文档,可能需要使用增强版或商业解决方案
- 处理中文时可能需要安装中文字体