PDF/Office互转工具 Python教程

13 min

PDF/Office互转工具 Python教程

1. 环境准备

首先安装必要的库:

# PDF操作相关库
pip install PyPDF2==3.0.0
pip install pdf2image==1.16.0
pip install pdfplumber==0.10.0

# Office操作相关库
pip install python-docx==1.1.0
pip install openpyxl==3.1.2
pip install python-pptx==0.6.21

# 图像处理
pip install pillow==10.0.0

# 用于PDF转图片和OCR
pip install pytesseract==0.3.10

# 用于生成PDF(Word/Excel/PPT 转 PDF 功能依赖)
pip install reportlab

2. 完整的转换工具代码

"""
PDF/Office 互转换工具
支持格式:PDF, Word, Excel, PPT
使用免费开源库实现
"""

import os
import sys
from pathlib import Path
import logging
from typing import Union, Optional

# PDF处理
import PyPDF2
from pdf2image import convert_from_path
import pdfplumber

# Office处理
from docx import Document
from docx.shared import Inches
import openpyxl
from openpyxl import load_workbook
from pptx import Presentation
from PIL import Image

# Configure logging at import time: INFO level with timestamp, level and
# message; `logger` is the module-level logger used by the converters below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class PDFToOffice:
    """Convert PDF files into Word, Excel and PowerPoint documents.

    Every converter is a static method that takes an input path and an
    output path, logs progress through the module-level ``logger`` and
    returns ``True`` on success or ``False`` after logging the error.
    Content is extracted with pdfplumber (plain text and detected tables
    only); page layout, fonts and images are not carried over.
    """

    @staticmethod
    def _clean_table(table) -> list:
        """Return *table* as a rectangular grid of strings.

        Args:
            table: Rows of cells. Cells may be ``None`` and rows may have
                different lengths (or be ``None`` themselves).

        Returns:
            A list of equally long rows in which every cell is a string
            (``""`` for missing cells), or ``[]`` when the table holds no
            cells at all.
        """
        rows = [list(row or []) for row in (table or [])]
        width = max((len(row) for row in rows), default=0)
        if width == 0:
            return []
        return [
            ["" if cell is None else str(cell) for cell in row]
            + [""] * (width - len(row))
            for row in rows
        ]

    @staticmethod
    def pdf_to_word(pdf_path: str, word_path: str) -> bool:
        """Convert a PDF into a Word document, one section per PDF page.

        Each page contributes a level-2 heading, its extracted text as a
        single paragraph and one Word table per detected table. Pages are
        separated by page breaks.

        Args:
            pdf_path: Path of the PDF file to read.
            word_path: Path of the Word file to write.

        Returns:
            bool: ``True`` if the document was saved, ``False`` if any step
            raised (the error is logged).
        """
        try:
            logger.info(f"开始转换PDF到Word: {pdf_path}")

            doc = Document()

            with pdfplumber.open(pdf_path) as pdf:
                page_count = len(pdf.pages)
                for page_num, page in enumerate(pdf.pages, 1):
                    doc.add_heading(f'第 {page_num} 页', level=2)

                    text = page.extract_text()
                    if text:
                        doc.add_paragraph(text)

                    for table in page.extract_tables():
                        rows = PDFToOffice._clean_table(table)
                        if not rows:
                            continue
                        # The grid is sized by the widest row, so ragged
                        # tables cannot index past the last Word column.
                        word_table = doc.add_table(rows=len(rows), cols=len(rows[0]))
                        for i, row in enumerate(rows):
                            for j, cell in enumerate(row):
                                if cell:
                                    word_table.cell(i, j).text = cell
                        doc.add_paragraph()  # blank line after each table

                    # No page break after the final page.
                    if page_num < page_count:
                        doc.add_page_break()

            doc.save(word_path)
            logger.info(f"PDF转换Word完成: {word_path}")
            return True

        except Exception as e:
            logger.error(f"PDF转Word失败: {str(e)}")
            return False

    @staticmethod
    def pdf_to_excel(pdf_path: str, excel_path: str) -> bool:
        """Convert a PDF into an Excel workbook, one worksheet per PDF page.

        The tables detected on a page are written one below the other,
        separated by an empty row; the page's extracted text follows in
        column A under a label cell.

        Args:
            pdf_path: Path of the PDF file to read.
            excel_path: Path of the Excel file to write.

        Returns:
            bool: ``True`` if the workbook was saved, ``False`` if any step
            raised (the error is logged).
        """
        try:
            logger.info(f"开始转换PDF到Excel: {pdf_path}")

            wb = openpyxl.Workbook()

            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    ws = wb.create_sheet(title=f"第{page_num}页")

                    # Running row cursor shared by all tables of the page so
                    # that a later table never overwrites an earlier one.
                    next_row = 1
                    for table in page.extract_tables():
                        rows = PDFToOffice._clean_table(table)
                        for row in rows:
                            for col_idx, cell in enumerate(row, 1):
                                if cell:
                                    ws.cell(row=next_row, column=col_idx, value=cell)
                            next_row += 1
                        if rows:
                            next_row += 1  # empty spacer row between tables

                    text = page.extract_text()
                    if text:
                        # Placed below everything written so far; on a page
                        # without tables this is rows 2 and 3.
                        label_row = next_row + 1
                        ws.cell(row=label_row, column=1, value="提取的文本:")
                        ws.cell(row=label_row + 1, column=1, value=text)

            # Drop the default sheet only when page sheets exist, so the
            # workbook is never left without any worksheet.
            if "Sheet" in wb.sheetnames and len(wb.sheetnames) > 1:
                wb.remove(wb["Sheet"])

            wb.save(excel_path)
            logger.info(f"PDF转换Excel完成: {excel_path}")
            return True

        except Exception as e:
            logger.error(f"PDF转Excel失败: {str(e)}")
            return False

    @staticmethod
    def pdf_to_ppt(pdf_path: str, ppt_path: str) -> bool:
        """Convert a PDF into a presentation, one slide per PDF page.

        Each slide gets a page-number title and a text box holding at most
        the first 500 characters of the page's extracted text.

        Args:
            pdf_path: Path of the PDF file to read.
            ppt_path: Path of the PowerPoint file to write.

        Returns:
            bool: ``True`` if the presentation was saved, ``False`` if any
            step raised (the error is logged).
        """
        try:
            logger.info(f"开始转换PDF到PPT: {pdf_path}")

            prs = Presentation()

            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    # Layout 5 is used because the code below fills its title
                    # placeholder (it is "Title Only" in the default
                    # python-pptx template, not the blank layout).
                    slide_layout = prs.slide_layouts[5]
                    slide = prs.slides.add_slide(slide_layout)

                    title = slide.shapes.title
                    if title:
                        title.text = f"第 {page_num} 页"

                    text = page.extract_text()
                    if text:
                        textbox = slide.shapes.add_textbox(
                            left=Inches(1), top=Inches(1.5),
                            width=Inches(8), height=Inches(5)
                        )
                        text_frame = textbox.text_frame
                        # Wrap long lines inside the box instead of letting
                        # them run past the slide edge.
                        text_frame.word_wrap = True
                        text_frame.text = text[:500]  # cap the amount of text per slide

            prs.save(ppt_path)
            logger.info(f"PDF转换PPT完成: {ppt_path}")
            return True

        except Exception as e:
            logger.error(f"PDF转PPT失败: {str(e)}")
            return False


class OfficeToPDF:
    """Render the text content of Word, Excel and PowerPoint files as PDF.

    Every converter is a static method that takes an input path and an
    output path, logs progress through the module-level ``logger`` and
    returns ``True`` on success or ``False`` after logging the error.
    Pages are drawn with ReportLab, imported lazily inside each method;
    only text and cell values are reproduced, not the original formatting.
    """

    # CID font (Simplified Chinese) used for text that the standard Latin
    # fonts cannot encode, e.g. the Chinese headings written by this class.
    _CJK_FONT = "STSong-Light"

    @staticmethod
    def _fits_winansi(text: str) -> bool:
        """Return ``True`` when *text* can be encoded as cp1252 (WinAnsi)."""
        try:
            text.encode("cp1252")
        except UnicodeEncodeError:
            return False
        return True

    @staticmethod
    def _split_line(line: str, width: int) -> list:
        """Split *line* into consecutive pieces of at most *width* characters.

        An empty string yields an empty list.
        """
        return [line[start:start + width] for start in range(0, len(line), width)]

    @staticmethod
    def _pick_font(text: str, latin_font: str) -> str:
        """Choose the font for drawing *text*.

        Args:
            text: The string about to be drawn.
            latin_font: Font to use when *text* fits the WinAnsi encoding.

        Returns:
            *latin_font* for WinAnsi-encodable text, otherwise the name of
            the CID font after making sure it is registered. Falls back to
            *latin_font* if the CID font cannot be registered.
        """
        if OfficeToPDF._fits_winansi(text):
            return latin_font
        try:
            from reportlab.pdfbase import pdfmetrics
            from reportlab.pdfbase.cidfonts import UnicodeCIDFont

            if OfficeToPDF._CJK_FONT not in pdfmetrics.getRegisteredFontNames():
                pdfmetrics.registerFont(UnicodeCIDFont(OfficeToPDF._CJK_FONT))
        except Exception:
            # Deliberate best effort: without the CID font the text is drawn
            # with the Latin font rather than failing the whole conversion.
            return latin_font
        return OfficeToPDF._CJK_FONT

    @staticmethod
    def _draw_text(c, x, y, text: str, font_name: str, font_size) -> None:
        """Draw *text* at (*x*, *y*) on canvas *c*, selecting a font that can show it."""
        c.setFont(OfficeToPDF._pick_font(text, font_name), font_size)
        c.drawString(x, y, text)

    @staticmethod
    def word_to_pdf(word_path: str, pdf_path: str) -> bool:
        """Convert a Word document into a text-only PDF.

        Every non-blank line of every paragraph is drawn on letter-size
        pages. Lines longer than 100 characters continue on the following
        line instead of being cut off.

        Args:
            word_path: Path of the Word file to read.
            pdf_path: Path of the PDF file to write.

        Returns:
            bool: ``True`` if the PDF was saved, ``False`` if any step
            raised (the error is logged).
        """
        try:
            logger.info(f"开始转换Word到PDF: {word_path}")

            from reportlab.pdfgen import canvas
            from reportlab.lib.pagesizes import letter

            doc = Document(word_path)

            # Draw straight into the destination: there is no intermediate
            # file, so concurrent conversions cannot clash with each other
            # and nothing is left behind when a conversion fails.
            c = canvas.Canvas(pdf_path, pagesize=letter)

            top, bottom = 750, 50   # first and lowest usable baseline
            line_height = 15
            y_position = top

            for paragraph in doc.paragraphs:
                for line in paragraph.text.split('\n'):
                    if not line.strip():
                        continue
                    for chunk in OfficeToPDF._split_line(line, 100):
                        # Start a new page before drawing below the margin.
                        if y_position < bottom:
                            c.showPage()
                            y_position = top
                        OfficeToPDF._draw_text(c, 50, y_position, chunk, "Helvetica", 12)
                        y_position -= line_height

            c.save()

            logger.info(f"Word转换PDF完成: {pdf_path}")
            return True

        except Exception as e:
            logger.error(f"Word转PDF失败: {str(e)}")
            return False

    @staticmethod
    def excel_to_pdf(excel_path: str, pdf_path: str) -> bool:
        """Convert an Excel workbook into a PDF of simple grids.

        Each worksheet is drawn as a heading followed by one bordered row
        per worksheet row on landscape letter pages. Only the first 8
        columns are drawn and each cell shows at most 15 characters.

        Args:
            excel_path: Path of the Excel file to read.
            pdf_path: Path of the PDF file to write.

        Returns:
            bool: ``True`` if the PDF was saved, ``False`` if any step
            raised (the error is logged).
        """
        try:
            logger.info(f"开始转换Excel到PDF: {excel_path}")

            wb = load_workbook(excel_path)

            from reportlab.pdfgen import canvas
            from reportlab.lib.pagesizes import landscape, letter

            c = canvas.Canvas(pdf_path, pagesize=landscape(letter))

            # The landscape page is 612 pt tall, so every page (including
            # the ones after a page break) starts at y = 550.
            top, bottom = 550, 50
            heading_gap = 30        # distance from heading baseline to grid
            sheet_gap = 30          # space left after each worksheet
            col_width, row_height = 80, 15
            max_cols, max_chars = 8, 15

            y_position = top

            for sheet_name in wb.sheetnames:
                ws = wb[sheet_name]

                # Not enough room left for a heading plus some rows.
                if y_position < 100:
                    c.showPage()
                    y_position = top

                OfficeToPDF._draw_text(
                    c, 50, y_position, f"工作表: {sheet_name}", "Helvetica-Bold", 14
                )
                y_position -= heading_gap

                for row in ws.iter_rows(values_only=True):
                    cells = ["" if value is None else str(value) for value in row[:max_cols]]

                    # Break the page before a row that would cross the
                    # bottom margin; y_position is the row's top edge.
                    if y_position - row_height < bottom:
                        c.showPage()
                        y_position = top

                    for col_idx, cell in enumerate(cells):
                        x = 50 + col_idx * col_width
                        c.rect(x, y_position - row_height, col_width, row_height)
                        OfficeToPDF._draw_text(
                            c, x + 5, y_position - row_height + 5,
                            cell[:max_chars], "Helvetica", 10
                        )

                    y_position -= row_height

                y_position -= sheet_gap

            c.save()

            logger.info(f"Excel转换PDF完成: {pdf_path}")
            return True

        except Exception as e:
            logger.error(f"Excel转PDF失败: {str(e)}")
            return False

    @staticmethod
    def ppt_to_pdf(ppt_path: str, pdf_path: str) -> bool:
        """Convert a presentation into a text-only PDF.

        Each slide starts a new landscape letter page with a slide-number
        heading followed by the non-blank text lines of its shapes. Lines
        longer than 100 characters continue on the following line, and text
        that reaches the bottom margin continues on an extra page.

        Args:
            ppt_path: Path of the PowerPoint file to read.
            pdf_path: Path of the PDF file to write.

        Returns:
            bool: ``True`` if the PDF was saved, ``False`` if any step
            raised (the error is logged).
        """
        try:
            logger.info(f"开始转换PPT到PDF: {ppt_path}")

            prs = Presentation(ppt_path)

            from reportlab.pdfgen import canvas
            from reportlab.lib.pagesizes import landscape, letter

            c = canvas.Canvas(pdf_path, pagesize=landscape(letter))

            top, bottom = 550, 50
            line_height = 20
            slide_count = len(prs.slides)

            for slide_num, slide in enumerate(prs.slides, 1):
                OfficeToPDF._draw_text(
                    c, 50, top, f"幻灯片 {slide_num}", "Helvetica-Bold", 16
                )
                y_position = 500

                for shape in slide.shapes:
                    # Only shapes that expose non-empty text are rendered.
                    if not (hasattr(shape, "text") and shape.text):
                        continue
                    for line in shape.text.split('\n'):
                        if not line.strip():
                            continue
                        for chunk in OfficeToPDF._split_line(line, 100):
                            # Continue on a fresh page instead of drawing
                            # below the bottom margin.
                            if y_position < bottom:
                                c.showPage()
                                y_position = top
                            OfficeToPDF._draw_text(c, 70, y_position, chunk, "Helvetica", 12)
                            y_position -= line_height

                # One page per slide; the last page is closed by save().
                if slide_num < slide_count:
                    c.showPage()

            c.save()

            logger.info(f"PPT转换PDF完成: {pdf_path}")
            return True

        except Exception as e:
            logger.error(f"PPT转PDF失败: {str(e)}")
            return False


class FileConverter:
    """Single entry point that routes a conversion request to a converter."""

    def __init__(self):
        self.pdf_to_office = PDFToOffice()
        self.office_to_pdf = OfficeToPDF()

    def convert(self, input_path: str, output_path: str, conversion_type: str) -> bool:
        """Run one conversion.

        Args:
            input_path: Path of the file to convert; it must exist.
            output_path: Path of the file to write. Missing parent
                directories are created.
            conversion_type: One of 'pdf2word', 'pdf2excel', 'pdf2ppt',
                'word2pdf', 'excel2pdf', 'ppt2pdf'.

        Returns:
            bool: ``False`` when the input is missing or the conversion type
            is unknown (both are logged); otherwise the result of the
            selected converter.
        """
        if not os.path.exists(input_path):
            logger.error(f"输入文件不存在: {input_path}")
            return False

        converters = {
            'pdf2word': self.pdf_to_office.pdf_to_word,
            'pdf2excel': self.pdf_to_office.pdf_to_excel,
            'pdf2ppt': self.pdf_to_office.pdf_to_ppt,
            'word2pdf': self.office_to_pdf.word_to_pdf,
            'excel2pdf': self.office_to_pdf.excel_to_pdf,
            'ppt2pdf': self.office_to_pdf.ppt_to_pdf,
        }

        # Validate the request before touching the file system, so an
        # unsupported type does not leave an empty output directory behind.
        handler = converters.get(conversion_type)
        if handler is None:
            logger.error(f"不支持的转换类型: {conversion_type}")
            return False

        # exist_ok avoids a failure when several threads create the same
        # directory at once; an empty dirname means the current directory.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        return handler(input_path, output_path)


def main():
    """Interactive command-line entry point.

    Prints the usage banner, asks for the input path, the output path and a
    menu choice (1-6), then runs the selected conversion through
    ``FileConverter`` and reports the outcome on standard output.
    """
    converter = FileConverter()

    # Usage banner, printed one line at a time.
    rule = "=" * 60
    banner = (
        rule,
        "PDF/Office 互转换工具",
        rule,
        "\n支持的转换类型:",
        "  1. PDF → Word   (pdf2word)",
        "  2. PDF → Excel  (pdf2excel)",
        "  3. PDF → PPT    (pdf2ppt)",
        "  4. Word → PDF   (word2pdf)",
        "  5. Excel → PDF  (excel2pdf)",
        "  6. PPT → PDF    (ppt2pdf)",
        "\n使用示例:",
        "  converter.convert('input.pdf', 'output.docx', 'pdf2word')",
        "  converter.convert('input.docx', 'output.pdf', 'word2pdf')",
        rule,
    )
    for banner_line in banner:
        print(banner_line)

    input_file = input("\n请输入输入文件路径: ").strip()
    output_file = input("请输入输出文件路径: ").strip()

    # (menu number, menu label, conversion type understood by FileConverter)
    menu = (
        ("1", "PDF转Word", "pdf2word"),
        ("2", "PDF转Excel", "pdf2excel"),
        ("3", "PDF转PPT", "pdf2ppt"),
        ("4", "Word转PDF", "word2pdf"),
        ("5", "Excel转PDF", "excel2pdf"),
        ("6", "PPT转PDF", "ppt2pdf"),
    )
    print("请选择转换类型:")
    for number, label, _ in menu:
        print(f"{number}. {label}")

    choice = input("请输入数字(1-6): ").strip()
    conversion_type = {number: kind for number, _, kind in menu}.get(choice)

    if conversion_type is None:
        print("无效的选择!")
        return

    if converter.convert(input_file, output_file, conversion_type):
        print(f"\n✅ 转换成功!输出文件: {output_file}")
    else:
        print("\n❌ 转换失败,请检查日志。")

# Start the interactive prompt only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

3. 增强版转换器(使用更多库)

如果你需要更高质量的转换,可以安装以下增强库:

# 更好的PDF处理
pip install pdf2docx==0.5.6  # PDF转Word
pip install camelot-py==0.11.0  # PDF表格提取
pip install tabula-py==2.8.0  # PDF表格提取

# 更好的Office转PDF
pip install comtypes==1.1.14  # Windows下使用Office COM组件
pip install pywin32  # Windows自动化(提供 win32com 模块)

增强版代码示例:

"""
增强版转换器 - 使用更多专业库
"""

import os
import sys
from pathlib import Path

# 增强版PDF转Word
try:
    from pdf2docx import Converter
except ImportError:
    print("请安装pdf2docx: pip install pdf2docx")

# 增强版PDF表格提取
try:
    import camelot
except ImportError:
    print("请安装camelot-py: pip install camelot-py")

# Windows下使用Office COM组件
if sys.platform == 'win32':
    try:
        import win32com.client
    except ImportError:
        print("请安装pywin32: pip install pywin32")


class EnhancedConverter:
    """Higher-fidelity converters built on pdf2docx and the Word COM API.

    Both methods print the error and return ``False`` instead of raising.
    """

    @staticmethod
    def pdf_to_word_enhanced(pdf_path: str, word_path: str) -> bool:
        """Convert a PDF into a Word document with pdf2docx.

        Args:
            pdf_path: Path of the PDF file to read.
            word_path: Path of the Word file to write.

        Returns:
            bool: ``True`` on success, ``False`` if anything raised.
        """
        try:
            cv = Converter(pdf_path)
            try:
                cv.convert(word_path, start=0, end=None)
            finally:
                # Close the converter even when the conversion fails, so the
                # PDF is not kept open.
                cv.close()
            return True
        except Exception as e:
            print(f"转换失败: {e}")
            return False

    @staticmethod
    def word_to_pdf_enhanced(word_path: str, pdf_path: str) -> bool:
        """Convert a Word document into a PDF through the Word COM component.

        Only available on Windows; on other platforms a message is printed
        and ``False`` is returned.

        Args:
            word_path: Path of the Word file to read.
            pdf_path: Path of the PDF file to write.

        Returns:
            bool: ``True`` on success, ``False`` on an unsupported platform
            or if anything raised.
        """
        if sys.platform != 'win32':
            print("此功能仅支持Windows系统")
            return False

        try:
            word = win32com.client.Dispatch("Word.Application")
            try:
                word.Visible = False
                # Absolute paths are passed for both the source and the target.
                doc = word.Documents.Open(os.path.abspath(word_path))
                try:
                    doc.SaveAs(os.path.abspath(pdf_path), FileFormat=17)  # 17 = PDF format
                finally:
                    doc.Close()
            finally:
                # Always quit Word; otherwise a failed conversion leaves a
                # hidden Word process running.
                word.Quit()
            return True
        except Exception as e:
            print(f"转换失败: {e}")
            return False


# Usage example
if __name__ == "__main__":
    # Both converters are static methods, so they are called on the class.

    # PDF -> Word
    EnhancedConverter.pdf_to_word_enhanced("input.pdf", "output.docx")

    # Word -> PDF (Windows only)
    EnhancedConverter.word_to_pdf_enhanced("input.docx", "output.pdf")

4. 批量转换工具

"""
批量文件转换工具
"""

import os
import glob
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed


class BatchConverter:
    """Convert every matching file of a directory with a thread pool."""

    def __init__(self, converter):
        # Any object with a convert(input_path, output_path, conversion_type)
        # method returning a truthy value on success.
        self.converter = converter

    def batch_convert(self, input_dir: str, output_dir: str,
                      input_ext: str, output_ext: str, conversion_type: str,
                      max_workers: int = 4) -> list:
        """Convert all files in *input_dir* that end with *input_ext*.

        Each output file keeps the input's base name and gets *output_ext*.
        One status line per file is printed in sorted input order.

        Args:
            input_dir: Directory that holds the input files (not searched
                recursively).
            output_dir: Directory for the results; created if missing.
            input_ext: Extension of the files to convert, e.g. '.pdf'.
            output_ext: Extension of the generated files, e.g. '.docx'.
            conversion_type: Conversion type passed on to the converter.
            max_workers: Number of worker threads.

        Returns:
            A list of ``(input_file, output_file, success)`` tuples, one per
            input file; empty when no file matched.
        """
        os.makedirs(output_dir, exist_ok=True)

        # Escape the directory part so characters such as '[' in its name
        # are matched literally; sort for a stable processing order.
        pattern = os.path.join(glob.escape(input_dir), f"*{input_ext}")
        input_files = sorted(glob.glob(pattern))

        if not input_files:
            print(f"在 {input_dir} 中没有找到 {input_ext} 文件")
            return []

        print(f"找到 {len(input_files)} 个文件,开始转换...")

        results = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = []
            for input_file in input_files:
                output_file = os.path.join(
                    output_dir, f"{Path(input_file).stem}{output_ext}"
                )
                future = executor.submit(
                    self.converter.convert,
                    input_file,
                    output_file,
                    conversion_type,
                )
                futures.append((future, input_file, output_file))

            # Collect in submission order so the report matches the listing.
            for future, input_file, output_file in futures:
                try:
                    success = bool(future.result())
                except Exception as e:
                    success = False
                    print(f"[✗] {Path(input_file).name} 转换失败: {e}")
                else:
                    status = "✓" if success else "✗"
                    print(f"[{status}] {Path(input_file).name} -> {Path(output_file).name}")
                results.append((input_file, output_file, success))

        return results


# Usage example
if __name__ == "__main__":
    # NOTE(review): assumes the main converter script is saved as main.py;
    # adjust the module name if the file is named differently.
    from main import FileConverter

    converter = FileConverter()
    batch = BatchConverter(converter)

    # (input_dir, output_dir, input_ext, output_ext, conversion_type)
    jobs = (
        # Batch-convert PDF to Word
        ("./pdf_files", "./word_files", ".pdf", ".docx", "pdf2word"),
        # Batch-convert Word to PDF
        ("./word_files", "./pdf_output", ".docx", ".pdf", "word2pdf"),
    )
    for source_dir, target_dir, source_ext, target_ext, kind in jobs:
        batch.batch_convert(
            input_dir=source_dir,
            output_dir=target_dir,
            input_ext=source_ext,
            output_ext=target_ext,
            conversion_type=kind,
        )

5. 使用说明

  1. 安装依赖

    pip install PyPDF2 pdf2image pdfplumber python-docx openpyxl python-pptx pillow reportlab
  2. 基本使用

    # 假设第2节的完整代码已保存为 converter.py;如使用其他文件名,请相应修改模块名
    from converter import FileConverter
    
    converter = FileConverter()
    
    # PDF转Word
    converter.convert("input.pdf", "output.docx", "pdf2word")
    
    # Word转PDF
    converter.convert("input.docx", "output.pdf", "word2pdf")
  3. 注意事项

    • 这些库都是免费开源的
    • 转换质量可能不如商业软件
    • 对于复杂格式的文档,可能需要使用增强版或商业解决方案
    • 处理中文时可能需要安装中文字体