PowerPointからテキスト抽出

使用ライブラリ: python-pptx

# プロパティ情報を取得
import pptx

# Load the presentation and print each of its core (document-level)
# properties on its own line as "<label>: <value>".
prs = pptx.Presentation('book-proposal1.pptx')
properties = prs.core_properties

# (label, attribute name) pairs, listed in the order they are printed.
labelled_fields = (
    ('作成者', 'author'),
    ('カテゴリ', 'category'),
    ('コメント', 'comments'),
    ('キーワード', 'keywords'),
    ('最終編集者', 'last_modified_by'),
    ('件名', 'subject'),
    ('タイトル', 'title'),
    ('文書の作成日時', 'created'),
    ('文書の最終印刷日時', 'last_printed'),
    ('文書の編集日時', 'modified'),
)

for label, attr_name in labelled_fields:
    print(f'{label}: {getattr(properties, attr_name)}')

# ファイルのテキスト情報を取得
import pptx

# Walk every slide of the presentation and print the text of each paragraph
# found in shapes that carry a text frame; shapes without one are skipped.
prs = pptx.Presentation('book-proposal1.pptx')
separator = '=' * 72  # rule printed before each slide's output

for index, slide in enumerate(prs.slides):
    print(separator)
    print(f'スライド: {index}')  # index is 0-based (straight from enumerate)
    for shape in slide.shapes:
        if shape.has_text_frame:
            for para in shape.text_frame.paragraphs:
                print(para.text)

月	火	水	木	金	土	日
						1
2	3	4	5	6	7	8
9	10	11	12	13	14	15
16	17	18	19	20	21	22
23	24	25	26	27	28	29
30	31