Python extracts links (hyperlink) and text from word/docx

created at 07-28-2021 views: 69

Python extracts links (hyperlink) and text from word/docx

import zipfile
import re
import json
import base64
from docx import Document
from os.path import basename
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from bs4 import BeautifulSoup

def get_linked_text(soup):

    links = []

    # This kind of link has a corresponding URL in the _rel file.
    for tag in soup.find_all("hyperlink"):
        # try/except because some hyperlinks have no id.
        try:
            links.append({"id": tag["r:id"], "text": tag.text})
        except:
            pass

    # This kind does not.
    for tag in soup.find_all("instrText"):
        # They're identified by the word HYPERLINK
        if "HYPERLINK" in tag.text:
            # Get the URL. Probably.
            url = tag.text.split('"')[1]

            # The actual linked text is stored nearby tags.
            # Loop through the siblings starting here.
            temp = tag.parent.next_sibling
            text = ""

            while temp is not None:
                # Text comes in <t> tags.
                maybe_text = temp.find("t")
                if maybe_text is not None:
                    # Ones that have text in them.
                    if maybe_text.text.strip() != "":
                        text += maybe_text.text.strip()

                # Links end with <w:fldChar w:fldCharType="end" />.
                maybe_end = temp.find("fldChar[w:fldCharType]")
                if maybe_end is not None:
                    if maybe_end["w:fldCharType"] == "end":
                        break

                temp = temp.next_sibling

            links.append({"id": None, "href": url, "text": text})

    return links

if __name__ == '__main__':
    file_name="xx.docx"
    archive = zipfile.ZipFile(file_name, "r")
    file_data = archive.read("word/document.xml")
    doc_soup = BeautifulSoup(file_data, "xml")
    linked_text = get_linked_text(doc_soup)
    print(linked_text)
created at:07-28-2021
edited at: 07-28-2021: