Document Summariser

Below is a Python script designed to parse a long text document containing headings within curly braces (e.g., {1}, {1.1}, {2.3.1}), split the document into separate sections, and generate concise academic-style summaries for each of those sections. The script preserves references, technical language, and headings in a Markdown output file. It uses the OpenAI API to produce each summary at approximately one-third the length of its original content.

How It Works

  1. Parses and Structures Text

    • The script looks for headings wrapped in curly braces and splits the text accordingly.
    • Each section is stored with its heading, title, and content.
  2. Chunking Large Sections

    • If a section is longer than roughly 2,000 words, it is split into chunks at paragraph boundaries (blank lines) so that each request stays within the model's token limit.
  3. Summarisation

    • For each chunk, a concise summary is generated by the OpenAI model, aiming to preserve the author’s original tone and vital technical details.
  4. Retries for Empty Summaries

    • If a section that has content ends up with an empty summary, the script retries it once. API errors are not retried; they are recorded inline in the output as bracketed error notes.
  5. Outputs Markdown

    • The final summaries, complete with curly-braced headings and concise text, are saved to a file called summarised_output.md.

Usage Overview

  • Prerequisites:

    • pip install openai
  • Setup:

    • Replace "YOUR_OPENAI_API_KEY" with your real OpenAI API key.
    • Modify example_document to contain your text, ensuring each heading is in the form {x.x.x}.
  • Run the Script:

    • python structured_document_summariser.py
    • A Markdown file (summarised_output.md) will be created with your summarised sections.

Below is the script with concise in-code comments. Note that identifying information and any actual API keys or personal file paths have been replaced with placeholders.

#!/usr/bin/env python3
"""
structured_document_summariser.py

A script that:
1) Accepts a large text with curly-bracket headings (e.g., {1}, {1.1}, {2.3.1}).
2) Splits the text into sections based on these headings.
3) Summarises each section in detailed academic prose (no bullet points).
4) Preserves references, technical language, and headings in the final Markdown output.
5) Saves the result to 'summarised_output.md' in the current directory.

Each summary is approximately one-third the length of the original section, ensuring conciseness while preserving main ideas.
"""

import os
import re
import math
from openai import OpenAI

# ------------------------------------------------------------------------------
# 0. SET OPENAI API CLIENT
# ------------------------------------------------------------------------------
# Prefer the OPENAI_API_KEY environment variable so the real key never has to be
# written into (or committed with) this script. The placeholder is kept as a
# fallback for users who would rather edit the file directly.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY") or "YOUR_OPENAI_API_KEY")

# ------------------------------------------------------------------------------
# 1. PARSE DOCUMENT INTO SECTIONS
# ------------------------------------------------------------------------------
def _finish_section(section: dict, body_lines: list) -> dict:
    """Attach the collected body lines to *section* and flag whether it has any text."""
    section["content"] = "".join(line + "\n" for line in body_lines)
    section["has_original_content"] = bool(section["content"].strip())
    return section


def parse_document_to_sections(text: str):
    """
    Splits the text based on curly-bracket headings (e.g., {1}, {1.2.3.}).

    A heading is a line that starts with a braced, dot-separated number and is
    either followed by whitespace and a title, or stands alone on the line (the
    title is then the empty string). Every following line, up to the next
    heading, becomes that section's content. Lines that appear before the first
    heading are ignored.

    Returns a list of dicts in document order, each with the keys "heading",
    "title", "content", "summary" (initially ""), "retry_count" (initially 0)
    and "has_original_content" (True when the content is not just whitespace).
    """
    # The title part is optional so that a bare "{1.1}" line is still a heading;
    # "{1}text" (no whitespace after the brace) is deliberately NOT a heading.
    heading_re = re.compile(r"^\{(\d+(?:\.\d+)*\.?)\}(?:\s+(.*))?$")

    sections = []
    current = None
    body_lines = []

    for raw_line in text.split("\n"):
        line = raw_line.rstrip()
        match = heading_re.match(line)

        if match is None:
            # Ordinary text: belongs to the open section, if there is one.
            if current is not None:
                body_lines.append(line)
            continue

        # A new heading closes whatever section was being gathered.
        if current is not None:
            sections.append(_finish_section(current, body_lines))

        current = {
            "heading": match.group(1),
            "title": match.group(2) or "",
            "content": "",
            "summary": "",
            "retry_count": 0,
            "has_original_content": False,
        }
        body_lines = []

    # Close the last section if present.
    if current is not None:
        sections.append(_finish_section(current, body_lines))

    return sections

# ------------------------------------------------------------------------------
# 2. OPTIONAL CHUNKING FOR LENGTHY CONTENT
# ------------------------------------------------------------------------------
def chunk_content(content: str, max_completion_tokens: int = 2000):
    """
    Splits content on double newlines to avoid exceeding token limits.

    Paragraphs (separated by a blank line) are packed greedily into chunks of
    at most ``max_completion_tokens`` whitespace-separated words; the word
    count is only a rough stand-in for model tokens. A single paragraph that is
    larger than the limit on its own is cut at word boundaries into pieces of
    at most that many words (its internal whitespace is normalised to single
    spaces).

    Returns a list of non-empty, stripped chunk strings; empty or
    whitespace-only content yields an empty list.
    """
    chunks = []
    pending = []       # paragraphs collected for the chunk being built
    pending_words = 0  # running word count of ``pending``

    for para in content.split("\n\n"):
        words = para.split()

        # Adding this paragraph would overflow the chunk: emit what we have.
        # Counting each paragraph separately keeps the total exact; counting
        # the concatenated text would fuse the words on either side of the join.
        if pending_words + len(words) > max_completion_tokens:
            chunk = "\n\n".join(pending).strip()
            if chunk:
                chunks.append(chunk)
            pending = []
            pending_words = 0

        # A paragraph that is over the limit by itself cannot be packed, so
        # split it into limit-sized runs of words. (Skipped for a non-positive
        # limit, where no slice size makes sense.)
        if max_completion_tokens > 0 and len(words) > max_completion_tokens:
            for start in range(0, len(words), max_completion_tokens):
                chunks.append(" ".join(words[start:start + max_completion_tokens]))
            continue

        pending.append(para)
        pending_words += len(words)

    chunk = "\n\n".join(pending).strip()
    if chunk:
        chunks.append(chunk)

    return chunks

# ------------------------------------------------------------------------------
# 3. DETERMINE TARGET SUMMARY LENGTH
# ------------------------------------------------------------------------------
def analyze_section_length(text: str) -> int:
    """Return the number of whitespace-separated words in *text*."""
    words = text.split()
    return len(words)

def compute_target_summary_length(token_count: int) -> int:
    """Return one third of *token_count*, rounded up to a whole number."""
    thirds, leftover = divmod(token_count, 3)
    if leftover:
        return thirds + 1
    return thirds

# ------------------------------------------------------------------------------
# 4. SUMMARISE EACH SECTION
# ------------------------------------------------------------------------------
def summarize_section(section: dict):
    """
    Summarise one parsed section with the OpenAI chat model.

    The section's "content" is split with ``chunk_content`` (2000-word limit)
    and each chunk is summarised in its own request. The requested length is
    one third of the word count of the chunk being sent, so that the combined
    summary of a multi-chunk section still comes to about one third of the
    section as a whole.

    Returns the chunk summaries joined by single spaces. A chunk whose request
    raises is represented by a bracketed error note instead of a summary; a
    chunk for which the model returns no text contributes nothing. The result
    is therefore "" when the section has no content or the model returned no
    text for any chunk.
    """
    content_chunks = chunk_content(section["content"], max_completion_tokens=2000)

    sub_summaries = []
    for idx, chunk in enumerate(content_chunks, start=1):
        # Size the request from this chunk, not from the whole section.
        target_length = compute_target_summary_length(analyze_section_length(chunk))
        prompt = (
            "You are an academic writer skilled in British English. "
            "Summarise the text below in a concise manner, preserving essential technical language, references, critical reasoning, and the author's narrative voice. "
            "Retain justifications and context crucial to the argument. "
            f"Aim for about {target_length} words (one-third of the original). "
            "Text:\n\n"
            f"\"\"\"{chunk}\"\"\""
        )
        try:
            response = client.chat.completions.create(
                model="gpt-4o-2024-08-06",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0,
                max_completion_tokens=2048
            )
            # The message content can be absent (None); treat that as an empty
            # summary rather than letting .strip() raise.
            chunk_summary = (response.choices[0].message.content or "").strip()
        except Exception as e:
            # Best effort: keep going with the remaining chunks and leave a
            # visible note in the output where this chunk's summary would be.
            sub_summaries.append(f"[Error summarising chunk {idx} of section {section['heading']}: {str(e)}]")
            continue

        if chunk_summary:
            sub_summaries.append(chunk_summary)

    return " ".join(sub_summaries)

"""
# ------------------------------------------------------------------------------
# 5. GENERATE MARKDOWN WITH HEADING LEVELS
# ------------------------------------------------------------------------------
def generate_markdown(sections: list):
    md_output = []
    for sec in sections:
        # Determine heading depth by counting dots
        num_dots = sec["heading"].count(".")
        num_hashes = 4 + num_dots
        heading_line = f"{'#' * num_hashes} {{{sec['heading']}}} {sec['title']}"
        summary_content = sec.get("summary", "").strip()
        md_output.append(heading_line + "\n\n" + summary_content + "\n")
    return "\n".join(md_output)
"""

# ------------------------------------------------------------------------------
# 5. GENERATE MARKDOWN
# ------------------------------------------------------------------------------
def generate_markdown(sections: list):
    """
    Render the sections as one text block.

    Each section becomes its curly-braced heading and title on one line, a
    blank line, the stripped summary (empty if the section has none), and a
    trailing newline. Sections are separated by one further newline.
    """
    rendered = [
        "{{{}}} {}".format(sec["heading"], sec["title"])
        + "\n\n"
        + sec.get("summary", "").strip()
        + "\n"
        for sec in sections
    ]
    return "\n".join(rendered)

# ------------------------------------------------------------------------------
# 6. RETRY EMPTY SUMMARIES
# ------------------------------------------------------------------------------
def retry_empty_sections(sections: list):
    """
    Give every section that has content but no summary one more attempt.

    Sections are updated in place: the new summary is stored, "retry_count" is
    incremented, and if the summary is still blank it is replaced by a fixed
    error note. Sections without content, with a non-blank summary, or that
    were already retried are left untouched.
    """
    for section in sections:
        if not section["has_original_content"]:
            continue
        if section.get("summary", "").strip():
            continue
        if section["retry_count"] >= 1:
            continue

        print(f"Retrying Section {section['heading']} ({section['title']})...")
        section["summary"] = summarize_section(section)
        section["retry_count"] += 1

        if section["summary"].strip():
            continue
        section["summary"] = "[Error: Unable to summarise this section after retry.]"

# ------------------------------------------------------------------------------
# 7. MAIN FUNCTION
# ------------------------------------------------------------------------------
def _write_markdown(sections: list, output_file: str) -> None:
    """Render *sections* with generate_markdown and write the result to *output_file* (UTF-8)."""
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(generate_markdown(sections))


def main():
    """
    Summarise the embedded example document and save the result as Markdown.

    Parses the document into sections, summarises each one, writes
    'summarised_output.md' in the current directory, then retries any section
    whose summary came back empty and rewrites the file if a retry took place.
    Does nothing beyond printing a notice when the document has no headings.
    """
    # Replace with your actual document (ensure headings in curly braces).
    example_document = """
    
    """

    sections = parse_document_to_sections(example_document)
    if not sections:
        # Without at least one heading there is nothing to summarise; say so
        # instead of writing an empty file and reporting success.
        print("No curly-bracket headings found in the document; nothing to summarise.")
        return

    # Summarise each section
    for sec in sections:
        print(f"Summarising Section {sec['heading']} ({sec['title']})...")
        sec["summary"] = summarize_section(sec)

    # Save the first pass straight away so the results are on disk before any
    # retry requests are made.
    output_file = "summarised_output.md"
    _write_markdown(sections, output_file)

    print(f"\nSummarisation complete. Output saved to '{os.path.abspath(output_file)}'.")

    # Retry if needed
    retry_empty_sections(sections)

    # Only rewrite (and announce an update) when a retry actually happened;
    # retry_count is non-zero solely for sections that were retried.
    if any(sec["retry_count"] for sec in sections):
        _write_markdown(sections, output_file)
        print(f"Updated Markdown with retried sections saved to '{os.path.abspath(output_file)}'.")

# Run the summariser only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

Leave a comment