Summarize PDF documents in chunks using Azure OpenAI and Azure Machine Learning

Balamurugan Balakreshnan
3 min read · Mar 4, 2023

Summarization with Azure OpenAI over a pandas data frame

Overview

  • Summarize the text held in a pandas data frame.
  • Load the PDF data into the pandas data frame.
  • Clean the data.
  • Load all pages of each PDF into one row.
  • Run the workflow on Azure Machine Learning.
  • Store the PDFs in an Azure Blob Storage container (a sketch for uploading one is shown after this list).
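The walkthrough assumes the PDFs already sit in a blob container. If they do not, a minimal sketch for uploading one with the azure-storage-blob SDK could look like the following; the container name and file name are placeholders, not values from this article.

from azure.storage.blob import BlobServiceClient

connection_string = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'  # storage account connection string
client = BlobServiceClient.from_connection_string(connection_string)
container_client = client.get_container_client("containername")

# Upload a local PDF into the container the notebook will read from
with open("document.pdf", "rb") as data:
    container_client.upload_blob(name="document.pdf", data=data, overwrite=True)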

Code

  • Install pdfreader with pip.
  • Import the PDF viewer from pdfreader.
pip install pdfreader
from pdfreader import SimplePDFViewer
  • Import storage libraries.
from typing import Container
from azure.storage.blob import BlobClient, BlobServiceClient, ContainerClient
from azure.storage.blob import ResourceTypes, AccountSasPermissions
from azure.storage.blob import generate_account_sas
from datetime import *
today = str(datetime.now().date())
print(today)
  • Set up the connection string.
  • Set up the account key.
# Source Client
connection_string = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' # Connection string for the source storage account
account_key = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxx' # Account key for the source storage account
# source_container_name = 'newblob' # Name of container which has blob to be copied
# Create client
client = BlobServiceClient.from_connection_string(connection_string)
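The generate_account_sas import above can be used to mint a SAS token if you prefer to authenticate the per-blob client with a credential instead of the connection string. A minimal sketch, with the storage account name as a placeholder:

from datetime import datetime, timedelta

# Illustrative only: mint an account-level SAS token valid for one hour
sas_token = generate_account_sas(
    account_name="storageaccountname",   # placeholder account name
    account_key=account_key,
    resource_types=ResourceTypes(service=True, container=True, object=True),
    permission=AccountSasPermissions(read=True, list=True),
    expiry=datetime.utcnow() + timedelta(hours=1),
)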
  • List the containers in the storage account.
all_containers = client.list_containers(include_metadata=True)
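As a quick sanity check, the container names can be printed (purely illustrative):

for container in all_containers:
    print(container.name)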
  • Create an empty data frame.
import pandas as pd
df = pd.DataFrame()
  • Loop over the files in the container.
  • Parse each PDF with pdfreader.
  • Load the extracted text into the data frame, one row per PDF.
container_client = client.get_container_client("containername")
from azure.storage.blob import BlobClient
blobs_list = container_client.list_blobs()
for blob in blobs_list:
    filename = blob.name
    # Client for this blob (a SAS token could be passed as credential instead of the connection string)
    blob_client = BlobClient.from_connection_string(
        conn_str=connection_string,
        container_name="containername",
        blob_name=blob.name,
    )
    # Download the PDF into memory
    data = blob_client.download_blob().readall()
    # Parse the PDF and collect the text of every page
    viewer = SimplePDFViewer(data)
    all_pages = [p for p in viewer.doc.pages()]
    number_of_pages = len(all_pages)
    page_strings = ""
    for page_number in range(1, number_of_pages + 1):
        viewer.navigate(int(page_number))
        viewer.render()
        page_strings += " ".join(viewer.canvas.strings).replace(' ', '\n\n').strip()
    # One row per PDF (DataFrame.append was removed in pandas 2.0; use pd.concat there)
    if len(page_strings) > 0:
        df = df.append({'text': page_strings}, ignore_index=True)
  • Check the row count of the data frame.
df.count()
  • Set up the Azure OpenAI service and import the remaining libraries.
import os
import re
import sys
import requests
import numpy as np
import pandas as pd
import openai
from num2words import num2words
from openai.embeddings_utils import get_embedding, cosine_similarity
from transformers import GPT2TokenizerFast

openai.api_type = "azure"
openai.api_base = "https://servicename.openai.azure.com/"
openai.api_version = "2022-12-01"
openai.api_key = "xxxxxxxxxxxxxxxxxxx"
  • Now create a column with the token count per row and keep only rows that fit within the model's limit.
# Work on a copy of the data frame built above
df1 = df.copy()
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
df1['n_tokens'] = df1["text"].apply(lambda x: len(tokenizer.encode(x)))
df1 = df1[df1.n_tokens < 2000]
len(df1)
  • Now split the data frame into chunks of 20 rows each.
dfcontent = df1.iloc[:20].copy()
dfcontent1 = df1.iloc[20:40].copy()
dfcontent2 = df1.iloc[40:].copy()
  • Now create a function to summarize the data
def getsummary(mystring):
    response = openai.Completion.create(
        engine="davinci003",  # name of the Azure OpenAI deployment
        prompt='Summarize ' + mystring,
        temperature=0.9,
        max_tokens=1000,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=1
    )
    return response.choices[0].text
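Before applying the function to the whole data frame, it can be tested on a single row (purely illustrative):

print(getsummary(df1["text"].iloc[0]))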
  • Now apply the function to each chunk of the data frame.
  • Because of throttling, the work is split into 20 records per data frame (a retry sketch follows this step).
dfcontent['summary'] = dfcontent["text"].apply(lambda x : getsummary(x))
dfcontent1['summary'] = dfcontent1["text"].apply(lambda x : getsummary(x))
dfcontent2['summary'] = dfcontent2["text"].apply(lambda x : getsummary(x))
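If the endpoint still returns rate-limit errors, the call can be wrapped in a simple retry with exponential backoff. A minimal sketch against the openai 0.x SDK used above; the helper name and retry counts are illustrative.

import time

def getsummary_with_retry(mystring, max_retries=5):
    # Retry rate-limited calls, waiting 1s, 2s, 4s, ... between attempts
    for attempt in range(max_retries):
        try:
            return getsummary(mystring)
        except openai.error.RateLimitError:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)

dfcontent['summary'] = dfcontent["text"].apply(getsummary_with_retry)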
  • Now union the data frames
dffinal = pd.concat([dfcontent, dfcontent1])
dffinal1 = pd.concat([dffinal, dfcontent2])
  • Remove special characters.
### Define function
def remove_special_characters(df_column, bad_characters_list):
    clean_df_column = df_column
    for bad_char in bad_characters_list:
        clean_df_column = clean_df_column.str.replace(bad_char, ' ')
        print("row changes in column " + str(df_column.name) + " after removing character " + str(bad_char) + ": ", sum(df_column != clean_df_column))
    clean_df_column = clean_df_column.str.title()
    return clean_df_column
  • Remove non-ASCII characters.
def remove_non_utf8(name):
    return re.sub(r'[^\x00-\x7f]', r' ', name)

dffinal1['summary'] = dffinal1['summary'].apply(remove_non_utf8)
  • Remove bad characters
### Run function
bad_chars_lst = ["*","!","?", "(", ")", "-", "_", ",", "\n", "\\r\\n", "\r"]
dffinal1['summary'] = remove_special_characters(dffinal1['summary'],bad_chars_lst)
dffinal1['text'] = remove_special_characters(dffinal1['text'],bad_chars_lst)
display(dffinal1[["summary"]].head(20))
  • Strip leading/trailing whitespace and remaining newlines.
dffinal1['summary'].replace({ r'\A\s+|\s+\Z': '', '\n' : ' '}, regex=True, inplace=True)
  • Remove consecutive spaces
### Define function
def remove_consecutive_spaces(df_column):
    clean_df_column = df_column.replace(r'\s\s+', ' ', regex=True)
    print("row changes in column " + str(df_column.name) + ": ", sum(df_column != clean_df_column))
    return clean_df_column
### Run function
dffinal1['text'] = remove_consecutive_spaces(dffinal1['text'])
dffinal1['summary'] = remove_consecutive_spaces(dffinal1['summary'])
  • Finally, save the data frame as a CSV file.
dffinal1.to_csv('name.csv', header=True, index=False)
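Optionally, the resulting CSV can be uploaded back to the same storage account so other jobs can pick it up. A minimal sketch, reusing the container client from earlier (the blob name is a placeholder):

with open('name.csv', 'rb') as data:
    container_client.upload_blob(name="summaries/name.csv", data=data, overwrite=True)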
