Spaces:
Running
Running
import fetch from 'node-fetch'; | |
import { JSDOM } from 'jsdom'; | |
import pdfParse from 'pdf-parse'; | |
import puppeteer from 'puppeteer'; | |
/**
 * Extracts the text of a PDF document by delegating to `pdf-parse`.
 *
 * @param buffer - The PDF file contents.
 * @returns The `text` field of the `pdf-parse` result.
 */
export const extractTextFromPDF = async (buffer: Buffer): Promise<string> => {
  const { text } = await pdfParse(buffer);
  return text;
};
/**
 * Fetches `targetUrl` and returns its text content with surrounding
 * whitespace trimmed.
 *
 * The extraction strategy is chosen from the HTTP response:
 * - status >= 400: the plain fetch was rejected, so the URL is loaded again in
 *   a puppeteer-driven browser and the rendered `document.body.innerText` is
 *   returned;
 * - `application/pdf`: the body is parsed with {@link extractTextFromPDF};
 * - `text/html`: `<script>` and `<style>` elements are removed and the
 *   remaining body text is returned;
 * - any other content type: the response body is returned as text.
 *
 * @param targetUrl - URL of the resource to read.
 * @returns The extracted text, trimmed.
 * @throws Errors raised by `fetch`, puppeteer, jsdom or pdf-parse are not
 *   caught here and propagate to the caller.
 */
export const handleContentText = async (targetUrl: string): Promise<string> => {
  const response = await fetch(targetUrl);
  // Media types are case-insensitive, so normalise before matching.
  const contentType = (response.headers.get('content-type') ?? '').toLowerCase();

  if (response.status >= 400) {
    // If status is 400 or greater, try using puppeteer.
    const browser = await puppeteer.launch();
    try {
      const page = await browser.newPage();
      // 'networkidle0' waits for the network to be idle before considering
      // the navigation finished.
      await page.goto(targetUrl, { waitUntil: 'networkidle0' });
      const rendered = await page.evaluate(() => document.body.innerText);
      return rendered.trim();
    } finally {
      // Always release the browser process, even when navigation or
      // evaluation throws.
      await browser.close();
    }
  }

  if (contentType.includes('application/pdf')) {
    // pdf-parse is typed to take a Node Buffer; wrap the ArrayBuffer instead
    // of casting it away.
    const pdfBytes = Buffer.from(await response.arrayBuffer());
    const pdfText = await extractTextFromPDF(pdfBytes);
    return pdfText.trim();
  }

  if (contentType.includes('text/html')) {
    const dom = new JSDOM(await response.text());
    const doc = dom.window.document;
    // Script and style bodies are not readable content; drop them first.
    doc.querySelectorAll('script, style').forEach((element) => element.remove());
    return (doc.body.textContent ?? '').trim();
  }

  const plainText = await response.text();
  return plainText.trim();
};