@@ -5,8 +5,10 @@ import { glob } from "glob";
55import { Config , configSchema } from "./config.js" ;
66import { Page } from "playwright" ;
77import { isWithinTokenLimit } from "gpt-tokenizer" ;
8+ import { PathLike } from "fs" ;
89
910let pageCounter = 0 ;
11+ let crawler : PlaywrightCrawler ;
1012
1113export function getPageHtml ( page : Page , selector = "body" ) {
1214 return page . evaluate ( ( selector ) => {
@@ -52,7 +54,7 @@ export async function crawl(config: Config) {
5254 if ( process . env . NO_CRAWL !== "true" ) {
5355 // PlaywrightCrawler crawls the web using a headless
5456 // browser controlled by the Playwright library.
55- const crawler = new PlaywrightCrawler ( {
57+ crawler = new PlaywrightCrawler ( {
5658 // Use the requestHandler to process each of the crawled pages.
5759 async requestHandler ( { request, page, enqueueLinks, log, pushData } ) {
5860 const title = await page . title ( ) ;
@@ -145,6 +147,7 @@ export async function crawl(config: Config) {
145147}
146148
147149export async function write ( config : Config ) {
150+ let nextFileNameString : PathLike = "" ;
148151 const jsonFiles = await glob ( "storage/datasets/default/*.json" , {
149152 absolute : true ,
150153 } ) ;
@@ -165,8 +168,14 @@ export async function write(config: Config) {
165168 `${ config . outputFileName . replace ( / \. j s o n $ / , "" ) } -${ fileCounter } .json` ;
166169
167170 const writeBatchToFile = async ( ) : Promise < void > => {
168- await writeFile ( nextFileName ( ) , JSON . stringify ( currentResults , null , 2 ) ) ;
169- console . log ( `Wrote ${ currentResults . length } items to ${ nextFileName ( ) } ` ) ;
171+ nextFileNameString = nextFileName ( ) ;
172+ await writeFile (
173+ nextFileNameString ,
174+ JSON . stringify ( currentResults , null , 2 ) ,
175+ ) ;
176+ console . log (
177+ `Wrote ${ currentResults . length } items to ${ nextFileNameString } ` ,
178+ ) ;
170179 currentResults = [ ] ;
171180 currentSize = 0 ;
172181 fileCounter ++ ;
@@ -215,4 +224,31 @@ export async function write(config: Config) {
215224 if ( currentResults . length > 0 ) {
216225 await writeBatchToFile ( ) ;
217226 }
227+
228+ return nextFileNameString ;
218229}
230+
/**
 * Thin object-oriented wrapper that pairs a crawl configuration with this
 * module's `crawl` and `write` functions, so callers can hold one instance
 * instead of threading the config through each call.
 */
class GPTCrawlerCore {
  /** Configuration forwarded unchanged to both `crawl` and `write`. */
  config: Config;

  /**
   * @param config - Configuration stored on the instance and passed to
   *   every later `crawl()` / `write()` call.
   */
  constructor(config: Config) {
    this.config = config;
  }

  /**
   * Runs the module-level `crawl` with the stored configuration.
   *
   * @returns A promise that settles when `crawl` settles.
   */
  async crawl(): Promise<void> {
    await crawl(this.config);
  }

  /**
   * Runs the module-level `write` with the stored configuration.
   *
   * The output path has to be awaited rather than derived from the config,
   * because `write` picks the file name per batch.
   *
   * @returns The path reported by `write`: the name of the last batch file
   *   it wrote (its initial value is the empty string).
   */
  async write(): Promise<PathLike> {
    // `write` already returns a promise; returning it directly propagates
    // both the resolved path and any rejection without a manual wrapper.
    return write(this.config);
  }
}
253+
254+ export default GPTCrawlerCore ;
0 commit comments