Skip to content
Snippets Groups Projects
Commit 7332a0f3 authored by echicken's avatar echicken :chicken:
Browse files

Ingestion stuff

parent da8e6be9
Branches
No related tags found
No related merge requests found
...@@ -27,8 +27,6 @@ QDRANT_PORT=6333 ...@@ -27,8 +27,6 @@ QDRANT_PORT=6333
Create some paths: Create some paths:
```sh ```sh
mkdir -p data/binary-bob/jsobjs
mkdir -p data/binary-bob/wiki
mkdir -p data/qdrant/snapshots mkdir -p data/qdrant/snapshots
mkdir -p data/qdrant/storage mkdir -p data/qdrant/storage
``` ```
......
...@@ -7,7 +7,8 @@ ...@@ -7,7 +7,8 @@
"test": "echo \"Error: no test specified\" && exit 1", "test": "echo \"Error: no test specified\" && exit 1",
"build": "npx tsc", "build": "npx tsc",
"start": "node build/index.js", "start": "node build/index.js",
"dev": "npx tsc && node build/index.js" "dev": "npx tsc && node build/index.js",
"ingest": "npx tsc && node build/tasks/ingest.js"
}, },
"keywords": [], "keywords": [],
"author": "echicken", "author": "echicken",
......
import express, { Express, Request, Response } from 'express'; import express, { Express, Request, Response } from 'express';
import bodyParser from 'body-parser'; import bodyParser from 'body-parser';
import { init, queryAgent } from './lib/ai.js'; import { init, queryAgent } from './lib/ai.js';
import { ingest } from './lib/jsobjs.js';
const app: Express = express(); const app: Express = express();
app.use(bodyParser.urlencoded({ extended: false })); app.use(bodyParser.urlencoded({ extended: false }));
app.use(bodyParser.json()); app.use(bodyParser.json());
app.get('/embed', async (req: Request, res: Response) => {
res.json({ status: 'ok' });
});
app.get('/query/:query', async (req: Request, res: Response) => { app.get('/query/:query', async (req: Request, res: Response) => {
const answer = await queryAgent({ query: req.params.query }); const answer = await queryAgent({ query: req.params.query });
if (answer === undefined) { if (answer === undefined) {
...@@ -27,7 +22,6 @@ app.get('/', (req: Request, res: Response) => { ...@@ -27,7 +22,6 @@ app.get('/', (req: Request, res: Response) => {
(async () => { (async () => {
await init(); await init();
await ingest();
app.listen(process.env.port ?? 3000, () => { app.listen(process.env.port ?? 3000, () => {
console.log(`binary-bob is listening at http://localhost:${process.env.port ?? 3000}`); console.log(`binary-bob is listening at http://localhost:${process.env.port ?? 3000}`);
}); });
......
...@@ -68,12 +68,12 @@ async function getAgent(): Promise<OpenAIAgent> { ...@@ -68,12 +68,12 @@ async function getAgent(): Promise<OpenAIAgent> {
systemPrompt: ( systemPrompt: (
'You are a polite, friendly, helpful technical support assistant for system operators of Synchronet BBS.\r\n' 'You are a polite, friendly, helpful technical support assistant for system operators of Synchronet BBS.\r\n'
+ 'Your job is to answer questions about installing, configuring, customizing, operating, troubleshooting, and using Synchronet BBS.\r\n' + 'Your job is to answer questions about installing, configuring, customizing, operating, troubleshooting, and using Synchronet BBS.\r\n'
+ 'You should rovide step-by-step instructions when possible and be detailed in your responses.\r\n'
+ 'You will be provided with relevant context information to help you find an answer.\r\n' + 'You will be provided with relevant context information to help you find an answer.\r\n'
+ 'DO NOT include the context information in your response. Your job is to paraphrase and summarize this data.\r\n' + 'DO NOT include the context information in your response. Your job is to paraphrase this data.\r\n'
+ 'DO NOT repeat the question in your response.\r\n' + 'DO NOT repeat the question in your response.\r\n'
+ 'You MUST answer the question using only the provided context information and NOT any prior knowledge.\r\n' + 'You MUST answer the question using only the provided context information and NOT any prior knowledge.\r\n'
+ 'If no answer can be found in the context information, you MUST respond with the phrase "Answer unavailable" and nothing else.\r\n' + 'If no answer can be found in the context information, you MUST respond with the phrase "Answer unavailable" and nothing else.\r\n'
+ 'Provide step-by-step instructions when possible and be detailed in your responses.\r\n'
), ),
}); });
return agent; return agent;
......
import fs from 'fs'; import { promises as fs } from 'fs';
import path from 'path'; import path from 'path';
import puppeteer from 'puppeteer'; import puppeteer from 'puppeteer';
import { ingestText } from './ai.js'; import { ingestText } from './ai.js';
...@@ -23,6 +23,7 @@ interface Property { ...@@ -23,6 +23,7 @@ interface Property {
}; };
interface Scope { interface Scope {
url: string,
methods: Record<string, Method>, methods: Record<string, Method>,
properties: Record<string, Property>, properties: Record<string, Property>,
}; };
...@@ -30,6 +31,9 @@ interface Scope { ...@@ -30,6 +31,9 @@ interface Scope {
type JSObjs = Record<string, Scope>; type JSObjs = Record<string, Scope>;
export async function extractData(): Promise<void> { export async function extractData(): Promise<void> {
console.debug('\tExtracting data from the Synchronet JavaScript Object Model Reference');
console.debug(`\t\tCreating ${OUT_BASE}`);
await fs.mkdir(OUT_BASE, { recursive: true });
const browser = await puppeteer.launch({ const browser = await puppeteer.launch({
executablePath: '/usr/bin/google-chrome', executablePath: '/usr/bin/google-chrome',
...@@ -38,6 +42,7 @@ export async function extractData(): Promise<void> { ...@@ -38,6 +42,7 @@ export async function extractData(): Promise<void> {
], ],
}); });
console.debug(`\t\tDownloading document from ${URL}`);
const page = await browser.newPage(); const page = await browser.newPage();
await page.goto(URL); await page.goto(URL);
await page.setViewport({ width: 1080, height: 1024 }); await page.setViewport({ width: 1080, height: 1024 });
...@@ -47,7 +52,15 @@ export async function extractData(): Promise<void> { ...@@ -47,7 +52,15 @@ export async function extractData(): Promise<void> {
if (elements === undefined || elements === null) return; if (elements === undefined || elements === null) return;
for (const element of elements) { for (const element of elements) {
const scope = element.querySelector('caption > b > tt').innerHTML; const scope = element.querySelector('caption > b > tt').innerHTML;
if (sections[scope] === undefined) sections[scope] = { methods: {}, properties: {} }; if (sections[scope] === undefined) {
const h2 = element.previousElementSibling;
const url = URL + '#' + (h2 !== null && h2.tagName === 'h2' ? h2.querySelector('a').getAttribute('name') : scope);
sections[scope] = {
url,
methods: {},
properties: {},
};
}
const type = element.querySelector('caption > b > a').innerHTML; const type = element.querySelector('caption > b > a').innerHTML;
const rows = element.querySelectorAll('tbody > tr'); const rows = element.querySelectorAll('tbody > tr');
if (type.trim() === 'methods') { if (type.trim() === 'methods') {
...@@ -79,16 +92,31 @@ export async function extractData(): Promise<void> { ...@@ -79,16 +92,31 @@ export async function extractData(): Promise<void> {
} }
return sections; return sections;
}); });
fs.writeFileSync(OUT_FILE, JSON.stringify(sections));
console.debug(`\t\tWriting data to ${OUT_FILE}`);
await fs.writeFile(OUT_FILE, JSON.stringify(sections));
await browser.close(); await browser.close();
} }
export async function ingest(): Promise<void> { export async function ingest(): Promise<void> {
console.debug('\tGenerating text from the Synchronet JavaScript Object Model Reference');
const outDir = path.join(OUT_BASE, 'text');
console.debug(`\t\tCreating ${outDir}`);
await fs.mkdir(outDir, { recursive: true });
await extractData(); await extractData();
const jsobjs: JSObjs = JSON.parse(fs.readFileSync(OUT_FILE, 'utf8'));
const json = await fs.readFile(OUT_FILE, 'utf8');
const jsobjs: JSObjs = JSON.parse(json);
for (const scope in jsobjs) { for (const scope in jsobjs) {
console.debug(`\t\tGenerating text for ${scope}`);
const prefix = `${scope}${scope === 'global' ? ' ' : '.'}`; const prefix = `${scope}${scope === 'global' ? ' ' : '.'}`;
let str = `The following document describes the "${scope}" object of the Synchronet javascript object model. This object is available to scripts running in the javascript runtime environment of Synchronet BBS.\r\n`; let str = `The following document describes the "${scope}" object of the Synchronet javascript object model. This object is available to scripts running in the javascript runtime environment of Synchronet BBS.\r\n`;
console.debug(`\t\t\tGenerating text for ${scope} methods`);
str += `\r\n---\r\n\r\nThe following is a list of methods on the "${scope}" object:\r\n`; str += `\r\n---\r\n\r\nThe following is a list of methods on the "${scope}" object:\r\n`;
for (const method in jsobjs[scope].methods) { for (const method in jsobjs[scope].methods) {
const _prefix = `${prefix}${method}`; const _prefix = `${prefix}${method}`;
...@@ -98,6 +126,8 @@ export async function ingest(): Promise<void> { ...@@ -98,6 +126,8 @@ export async function ingest(): Promise<void> {
str += `The description of the "${_prefix}" method is: "${jsobjs[scope].methods[method].description}"\r\n`; str += `The description of the "${_prefix}" method is: "${jsobjs[scope].methods[method].description}"\r\n`;
str += `The "${_prefix}" method was introduced in Synchronet "${jsobjs[scope].methods[method].version}"\r\n`; str += `The "${_prefix}" method was introduced in Synchronet "${jsobjs[scope].methods[method].version}"\r\n`;
} }
console.debug(`\t\t\tGenerating text for ${scope} properties`);
str += `\r\n---\r\n\r\nThe following is a list of properties of the "${scope}" object:\r\n`; str += `\r\n---\r\n\r\nThe following is a list of properties of the "${scope}" object:\r\n`;
for (const property in jsobjs[scope].properties) { for (const property in jsobjs[scope].properties) {
const _prefix = `${prefix}${property}`; const _prefix = `${prefix}${property}`;
...@@ -106,6 +136,12 @@ export async function ingest(): Promise<void> { ...@@ -106,6 +136,12 @@ export async function ingest(): Promise<void> {
str += `The description of the "${_prefix}" property is: "${jsobjs[scope].properties[property].description}"\r\n`; str += `The description of the "${_prefix}" property is: "${jsobjs[scope].properties[property].description}"\r\n`;
str += `The "${_prefix}" property was introduced in Synchronet "${jsobjs[scope].properties[property].version}"\r\n`; str += `The "${_prefix}" property was introduced in Synchronet "${jsobjs[scope].properties[property].version}"\r\n`;
} }
fs.writeFileSync(path.join(OUT_BASE, `${scope}.txt`), str);
const outFile = path.join(outDir, `${scope}.txt`);
console.debug(`\t\t\tWriting ${outFile}`);
await fs.writeFile(outFile, str);
console.debug(`\t\tIngesting text for ${scope}`);
await ingestText(str, jsobjs[scope].url);
} }
} }
import fs from 'fs'; import { promises as fs } from 'fs';
import path from 'path'; import path from 'path';
import puppeteer from 'puppeteer'; import puppeteer from 'puppeteer';
import { ingestText, resetCollection } from './ai.js'; import { ingestText } from './ai.js';
const BASE_URL = 'https://wiki.synchro.net'; const BASE_URL = 'https://wiki.synchro.net';
const OUT_FILE = path.join(path.resolve(), 'data', 'wiki', 'wiki.json'); const OUT_DIR = path.join(path.resolve(), 'data', 'wiki');
const OUT_FILE = path.join(OUT_DIR, 'wiki.json');
export async function extractData(force: boolean = false): Promise<void> { export async function extractData(): Promise<void> {
console.debug(`\tCreating ${OUT_DIR}`);
if (!force && fs.existsSync(OUT_FILE)) { await fs.mkdir(OUT_DIR, { recursive: true });
const { mtime } = fs.statSync(OUT_FILE);
if ((new Date()).getTime() - mtime.getTime() < (1000 * 60 * 60 * 24)) return;
}
const browser = await puppeteer.launch({ const browser = await puppeteer.launch({
executablePath: '/usr/bin/google-chrome', executablePath: '/usr/bin/google-chrome',
...@@ -20,8 +18,10 @@ export async function extractData(force: boolean = false): Promise<void> { ...@@ -20,8 +18,10 @@ export async function extractData(force: boolean = false): Promise<void> {
], ],
}); });
const indexUrl = `${BASE_URL}/wiki:site_index`;
console.debug(`\tDownloading index from ${indexUrl}`);
const page = await browser.newPage(); const page = await browser.newPage();
await page.goto(`${BASE_URL}/wiki:site_index`); await page.goto(indexUrl);
await page.setViewport({ width: 1080, height: 1024 }); await page.setViewport({ width: 1080, height: 1024 });
const paths = await page.evaluate(() => { const paths = await page.evaluate(() => {
const elements = document.querySelector('div > div.indexmenu_nojs > ul[role="tree"].idx').getElementsByTagName('a'); const elements = document.querySelector('div > div.indexmenu_nojs > ul[role="tree"].idx').getElementsByTagName('a');
...@@ -37,9 +37,10 @@ export async function extractData(force: boolean = false): Promise<void> { ...@@ -37,9 +37,10 @@ export async function extractData(force: boolean = false): Promise<void> {
for (const p of paths) { for (const p of paths) {
if (p.search(/^\/es:/) === 0) continue; if (p.search(/^\/es:/) === 0) continue;
if (p.search(/^\/wiki:/) === 0) continue; if (p.search(/^\/wiki:/) === 0) continue;
console.debug(p); const pageUrl = `${BASE_URL}${p}`;
console.debug(`\t\tDownloading ${pageUrl}`);
const page = await browser.newPage(); const page = await browser.newPage();
await page.goto(`${BASE_URL}${p}`); await page.goto(pageUrl);
await page.setViewport({ width: 1080, height: 1024 }); await page.setViewport({ width: 1080, height: 1024 });
const content = await page.evaluate(() => { const content = await page.evaluate(() => {
const body = document.getElementById('bodyContent')?.innerHTML; const body = document.getElementById('bodyContent')?.innerHTML;
...@@ -52,16 +53,18 @@ export async function extractData(force: boolean = false): Promise<void> { ...@@ -52,16 +53,18 @@ export async function extractData(force: boolean = false): Promise<void> {
pages[p] = content; pages[p] = content;
} }
fs.writeFileSync(OUT_FILE, JSON.stringify(pages)); console.debug(`\tWriting data to ${OUT_FILE}`);
await fs.writeFile(OUT_FILE, JSON.stringify(pages));
await browser.close(); await browser.close();
} }
export async function ingest(): Promise<void> { export async function ingest(): Promise<void> {
await extractData(); await extractData();
const wiki: Record<string, string> = JSON.parse(fs.readFileSync(OUT_FILE, 'utf8')); const json = await fs.readFile(OUT_FILE, 'utf8');
const wiki: Record<string, string> = JSON.parse(json);
console.debug('\tIngesting Synchronet wiki data');
for (const page in wiki) { for (const page in wiki) {
console.debug(`\t\tIngesting data for ${BASE_URL}${page}`);
await ingestText(wiki[page], `${BASE_URL}${page}`); await ingestText(wiki[page], `${BASE_URL}${page}`);
} }
} }
import { resetCollection } from '../lib/ai.js';
import { ingest as ingestJsobjs } from '../lib/jsobjs.js';
import { ingest as ingestWiki } from '../lib/wiki.js';
/**
 * Runs the full ingestion pipeline, one stage at a time.
 *
 * Stages run strictly in order: the vector store is reset first, then the
 * JavaScript Object Model Reference is ingested, then the wiki. Each stage
 * is announced on the debug log before it starts. A rejection from any
 * stage propagates to the caller and the remaining stages do not run.
 */
async function ingest(): Promise<void> {
  // Each entry pairs the progress message with the stage it announces.
  const stages: ReadonlyArray<readonly [string, () => unknown]> = [
    ['Resetting vector store', () => resetCollection()],
    ['Ingesting the Synchronet JavaScript Object Model Reference', () => ingestJsobjs()],
    ['Ingesting data from the Synchronet wiki', () => ingestWiki()],
  ];

  for (const [message, runStage] of stages) {
    console.debug(message);
    await runStage();
  }
}
// Script entry point: run the ingestion pipeline once and report failure
// explicitly. Without the catch, a rejected ingest() becomes an unhandled
// promise rejection, whose effect on the exit status depends on the Node
// version and its --unhandled-rejections mode.
(async () => {
  try {
    await ingest();
  } catch (err: unknown) {
    console.error('Ingestion failed:', err);
    // Exit immediately with a failing status so callers such as `npm run ingest`
    // can detect the error, and so leftover handles from a failed stage
    // cannot keep the process alive.
    process.exit(1);
  }
})();
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment