Commit 2e49b555 authored by echicken

Initial commit

.gitignore
.env
node_modules
build
data
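The data directory is ignored because both containers persist runtime state under it (see the volume mounts in docker-compose.yml below), build is the tsc output directory, and .env keeps local configuration out of the repository.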

Dockerfile

FROM --platform=linux/amd64 node:20
# We don't need the standalone Chromium
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true
# Install Google Chrome Stable and fonts
# Note: this installs the necessary libs to make the browser work with Puppeteer.
RUN apt-get update && apt-get install curl gnupg -y \
&& curl --location --silent https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
&& sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \
&& apt-get update \
&& apt-get install google-chrome-stable -y --no-install-recommends \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY . /app/
RUN npm install
RUN npm run build
CMD ["npm", "start"]
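Skipping the bundled Chromium download is safe here because wiki.ts (below) launches Puppeteer with executablePath pointed at the /usr/bin/google-chrome that this image installs.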

docker-compose.yml

services:
qdrant:
# image: qdrant/qdrant
build:
context: .
dockerfile_inline: |
FROM qdrant/qdrant:latest
RUN apt-get update -yq && apt-get install -yqq curl
ports:
- 6333:6333
volumes:
- ./data/qdrant/storage:/qdrant/storage
- ./data/qdrant/snapshots:/qdrant/snapshots
- ./conf/qdrant/config.yaml:/qdrant/config/config.yaml
healthcheck:
test: curl -s http://localhost:6333/healthz | grep -q 'healthz check passed' || exit 1
interval: 1m
timeout: 10s
retries: 3
start_period: 10s
start_interval: 5s
binary-bob:
build: .
depends_on:
qdrant:
condition: service_healthy
volumes:
- ./data/binary-bob:/app/data
ports:
- 3000:3000
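With both services defined, docker compose up --build should bring up Qdrant first and, once its curl-based health check passes, the binary-bob app on port 3000. The inline Dockerfile for the qdrant service exists only to add curl, which the health check needs and the stock image does not appear to ship.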

conf/qdrant/config.yaml

log_level: INFO
storage:
# Where to store all the data
storage_path: ./storage
# Where to store snapshots
snapshots_path: ./snapshots
snapshots_config:
# "local" or "s3" - where to store snapshots
snapshots_storage: local
# s3_config:
# bucket: ""
# region: ""
# access_key: ""
# secret_key: ""
# Where to store temporary files
  # If null, temporary snapshots are stored in: storage/snapshots_temp/
temp_path: null
# If true - point's payload will not be stored in memory.
# It will be read from the disk every time it is requested.
# This setting saves RAM by (slightly) increasing the response time.
  # Note: payload values that are involved in filtering and are indexed remain in RAM.
on_disk_payload: true
# Maximum number of concurrent updates to shard replicas
# If `null` - maximum concurrency is used.
update_concurrency: null
# Write-ahead-log related configuration
wal:
# Size of a single WAL segment
wal_capacity_mb: 32
# Number of WAL segments to create ahead of actual data requirement
wal_segments_ahead: 0
# Normal node - receives all updates and answers all queries
node_type: "Normal"
# Listener node - receives all updates, but does not answer search/read queries
# Useful for setting up a dedicated backup node
# node_type: "Listener"
performance:
# Number of parallel threads used for search operations. If 0 - auto selection.
max_search_threads: 0
    # Max number of threads (jobs) for running optimizations across all collections; each thread runs one job.
    # If 0 - no limit, choose dynamically to saturate CPU.
    # Note: each optimization job will also use `max_indexing_threads` threads by itself for index building.
max_optimization_threads: 0
# CPU budget, how many CPUs (threads) to allocate for an optimization job.
# If 0 - auto selection, keep 1 or more CPUs unallocated depending on CPU size
# If negative - subtract this number of CPUs from the available CPUs.
# If positive - use this exact number of CPUs.
optimizer_cpu_budget: 0
# Prevent DDoS of too many concurrent updates in distributed mode.
# One external update usually triggers multiple internal updates, which breaks internal
# timings. For example, the health check timing and consensus timing.
# If null - auto selection.
update_rate_limit: null
# Limit for number of incoming automatic shard transfers per collection on this node, does not affect user-requested transfers.
# The same value should be used on all nodes in a cluster.
# Default is to allow 1 transfer.
# If null - allow unlimited transfers.
#incoming_shard_transfers_limit: 1
# Limit for number of outgoing automatic shard transfers per collection on this node, does not affect user-requested transfers.
# The same value should be used on all nodes in a cluster.
# Default is to allow 1 transfer.
# If null - allow unlimited transfers.
#outgoing_shard_transfers_limit: 1
optimizers:
# The minimal fraction of deleted vectors in a segment, required to perform segment optimization
deleted_threshold: 0.2
# The minimal number of vectors in a segment, required to perform segment optimization
vacuum_min_vector_number: 1000
    # Target number of segments the optimizer will try to keep.
    # The real number of segments may vary depending on multiple parameters:
# - Amount of stored points
# - Current write RPS
#
    # It is recommended to select the default number of segments as a factor of the number of search threads,
    # so that each segment is handled evenly by one of the threads.
    # If `default_segment_number = 0`, it will be selected automatically based on the number of available CPUs.
default_segment_number: 0
    # Do not create segments larger than this size (in KiloBytes).
# Large segments might require disproportionately long indexation times,
# therefore it makes sense to limit the size of segments.
#
    # If indexing speed is a higher priority for you, make this parameter lower.
    # If search speed is more important, make this parameter higher.
# Note: 1Kb = 1 vector of size 256
# If not set, will be automatically selected considering the number of available CPUs.
max_segment_size_kb: null
# Maximum size (in KiloBytes) of vectors to store in-memory per segment.
    # Segments larger than this threshold will be stored as a read-only memmapped file.
# To enable memmap storage, lower the threshold
# Note: 1Kb = 1 vector of size 256
# To explicitly disable mmap optimization, set to `0`.
# If not set, will be disabled by default.
memmap_threshold_kb: null
# Maximum size (in KiloBytes) of vectors allowed for plain index.
# Default value based on https://github.com/google-research/google-research/blob/master/scann/docs/algorithms.md
# Note: 1Kb = 1 vector of size 256
# To explicitly disable vector indexing, set to `0`.
# If not set, the default value will be used.
indexing_threshold_kb: 20000
# Interval between forced flushes.
flush_interval_sec: 5
# Max number of threads (jobs) for running optimizations per shard.
# Note: each optimization job will also use `max_indexing_threads` threads by itself for index building.
    # If null - no limit, choose dynamically to saturate CPU.
# If 0 - no optimization threads, optimizations will be disabled.
max_optimization_threads: null
  # This section has the same options as 'optimizers' above. All values specified here will overwrite the collections'
  # optimizer configs, regardless of the config above and the options specified at collection creation.
#optimizers_overwrite:
# deleted_threshold: 0.2
# vacuum_min_vector_number: 1000
# default_segment_number: 0
# max_segment_size_kb: null
# memmap_threshold_kb: null
# indexing_threshold_kb: 20000
# flush_interval_sec: 5
# max_optimization_threads: null
# Default parameters of HNSW Index. Could be overridden for each collection or named vector individually
hnsw_index:
    # Number of edges per node in the index graph. The larger the value, the more accurate the search, but the more space required.
m: 16
    # Number of neighbours to consider during index building. The larger the value, the more accurate the search, but the more time it takes to build the index.
ef_construct: 100
# Minimal size (in KiloBytes) of vectors for additional payload-based indexing.
# If payload chunk is smaller than `full_scan_threshold_kb` additional indexing won't be used -
# in this case full-scan search should be preferred by query planner and additional indexing is not required.
# Note: 1Kb = 1 vector of size 256
full_scan_threshold_kb: 10000
# Number of parallel threads used for background index building.
# If 0 - automatically select.
    # Best kept between 8 and 16 to reduce the likelihood of building broken/inefficient HNSW graphs.
    # On small CPUs, fewer threads are used.
max_indexing_threads: 0
# Store HNSW index on disk. If set to false, index will be stored in RAM. Default: false
on_disk: false
# Custom M param for hnsw graph built for payload index. If not set, default M will be used.
payload_m: null
# Default shard transfer method to use if none is defined.
# If null - don't have a shard transfer preference, choose automatically.
# If stream_records, snapshot or wal_delta - prefer this specific method.
# More info: https://qdrant.tech/documentation/guides/distributed_deployment/#shard-transfer-method
shard_transfer_method: null
# Default parameters for collections
collection:
    # Number of replicas of each shard that the network tries to maintain
replication_factor: 1
# How many replicas should apply the operation for us to consider it successful
write_consistency_factor: 1
# Default parameters for vectors.
vectors:
# Whether vectors should be stored in memory or on disk.
on_disk: true
# shard_number_per_node: 1
# Default quantization configuration.
# More info: https://qdrant.tech/documentation/guides/quantization
quantization: null
service:
# Maximum size of POST data in a single request in megabytes
max_request_size_mb: 32
# Number of parallel workers used for serving the api. If 0 - equal to the number of available cores.
# If missing - Same as storage.max_search_threads
max_workers: 0
# Host to bind the service on
host: 0.0.0.0
# HTTP(S) port to bind the service on
http_port: 6333
# gRPC port to bind the service on.
# If `null` - gRPC is disabled. Default: null
  # gRPC is commented out here, so it is disabled; uncomment to enable it:
# grpc_port: 6334
# Enable CORS headers in REST API.
# If enabled, browsers would be allowed to query REST endpoints regardless of query origin.
# More info: https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS
# Default: true
enable_cors: true
# Enable HTTPS for the REST and gRPC API
enable_tls: false
# Check user HTTPS client certificate against CA file specified in tls config
verify_https_client_certificate: false
# Set an api-key.
# If set, all requests must include a header with the api-key.
# example header: `api-key: <API-KEY>`
#
# If you enable this you should also enable TLS.
# (Either above or via an external service like nginx.)
# Sending an api-key over an unencrypted channel is insecure.
#
# Uncomment to enable.
# api_key: your_secret_api_key_here
# Set an api-key for read-only operations.
# If set, all requests must include a header with the api-key.
# example header: `api-key: <API-KEY>`
#
# If you enable this you should also enable TLS.
# (Either above or via an external service like nginx.)
# Sending an api-key over an unencrypted channel is insecure.
#
# Uncomment to enable.
# read_only_api_key: your_secret_read_only_api_key_here
# Uncomment to enable JWT Role Based Access Control (RBAC).
# If enabled, you can generate JWT tokens with fine-grained rules for access control.
# Use generated token instead of API key.
#
# jwt_rbac: true
cluster:
# Use `enabled: true` to run Qdrant in distributed deployment mode
enabled: false
# Configuration of the inter-cluster communication
p2p:
# Port for internal communication between peers
port: 6335
# Use TLS for communication between peers
enable_tls: false
# Configuration related to distributed consensus algorithm
consensus:
# How frequently peers should ping each other.
    # Setting this parameter to a lower value will allow consensus
    # to detect disconnected nodes earlier, but a too-frequent
    # tick period may create significant network and CPU overhead.
# We encourage you NOT to change this parameter unless you know what you are doing.
tick_period_ms: 100
# Set to true to prevent service from sending usage statistics to the developers.
# Read more: https://qdrant.tech/documentation/guides/telemetry
telemetry_disabled: false
# TLS configuration.
# Required if either service.enable_tls or cluster.p2p.enable_tls is true.
# tls:
# # Server certificate chain file
# cert: ./tls/cert.pem
# # Server private key file
# key: ./tls/key.pem
# # Certificate authority certificate file.
# # This certificate will be used to validate the certificates
# # presented by other nodes during inter-cluster communication.
# #
# # If verify_https_client_certificate is true, it will verify
# # HTTPS client certificate
# #
# # Required if cluster.p2p.enable_tls is true.
# # ca_cert: ./tls/cacert.pem
# # TTL in seconds to reload certificate from disk, useful for certificate rotations.
# # Only works for HTTPS endpoints. Does not support gRPC (and intra-cluster communication).
# # If `null` - TTL is disabled.
# cert_ttl: 3600

package.json

{
"name": "binary-bob",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1",
"build": "npx tsc",
"start": "node build/index.js",
"dev": "npx tsc && node build/index.js"
},
"keywords": [],
"author": "echicken",
"license": "MIT",
"devDependencies": {
"@swc/cli": "^0.4.0",
"@swc/core": "^1.6.13",
"@types/express": "^4.17.21",
"@types/node": "^20.14.10",
"typescript": "^5.5.3"
},
"dependencies": {
"body-parser": "^1.20.2",
"dotenv": "^16.4.5",
"express": "^4.19.2",
"llamaindex": "^0.5.3",
"puppeteer": "^22.13.0"
},
"type": "module"
}

src/index.ts

import express, { Express, Request, Response } from 'express';
import bodyParser from 'body-parser';
import { init, queryAgent } from './lib/ai.js';
import { embedData } from './lib/wiki.js';
import config from './lib/config.js';
const app: Express = express();
app.use(bodyParser.urlencoded({ extended: false }));
app.use(bodyParser.json());
app.get('/embed', async (req: Request, res: Response) => {
await embedData();
res.json({ status: 'ok' });
});
app.get('/query/:query', async (req: Request, res: Response) => {
const answer = await queryAgent({ query: req.params.query });
if (answer === undefined) {
res.sendStatus(404);
} else {
res.json(answer);
}
});
app.get('/', (req: Request, res: Response) => {
res.sendStatus(404);
});
(async () => {
await init();
  app.listen(config.httpPort, () => {
    console.log(`binary-bob is listening at http://localhost:${config.httpPort}`);
});
})();
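Once the server is listening, GET /embed (e.g. curl http://localhost:3000/embed) kicks off a full scrape-and-ingest of the wiki, and GET /query/:query (e.g. curl http://localhost:3000/query/How%20do%20I%20install%20Synchronet, an illustrative question) returns a JSON answer with sources, or a 404 when no usable answer was produced.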

src/lib/ai.ts

import type { EvaluationResult } from "llamaindex/evaluation/types";
import type { BaseToolWithCall, EngineResponse, Metadata, NodeWithScore, RelatedNodeInfo, TextNode } from 'llamaindex';
import { CallbackManager, CompactAndRefine, Document, IngestionPipeline, MetadataMode, OpenAI, OpenAIAgent, OpenAIEmbedding, QdrantVectorStore, QueryEngine, QueryEngineTool, RelevancyEvaluator, ResponseSynthesizer, RetrieverQueryEngine, Settings, SimpleNodeParser, TextQaPrompt, VectorStoreIndex } from 'llamaindex';
import config from './config.js';
// configure LLM
Settings.llm = new OpenAI({ model: config.llm }) as any;
// configure embedding model
Settings.embedModel = new OpenAIEmbedding({
model: config.embeddingModel,
dimensions: config.embeddingDimension,
});
const vectorStore = new QdrantVectorStore({
collectionName: config.qdrantCollection,
url: config.qdrantUrl,
embedModel: Settings.embedModel,
});
interface Source {
title: string,
url: string,
}
interface Answer {
answer: string,
sources: Source[],
relevance: {
passing: boolean,
score: number,
},
};
const newTextQaPrompt: TextQaPrompt = ({ context, query }): string => {
return `Context:\r\n${context}\r\n\r\n---\r\n\r\nQuestion:\r\n${query}\r\n\r\n---\r\n\r\nResponse:\r\n`;
}
export async function init(): Promise<void> {
await vectorStore.initializeCollection(config.embeddingDimension);
}
async function getQueryEngine(): Promise<QueryEngine & RetrieverQueryEngine> {
const responseSynthesizer = new ResponseSynthesizer({ responseBuilder: new CompactAndRefine(undefined, newTextQaPrompt)});
const index = await VectorStoreIndex.fromVectorStore(vectorStore);
const retriever = index.asRetriever({ similarityTopK: 3 });
const queryEngine = index.asQueryEngine({ responseSynthesizer, retriever });
return queryEngine;
}
async function getQueryEngineTools(): Promise<BaseToolWithCall[]> {
const queryEngine = await getQueryEngine();
const queryEngineTool = new QueryEngineTool({
queryEngine,
metadata: {
name: 'synchronet_query_engine',
description: 'Use this engine to answer questions about installing, configuring, customizing, operating, troubleshooting, and using Synchronet BBS.',
},
});
return [queryEngineTool];
}
async function getAgent(): Promise<OpenAIAgent> {
const tools = await getQueryEngineTools();
const agent = new OpenAIAgent({
llm: Settings.llm,
tools,
verbose: true,
systemPrompt: (
'You are a polite, friendly, helpful technical support assistant for system operators of Synchronet BBS.\r\n'
+ 'Your job is to answer questions about installing, configuring, customizing, operating, troubleshooting, and using Synchronet BBS.\r\n'
+ 'You will be provided with relevant context information to help you find an answer.\r\n'
+ 'DO NOT include the context information in your response. Your job is to paraphrase and summarize this data.\r\n'
+ 'DO NOT repeat the question in your response.\r\n'
+ 'You MUST answer the question using only the provided context information and NOT any prior knowledge.\r\n'
+ 'If no answer can be found in the context information, you MUST respond with the phrase "Answer unavailable" and nothing else.\r\n'
+ 'Provide step-by-step instructions when possible and be detailed in your responses.\r\n'
),
});
return agent;
}
async function evaluateResponse(query: string, response: EngineResponse, nodes: NodeWithScore<Metadata>[]): Promise<EvaluationResult | undefined> {
if (typeof response.message.content !== 'string') return;
const evaluator = new RelevancyEvaluator();
const contexts: string[] = [];
for (const node of nodes) {
if (typeof node.node.getContent === 'function') {
contexts.push(node.node.getContent(MetadataMode.ALL));
} else {
const tn = node.node as TextNode;
if (typeof tn.text === 'string') contexts.push(tn.text);
}
}
const relevance: EvaluationResult = await evaluator.evaluate({
query,
response: response.message.content,
contexts
});
return relevance;
}
async function getAnswer(query: string, response: EngineResponse, nodes: NodeWithScore<Metadata>[]): Promise<Answer | undefined> {
if (typeof response.message.content !== 'string') return;
const relevance = await evaluateResponse(query, response, nodes);
if (relevance === undefined) return;
const answer: Answer = {
answer: response.message.content,
sources: [],
relevance: {
passing: relevance.passing,
score: relevance.score,
},
}
if (!Array.isArray(nodes) || nodes.length < 1) return answer;
for (const node of nodes) {
    if (node.score === undefined || node.score < 0.5) continue;
if (node.node.relationships.SOURCE === undefined) continue;
const source = node.node.relationships.SOURCE as RelatedNodeInfo;
if (source.nodeId === undefined) continue;
if (answer.sources.some(e => e.url === source.nodeId)) continue;
answer.sources.push({ url: source.nodeId, title: source.metadata.title });
}
return answer;
}
export async function query({ query }: { query: string }): Promise<Answer | undefined> {
const queryEngine = await getQueryEngine();
const response = await queryEngine.query({ query });
if (typeof response.message.content !== 'string') return;
if (!Array.isArray(response.sourceNodes) || response.sourceNodes.length < 1) return;
const answer = await getAnswer(query, response, response.sourceNodes);
return answer;
}
export async function queryAgent({ query }: { query: string }): Promise<Answer | undefined> {
console.debug(`Querying agent with: ${query}`);
const agent = await getAgent();
const callbackManager = new CallbackManager(); // https://github.com/run-llama/LlamaIndexTS/issues/1015
const sourceNodes = new Promise<NodeWithScore<Metadata>[]>((res) => { // Let's just get out of callback-land as quickly as possible eh?
callbackManager.on('retrieve-end', (data) => {
res(data.detail?.nodes ?? []);
});
});
const response = await Settings.withCallbackManager(callbackManager, () => {
return agent.chat({ message: query });
});
  // If the response came from a function tool, 'retrieve-end' will never have fired, so force it here;
  // otherwise this dispatch is a no-op, since the promise above has already resolved.
  callbackManager.dispatchEvent('retrieve-end', { query, nodes: [] });
console.debug(`Agent responded with: ${response}`, typeof response, JSON.stringify(response));
if (typeof response.message.content !== 'string') return;
const nodes = await sourceNodes;
const answer = await getAnswer(query, response, nodes);
return answer;
}
export async function resetCollection(): Promise<void> {
const client = vectorStore.client();
await client.deleteCollection(config.qdrantCollection);
await vectorStore.initializeCollection(config.embeddingDimension);
}
export async function ingestText(text: string, id: string): Promise<void> {
const document = new Document({
text: text,
id_: id,
metadata: { title: id },
});
const pipeline = new IngestionPipeline({
transformations: [
new SimpleNodeParser(),
Settings.embedModel,
],
vectorStore,
});
  await pipeline.run({ documents: [document] });
}
export default {
init,
resetCollection,
query,
queryAgent,
ingestText,
};
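A minimal sketch of driving this module directly, assuming a reachable Qdrant instance; the sample text, URL, and question are hypothetical:

import ai from './lib/ai.js';

await ai.init(); // ensure the Qdrant collection exists
await ai.ingestText('Example wiki page text.', 'https://wiki.synchro.net/example'); // hypothetical document and id
const answer = await ai.queryAgent({ query: 'How do I configure the web server?' }); // hypothetical question
if (answer !== undefined) {
  console.log(answer.answer, answer.sources, answer.relevance);
}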

src/lib/config.ts

import dotenv from 'dotenv';
dotenv.config();
export default {
embeddingModel: process.env.OPENAI_EMBEDDING_MODEL ?? 'text-embedding-3-small',
embeddingDimension: parseInt(process.env.EMBEDDING_DIMENSION ?? '1536', 10),
  httpPort: parseInt(process.env.HTTP_PORT ?? '3000', 10),
llm: 'gpt-3.5-turbo-0125',
qdrantCollection: 'synchronet',
qdrantUrl: `http://${process.env.QDRANT_HOST ?? 'qdrant'}:${process.env.QDRANT_PORT ?? '6333'}`,
}
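A matching .env might look like the following; every value is illustrative, and OPENAI_API_KEY is an assumption here: config.ts does not read it, but the OpenAI client conventionally picks it up from the environment.

OPENAI_API_KEY=sk-...                         # hypothetical placeholder
OPENAI_EMBEDDING_MODEL=text-embedding-3-small
EMBEDDING_DIMENSION=1536
HTTP_PORT=3000
QDRANT_HOST=qdrant
QDRANT_PORT=6333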

src/lib/wiki.ts

import fs from 'fs';
import path from 'path';
import puppeteer from 'puppeteer';
import { ingestText, resetCollection } from './ai.js';
const BASE_URL = 'https://wiki.synchro.net';
const OUT_FILE = path.join(path.resolve(), 'data', 'wiki.json');
export async function extractData(): Promise<void> {
  if (fs.existsSync(OUT_FILE)) {
    // Skip the scrape entirely if the cached copy is less than 24 hours old
    const { mtime } = fs.statSync(OUT_FILE);
    if ((new Date()).getTime() - mtime.getTime() < (1000 * 60 * 60 * 24)) return;
  }
const browser = await puppeteer.launch({
executablePath: '/usr/bin/google-chrome',
args: [
'--no-sandbox',
],
});
const page = await browser.newPage();
await page.goto(`${BASE_URL}/wiki:site_index`);
await page.setViewport({ width: 1080, height: 1024 });
  const paths = await page.evaluate(() => {
    // querySelector can return null, so bail out gracefully instead of throwing
    const tree = document.querySelector('div > div.indexmenu_nojs > ul[role="tree"].idx');
    if (tree === null) return;
    const paths: string[] = [];
    for (const element of tree.getElementsByTagName('a')) {
      const href = element.getAttribute('href');
      if (href !== null) paths.push(href);
    }
    return paths;
  });
  if (paths === undefined) {
    await browser.close();
    return;
  }
  const pages: Record<string, string> = {};
  for (const p of paths) {
    if (p.search(/^\/es:/) === 0) continue; // skip the Spanish-language namespace
    if (p.search(/^\/wiki:/) === 0) continue; // skip wiki meta pages
    console.debug(p);
    const articlePage = await browser.newPage(); // renamed so it no longer shadows the index page above
    await articlePage.goto(`${BASE_URL}${p}`);
    await articlePage.setViewport({ width: 1080, height: 1024 });
    const content = await articlePage.evaluate(() => {
      const body = document.getElementById('bodyContent')?.innerHTML;
      if (body === undefined) return;
      const match = body.match(/<!-- start rendered wiki content -->([\s\S]*?)<!-- end rendered wiki content -->/);
      if (match === null) return;
      return match[1];
    });
    await articlePage.close(); // close each tab so long crawls don't accumulate open pages
    if (content === undefined) continue;
    pages[p] = content;
  }
fs.writeFileSync(OUT_FILE, JSON.stringify(pages));
await browser.close();
}
export async function embedData(): Promise<void> {
await extractData();
await resetCollection();
const wiki: Record<string, string> = JSON.parse(fs.readFileSync(OUT_FILE, 'utf8'));
  for (const page in wiki) {
    // The key is the wiki path, so appending it to BASE_URL yields the canonical source URL
    await ingestText(wiki[page], `${BASE_URL}${page}`);
  }
}
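Note that embedData resets the entire collection before re-ingesting, so a run that fails partway can leave the index incomplete until the next successful pass; the 24-hour cache in extractData at least avoids re-scraping the wiki on every call.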

tsconfig.json

{
"compilerOptions": {
"module": "ESNext",
"esModuleInterop": true,
"target": "ESNext",
"moduleResolution": "Bundler",
"sourceMap": true,
"outDir": "build",
"skipLibCheck": true
},
"include": ["src/**/*"],
"exclude": ["node_modules"]
}