File size: 2,885 Bytes
872630d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import { pipeline } from "@xenova/transformers";

// Chunk length handed to addDocumentsToStore by this worker's message handler.
// Chunks are cut with String#substring, so the unit is UTF-16 code units.
const CHUNK_SIZE = 1000;

/**
 * Minimal in-memory vector store.
 *
 * Embeddings and their documents live in two parallel arrays; ranking is done
 * on demand by scoring every stored embedding against the query.
 */
export class SimpleVectorStore {
  constructor() {
    // Parallel arrays: embeddings[i] is the vector stored for documents[i].
    this.documents = [];
    this.embeddings = [];
  }

  /**
   * Appends one embedding/document pair to the store.
   *
   * @param {*} embedding - vector later passed as the first argument to cosineSimilarity.
   * @param {*} document - payload returned alongside the score in search results.
   */
  addDocument(embedding, document) {
    this.embeddings.push(embedding);
    this.documents.push(document);
  }

  /**
   * Scores every stored embedding against the query and returns the best matches.
   *
   * @param {*} queryEmbedding - vector passed as the second argument to cosineSimilarity.
   * @param {number} [topK] - upper bound on results (used as the end index of `slice`,
   *   so omitting it returns every stored document).
   * @returns {Promise<Array<{document: *, score: number}>>} matches, highest score first.
   */
  async similaritySearch(queryEmbedding, topK) {
    const ranked = [];
    this.embeddings.forEach((embedding, position) => {
      ranked.push({
        position,
        similarity: cosineSimilarity(embedding, queryEmbedding),
      });
    });

    // Descending order: most similar entries come first.
    ranked.sort((left, right) => right.similarity - left.similarity);

    const hits = [];
    for (const { position, similarity } of ranked.slice(0, topK)) {
      hits.push({ document: this.documents[position], score: similarity });
    }
    return hits;
  }
}

/**
 * Computes the cosine similarity of two numeric vectors.
 *
 * @param {number[]|Float32Array} vecA - first vector (anything exposing `reduce`).
 * @param {number[]|Float32Array} vecB - second vector; expected to be the same
 *   length as `vecA`, since it is indexed with `vecA`'s positions.
 * @returns {number} dot(vecA, vecB) / (|vecA| * |vecB|), or 0 when either
 *   vector has zero magnitude (including empty vectors).
 */
export function cosineSimilarity(vecA, vecB) {
  const dotProduct = vecA.reduce((acc, val, i) => acc + val * vecB[i], 0);
  const magA = Math.sqrt(vecA.reduce((acc, val) => acc + val * val, 0));
  const magB = Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0));
  const denominator = magA * magB;

  // A zero vector has no direction, and 0 / 0 would yield NaN. NaN scores make
  // subtraction-based sort comparators inconsistent, so report "no similarity".
  if (denominator === 0) {
    return 0;
  }
  return dotProduct / denominator;
}

/**
 * Embeds text with a "feature-extraction" pipeline and keeps the resulting
 * vectors in an in-memory SimpleVectorStore for similarity search.
 */
class EmbeddingsWorker {
  /**
   * @param {string} [modelName] - model identifier forwarded to `pipeline`.
   */
  constructor(modelName = "Xenova/all-MiniLM-L6-v2") {
    this.modelName = modelName;
    this.client = null;
    // In-flight model load, shared by overlapping loadClient() callers.
    this.clientPromise = null;
    this.vectorStore = new SimpleVectorStore();
  }

  /**
   * Lazily creates the embedding pipeline and stores it on `this.client`.
   *
   * Overlapping calls share a single `pipeline(...)` invocation: checking only
   * `this.client` is not enough, because it stays null until the load resolves
   * and every caller arriving in the meantime would start its own load.
   *
   * @returns {Promise<void>}
   * @throws whatever `pipeline` rejects with; a later call will retry the load.
   */
  async loadClient() {
    if (this.client) {
      return;
    }
    if (!this.clientPromise) {
      this.clientPromise = pipeline("feature-extraction", this.modelName);
    }
    const pending = this.clientPromise;
    try {
      this.client = await pending;
    } catch (err) {
      // Forget the failed attempt (unless a newer one already replaced it) so
      // the next call can try again instead of re-throwing a cached rejection.
      if (this.clientPromise === pending) {
        this.clientPromise = null;
      }
      throw err;
    }
  }

  /**
   * Embeds each text with mean pooling and normalization enabled.
   *
   * @param {string[]} texts - inputs to embed; all are submitted concurrently.
   * @returns {Promise<Array>} the `data` field of each pipeline response, in
   *   input order (presumably a typed array per text — confirm against the library).
   */
  async _embed(texts) {
    await this.loadClient();
    const embedResults = await Promise.all(
      texts.map(async (text) => {
        const response = await this.client(text, {
          pooling: "mean",
          normalize: true,
        });
        return response.data;
      })
    );
    return embedResults;
  }

  /**
   * Splits every document into chunks, embeds them, and stores each
   * chunk/embedding pair. Documents are processed one after another.
   *
   * @param {string[]} docs - documents to index.
   * @param {number} [chunkSize=1000] - chunk length passed to chunkText.
   * @returns {Promise<void>}
   * @throws {RangeError} when `chunkSize` is not a positive number.
   */
  async addDocumentsToStore(docs, chunkSize = 1000) {
    for (const doc of docs) {
      const chunks = this.chunkText(doc, chunkSize);
      const embeddings = await this._embed(chunks);
      embeddings.forEach((embedding, index) => {
        this.vectorStore.addDocument(embedding, chunks[index]);
      });
    }
  }

  /**
   * Cuts `text` into consecutive pieces of at most `size` UTF-16 code units.
   *
   * @param {string} text - text to split.
   * @param {number} size - maximum chunk length; must be a positive number.
   * @returns {string[]} chunks in order; empty for an empty string.
   * @throws {RangeError} when `size` is not a positive number. A size of zero
   *   or less would otherwise never advance the cursor and loop forever.
   */
  chunkText(text, size) {
    if (typeof size !== "number" || !(size > 0)) {
      throw new RangeError(`chunk size must be a positive number, got ${String(size)}`);
    }
    const chunks = [];
    for (let i = 0; i < text.length; i += size) {
      chunks.push(text.substring(i, i + size));
    }
    return chunks;
  }

  /**
   * Embeds the query and returns the closest stored chunks.
   *
   * @param {string} query - text to search for.
   * @param {number} [topK] - forwarded to the store's similaritySearch.
   * @returns {Promise<Array>} whatever the vector store's similaritySearch returns.
   */
  async searchSimilarDocuments(query, topK) {
    const queryEmbedding = await this._embed([query]);
    return this.vectorStore.similaritySearch(queryEmbedding[0], topK);
  }
}

// Single EmbeddingsWorker shared by every message handled in this worker scope.
const worker = new EmbeddingsWorker();

/**
 * Worker message protocol.
 *
 * Requests (event.data):
 *   { action: 'addDocumentsToStore', documents }      -> replies { action: 'documentsAdded' }
 *   { action: 'searchSimilarDocuments', query, topK } -> replies { action: 'searchResults', results }
 * Any other action is ignored.
 *
 * On failure the handler replies { action: 'error', failedAction, message }
 * instead of letting the rejection go unhandled, which would leave the
 * requester waiting for a response that never arrives.
 */
self.addEventListener('message', async (event) => {
  const data = event.data || {};
  try {
    if (data.action === 'addDocumentsToStore') {
      await worker.addDocumentsToStore(data.documents, CHUNK_SIZE);
      self.postMessage({ action: 'documentsAdded' });
    } else if (data.action === 'searchSimilarDocuments') {
      const results = await worker.searchSimilarDocuments(data.query, data.topK);
      self.postMessage({ action: 'searchResults', results });
    }
  } catch (err) {
    // Keep the full error in the worker console; send only a plain string
    // across the thread boundary so the payload is always cloneable.
    console.error(err);
    self.postMessage({
      action: 'error',
      failedAction: data.action,
      message: err instanceof Error ? err.message : String(err),
    });
  }
});