Skip to content

Commit 6372f14

Browse files
committed
feat: processors for enriching documents
1 parent dbbb299 commit 6372f14

File tree

9 files changed

+1098
-132
lines changed

9 files changed

+1098
-132
lines changed

coco.yml

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
env:
2-
ES_ENDPOINT: https://localhost:9200
2+
ES_ENDPOINT: http://192.168.3.119:9200
33
ES_USERNAME: admin
44
ES_PASSWORD: $[[keystore.ES_PASSWORD]]
55
WEB_BINDING: 0.0.0.0:9000
@@ -311,7 +311,7 @@ web:
311311
##background jobs
312312
pipeline:
313313
- name: enrich_documents
314-
auto_start: false
314+
auto_start: true
315315
keep_running: true
316316
processor:
317317
- consumer:
@@ -323,21 +323,33 @@ pipeline:
323323
group: enriched_documents
324324
fetch_max_messages: 10
325325
processor:
326+
- read_file_content: {}
327+
- extract_file_text:
328+
tika_endpoint: "http://127.0.0.1:9998"
329+
timeout_in_seconds: 180
326330
- document_summarization:
327-
model: $[[env.ENRICHMENT_MODEL]]
328-
input_queue: "indexing_documents"
329-
min_input_document_length: 500
331+
model_provider: deepseek
332+
model: deepseek-chat
333+
model_context_length: 128000
334+
max_summary_length: 300
335+
- document_embedding:
336+
model_provider: qianwen
337+
model: text-embedding-v4
338+
# Dimension of the vectors generated by your embedding model
339+
embedding_dimension: 1024
340+
# Chunk size in characters, it is recommended to use a value
341+
# lower than you embedding model's input limit, e.g., if the
342+
# limit is 8192 tokens, then we can set chunk size to 7000.
343+
chunk_size: 7000
330344
output_queue:
331345
name: "enriched_documents"
332-
label:
333-
tag: "enriched"
334346

335347
- name: merge_documents
336348
auto_start: true
337349
keep_running: true
338350
processor:
339351
- indexing_merge:
340-
input_queue: "indexing_documents"
352+
input_queue: "enriched_documents"
341353
idle_timeout_in_seconds: 1
342354
elasticsearch: "prod"
343355
index_name: "coco_document-v2"
@@ -362,6 +374,7 @@ pipeline:
362374
queues:
363375
type: indexing_merge
364376
tag: "merged"
377+
365378
- name: connector_dispatcher
366379
auto_start: true
367380
keep_running: true

core/document.go

Lines changed: 152 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@
55
package core
66

77
import (
8+
"fmt"
89
"strings"
910
"time"
11+
"unsafe"
1012
)
1113

1214
type RichLabel struct {
@@ -41,8 +43,10 @@ type Document struct {
4143
Title string `json:"title,omitempty" elastic_mapping:"title:{type:text,copy_to:combined_fulltext,fields:{keyword: {type: keyword}, pinyin: {type: text, analyzer: pinyin_analyzer}}}"` // Document title
4244
Summary string `json:"summary,omitempty" elastic_mapping:"summary:{type:text,copy_to:combined_fulltext}"` // Brief summary or description of the document
4345

44-
Lang string `json:"lang,omitempty" elastic_mapping:"lang:{type:keyword,copy_to:combined_fulltext}"` // Language code (e.g., "en", "fr")
45-
Content string `json:"content,omitempty" elastic_mapping:"content:{type:text,copy_to:combined_fulltext}"` // Document content for full-text indexing
46+
Lang string `json:"lang,omitempty" elastic_mapping:"lang:{type:keyword,copy_to:combined_fulltext}"` // Language code (e.g., "en", "fr")
47+
Content string `json:"content,omitempty" elastic_mapping:"content:{type:text,copy_to:combined_fulltext}"` // Document content for full-text indexing
48+
Text []PageText `json:"text,omitempty" elastic_mapping:"text:{type:nested}"` // Document content in text for full-text indexing
49+
Embedding []Embedding `json:"embedding,omitempty" elastic_mapping:"embedding:{type:nested}"`
4650

4751
Icon string `json:"icon,omitempty" elastic_mapping:"icon:{enabled:false}"` // Icon Key, need work with datasource's assets to get the icon url, if it is a full url, then use it directly
4852
Thumbnail string `json:"thumbnail,omitempty" elastic_mapping:"thumbnail:{enabled:false}"` // Thumbnail image URL, for preview purposes
@@ -113,3 +117,149 @@ type UserInfo struct {
113117
UserName string `json:"username,omitempty" elastic_mapping:"username:{type:keyword,copy_to:combined_fulltext}"` // Login of the user
114118
UserID string `json:"userid,omitempty" elastic_mapping:"userid:{type:keyword,copy_to:combined_fulltext}"` // Unique identifier for the user
115119
}
120+
121+
type PageText struct {
122+
PageNumber int `json:"page_number" elastic_mapping:"page_number:{type:integer}"`
123+
Content string `json:"content" elastic_mapping:"content:{type:text,analyzer:combined_text_analyzer}"`
124+
}
125+
126+
type Embedding struct {
127+
ModelProvider string `json:"model_provider" elastic_mapping:"model_provider:{type:keyword}"`
128+
Model string `json:"model" elastic_mapping:"model:{type:keyword}"`
129+
EmbeddingDimension int32 `json:"embedding_dimension" elastic_mapping:"embedding_dimension:{type:integer}"`
130+
131+
/*
132+
A document will be split into chunks. An `EmbeddingXxx` field will be used
133+
to store these chunks' embeddings.
134+
135+
Only 1 field will be used, depending on the chosen embedding dimension, see
136+
the `EmbeddingDimension` field above.
137+
138+
Having so many `EmbeddingXxx` fields is embarrasing, but we have no choice
139+
since vector dimension is part of the type information and elastic mapping
140+
has to be static.
141+
*/
142+
Embeddings128 []ChunkEmbedding128 `json:"embeddings128" elastic_mapping:"embeddings128:{type:nested}"`
143+
Embeddings256 []ChunkEmbedding256 `json:"embeddings256" elastic_mapping:"embeddings256:{type:nested}"`
144+
Embeddings384 []ChunkEmbedding384 `json:"embeddings384" elastic_mapping:"embeddings384:{type:nested}"`
145+
Embeddings512 []ChunkEmbedding512 `json:"embeddings512" elastic_mapping:"embeddings512:{type:nested}"`
146+
Embeddings768 []ChunkEmbedding768 `json:"embeddings768" elastic_mapping:"embeddings768:{type:nested}"`
147+
Embeddings1024 []ChunkEmbedding1024 `json:"embeddings1024" elastic_mapping:"embeddings1024:{type:nested}"`
148+
Embeddings1536 []ChunkEmbedding1536 `json:"embeddings1536" elastic_mapping:"embeddings1536:{type:nested}"`
149+
Embeddings2048 []ChunkEmbedding2048 `json:"embeddings2048" elastic_mapping:"embeddings2048:{type:nested}"`
150+
Embeddings2560 []ChunkEmbedding2560 `json:"embeddings2560" elastic_mapping:"embeddings2560:{type:nested}"`
151+
Embeddings4096 []ChunkEmbedding4096 `json:"embeddings4096" elastic_mapping:"embeddings4096:{type:nested}"`
152+
}
153+
154+
// Set the `EmbeddingsXxx` field using the value provided by `chunkEmbeddings`.
155+
//
156+
// # Panic
157+
//
158+
// Field `EmbeddingDimension` should be set before calling this function, or it
159+
// panics.
160+
func (e *Embedding) SetEmbeddings(chunkEmbeddings []ChunkEmbedding) {
161+
if e.EmbeddingDimension == 0 {
162+
panic("Embedding.EmbeddingDimension is not set (value: 0), don't know which field to set")
163+
}
164+
165+
// ChunkEmbedding and other ChunkEmbeddingXxx types have the same memory
166+
// representation so the cast is safe here.
167+
switch e.EmbeddingDimension {
168+
case 128:
169+
e.Embeddings128 = *(*[]ChunkEmbedding128)(unsafe.Pointer(&chunkEmbeddings))
170+
case 256:
171+
e.Embeddings256 = *(*[]ChunkEmbedding256)(unsafe.Pointer(&chunkEmbeddings))
172+
case 384:
173+
e.Embeddings384 = *(*[]ChunkEmbedding384)(unsafe.Pointer(&chunkEmbeddings))
174+
case 512:
175+
e.Embeddings512 = *(*[]ChunkEmbedding512)(unsafe.Pointer(&chunkEmbeddings))
176+
case 768:
177+
e.Embeddings768 = *(*[]ChunkEmbedding768)(unsafe.Pointer(&chunkEmbeddings))
178+
case 1024:
179+
e.Embeddings1024 = *(*[]ChunkEmbedding1024)(unsafe.Pointer(&chunkEmbeddings))
180+
case 1536:
181+
e.Embeddings1536 = *(*[]ChunkEmbedding1536)(unsafe.Pointer(&chunkEmbeddings))
182+
case 2048:
183+
e.Embeddings2048 = *(*[]ChunkEmbedding2048)(unsafe.Pointer(&chunkEmbeddings))
184+
case 2560:
185+
e.Embeddings2560 = *(*[]ChunkEmbedding2560)(unsafe.Pointer(&chunkEmbeddings))
186+
case 4096:
187+
e.Embeddings4096 = *(*[]ChunkEmbedding4096)(unsafe.Pointer(&chunkEmbeddings))
188+
default:
189+
panic(fmt.Sprintf("unsupported embedding dimension: %d\n", e.EmbeddingDimension))
190+
}
191+
}
192+
193+
// Range of this chunk.
194+
//
195+
// A chunk contains roughly the same amount of tokens, say 8192 tokens. And
196+
// thus, a chunk can span many pages if these pages are small, or it is only
197+
// part of a page if it is big.
198+
//
199+
// In the later case, `Start` and `End` will be in format "<page num>-<sub-page num>"
200+
// that "<sub-page num>" specifies the part of that page.
201+
type ChunkRange struct {
202+
// Start page of this chunk.
203+
Start int `json:"start" elastic_mapping:"start:{type:integer}"`
204+
// End page of this chuhk. This is **inclusive**.
205+
End int `json:"end" elastic_mapping:"end:{type:integer}"`
206+
}
207+
208+
// A `ChunkEmbedding` definition without any tag information.
209+
//
210+
// It should have the same memory representation as other `ChunkEmbeddingXxx`
211+
// variants.
212+
type ChunkEmbedding struct {
213+
Range ChunkRange
214+
Embedding []float32
215+
}
216+
217+
type ChunkEmbedding128 struct {
218+
Range ChunkRange `json:"page_range" elastic_mapping:"page_range:{type:object}"`
219+
Embedding []float32 `json:"embedding" elastic_mapping:"embedding:{type:knn_dense_float_vector,knn:{dims:128}}"`
220+
}
221+
222+
type ChunkEmbedding256 struct {
223+
Range ChunkRange `json:"page_range" elastic_mapping:"page_range:{type:object}"`
224+
Embedding []float32 `json:"embedding" elastic_mapping:"embedding:{type:knn_dense_float_vector,knn:{dims:256}}"`
225+
}
226+
227+
type ChunkEmbedding384 struct {
228+
Range ChunkRange `json:"page_range" elastic_mapping:"page_range:{type:object}"`
229+
Embedding []float32 `json:"embedding" elastic_mapping:"embedding:{type:knn_dense_float_vector,knn:{dims:384}}"`
230+
}
231+
232+
type ChunkEmbedding512 struct {
233+
Range ChunkRange `json:"page_range" elastic_mapping:"page_range:{type:object}"`
234+
Embedding []float32 `json:"embedding" elastic_mapping:"embedding:{type:knn_dense_float_vector,knn:{dims:512}}"`
235+
}
236+
237+
type ChunkEmbedding768 struct {
238+
Range ChunkRange `json:"page_range" elastic_mapping:"page_range:{type:object}"`
239+
Embedding []float32 `json:"embedding" elastic_mapping:"embedding:{type:knn_dense_float_vector,knn:{dims:768}}"`
240+
}
241+
242+
type ChunkEmbedding1024 struct {
243+
Range ChunkRange `json:"page_range" elastic_mapping:"page_range:{type:object}"`
244+
Embedding []float32 `json:"embedding" elastic_mapping:"embedding:{type:knn_dense_float_vector,knn:{dims:1024}}"`
245+
}
246+
247+
type ChunkEmbedding1536 struct {
248+
Range ChunkRange `json:"page_range" elastic_mapping:"page_range:{type:object}"`
249+
Embedding []float32 `json:"embedding" elastic_mapping:"embedding:{type:knn_dense_float_vector,knn:{dims:1536}}"`
250+
}
251+
252+
type ChunkEmbedding2048 struct {
253+
Range ChunkRange `json:"page_range" elastic_mapping:"page_range:{type:object}"`
254+
Embedding []float32 `json:"embedding" elastic_mapping:"embedding:{type:knn_dense_float_vector,knn:{dims:2048}}"`
255+
}
256+
257+
type ChunkEmbedding2560 struct {
258+
Range ChunkRange `json:"page_range" elastic_mapping:"page_range:{type:object}"`
259+
Embedding []float32 `json:"embedding" elastic_mapping:"embedding:{type:knn_dense_float_vector,knn:{dims:2560}}"`
260+
}
261+
262+
type ChunkEmbedding4096 struct {
263+
Range ChunkRange `json:"page_range" elastic_mapping:"page_range:{type:object}"`
264+
Embedding []float32 `json:"embedding" elastic_mapping:"embedding:{type:knn_dense_float_vector,knn:{dims:4096}}"`
265+
}

modules/assistant/langchain/llm.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,16 @@
55
package langchain
66

77
import (
8+
"net/http"
9+
"net/http/httputil"
10+
811
log "github.com/cihub/seelog"
912
"github.com/tmc/langchaingo/llms"
1013
"github.com/tmc/langchaingo/llms/ollama"
1114
"github.com/tmc/langchaingo/llms/openai"
1215
"infini.sh/coco/core"
1316
"infini.sh/coco/modules/common"
1417
"infini.sh/framework/core/global"
15-
"net/http"
16-
"net/http/httputil"
1718
)
1819

1920
type LoggingRoundTripper struct {
@@ -61,12 +62,14 @@ func GetLLM(endpoint, apiType, model, token string, keepalive string) llms.Model
6162
openai.WithToken(token),
6263
openai.WithBaseURL(endpoint),
6364
openai.WithModel(model),
65+
openai.WithEmbeddingModel(model),
6466
)
6567
} else {
6668
llm, err = openai.New(
6769
openai.WithToken(token),
6870
openai.WithBaseURL(endpoint),
6971
openai.WithModel(model),
72+
openai.WithEmbeddingModel(model),
7073
)
7174
}
7275

plugins/connectors/local_fs/plugin.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -154,9 +154,10 @@ func (p *Plugin) saveDocument(ctx *pipeline.Context, currentPath, basePath strin
154154
Source: core.DataSourceReference{ID: datasource.ID, Type: "connector", Name: datasource.Name},
155155
Type: connectors.TypeFile,
156156
Category: filepath.Dir(currentPath),
157-
Content: "", // skip content
158-
URL: currentPath,
159-
Size: int(fileInfo.Size()),
157+
// skip content here, which will be popluated by the `read_file_content` processor
158+
Content: "",
159+
URL: currentPath,
160+
Size: int(fileInfo.Size()),
160161
}
161162
doc.System = datasource.System
162163
if doc.System == nil {

0 commit comments

Comments
 (0)