|
5 | 5 | package core |
6 | 6 |
|
7 | 7 | import ( |
| 8 | + "fmt" |
8 | 9 | "strings" |
9 | 10 | "time" |
| 11 | + "unsafe" |
10 | 12 | ) |
11 | 13 |
|
12 | 14 | type RichLabel struct { |
@@ -41,8 +43,10 @@ type Document struct { |
41 | 43 | Title string `json:"title,omitempty" elastic_mapping:"title:{type:text,copy_to:combined_fulltext,fields:{keyword: {type: keyword}, pinyin: {type: text, analyzer: pinyin_analyzer}}}"` // Document title |
42 | 44 | Summary string `json:"summary,omitempty" elastic_mapping:"summary:{type:text,copy_to:combined_fulltext}"` // Brief summary or description of the document |
43 | 45 |
|
44 | | - Lang string `json:"lang,omitempty" elastic_mapping:"lang:{type:keyword,copy_to:combined_fulltext}"` // Language code (e.g., "en", "fr") |
45 | | - Content string `json:"content,omitempty" elastic_mapping:"content:{type:text,copy_to:combined_fulltext}"` // Document content for full-text indexing |
| 46 | + Lang string `json:"lang,omitempty" elastic_mapping:"lang:{type:keyword,copy_to:combined_fulltext}"` // Language code (e.g., "en", "fr") |
| 47 | + Content string `json:"content,omitempty" elastic_mapping:"content:{type:text,copy_to:combined_fulltext}"` // Document content for full-text indexing |
| 48 | + Text []PageText `json:"text,omitempty" elastic_mapping:"text:{type:nested}"` // Document content in text for full-text indexing |
| 49 | + Embedding []Embedding `json:"embedding,omitempty" elastic_mapping:"embedding:{type:nested}"` |
46 | 50 |
|
47 | 51 | Icon string `json:"icon,omitempty" elastic_mapping:"icon:{enabled:false}"` // Icon Key, need work with datasource's assets to get the icon url, if it is a full url, then use it directly |
48 | 52 | Thumbnail string `json:"thumbnail,omitempty" elastic_mapping:"thumbnail:{enabled:false}"` // Thumbnail image URL, for preview purposes |
@@ -113,3 +117,149 @@ type UserInfo struct { |
113 | 117 | UserName string `json:"username,omitempty" elastic_mapping:"username:{type:keyword,copy_to:combined_fulltext}"` // Login of the user |
114 | 118 | UserID string `json:"userid,omitempty" elastic_mapping:"userid:{type:keyword,copy_to:combined_fulltext}"` // Unique identifier for the user |
115 | 119 | } |
| 120 | + |
| 121 | +type PageText struct { |
| 122 | + PageNumber int `json:"page_number" elastic_mapping:"page_number:{type:integer}"` |
| 123 | + Content string `json:"content" elastic_mapping:"content:{type:text,analyzer:combined_text_analyzer}"` |
| 124 | +} |
| 125 | + |
| 126 | +type Embedding struct { |
| 127 | + ModelProvider string `json:"model_provider" elastic_mapping:"model_provider:{type:keyword}"` |
| 128 | + Model string `json:"model" elastic_mapping:"model:{type:keyword}"` |
| 129 | + EmbeddingDimension int32 `json:"embedding_dimension" elastic_mapping:"embedding_dimension:{type:integer}"` |
| 130 | + |
| 131 | + /* |
| 132 | + A document will be split into chunks. An `EmbeddingXxx` field will be used |
| 133 | + to store these chunks' embeddings. |
| 134 | +
|
| 135 | + Only 1 field will be used, depending on the chosen embedding dimension, see |
| 136 | + the `EmbeddingDimension` field above. |
| 137 | +
|
| 138 | + Having so many `EmbeddingXxx` fields is embarrasing, but we have no choice |
| 139 | + since vector dimension is part of the type information and elastic mapping |
| 140 | + has to be static. |
| 141 | + */ |
| 142 | + Embeddings128 []ChunkEmbedding128 `json:"embeddings128" elastic_mapping:"embeddings128:{type:nested}"` |
| 143 | + Embeddings256 []ChunkEmbedding256 `json:"embeddings256" elastic_mapping:"embeddings256:{type:nested}"` |
| 144 | + Embeddings384 []ChunkEmbedding384 `json:"embeddings384" elastic_mapping:"embeddings384:{type:nested}"` |
| 145 | + Embeddings512 []ChunkEmbedding512 `json:"embeddings512" elastic_mapping:"embeddings512:{type:nested}"` |
| 146 | + Embeddings768 []ChunkEmbedding768 `json:"embeddings768" elastic_mapping:"embeddings768:{type:nested}"` |
| 147 | + Embeddings1024 []ChunkEmbedding1024 `json:"embeddings1024" elastic_mapping:"embeddings1024:{type:nested}"` |
| 148 | + Embeddings1536 []ChunkEmbedding1536 `json:"embeddings1536" elastic_mapping:"embeddings1536:{type:nested}"` |
| 149 | + Embeddings2048 []ChunkEmbedding2048 `json:"embeddings2048" elastic_mapping:"embeddings2048:{type:nested}"` |
| 150 | + Embeddings2560 []ChunkEmbedding2560 `json:"embeddings2560" elastic_mapping:"embeddings2560:{type:nested}"` |
| 151 | + Embeddings4096 []ChunkEmbedding4096 `json:"embeddings4096" elastic_mapping:"embeddings4096:{type:nested}"` |
| 152 | +} |
| 153 | + |
| 154 | +// Set the `EmbeddingsXxx` field using the value provided by `chunkEmbeddings`. |
| 155 | +// |
| 156 | +// # Panic |
| 157 | +// |
| 158 | +// Field `EmbeddingDimension` should be set before calling this function, or it |
| 159 | +// panics. |
| 160 | +func (e *Embedding) SetEmbeddings(chunkEmbeddings []ChunkEmbedding) { |
| 161 | + if e.EmbeddingDimension == 0 { |
| 162 | + panic("Embedding.EmbeddingDimension is not set (value: 0), don't know which field to set") |
| 163 | + } |
| 164 | + |
| 165 | + // ChunkEmbedding and other ChunkEmbeddingXxx types have the same memory |
| 166 | + // representation so the cast is safe here. |
| 167 | + switch e.EmbeddingDimension { |
| 168 | + case 128: |
| 169 | + e.Embeddings128 = *(*[]ChunkEmbedding128)(unsafe.Pointer(&chunkEmbeddings)) |
| 170 | + case 256: |
| 171 | + e.Embeddings256 = *(*[]ChunkEmbedding256)(unsafe.Pointer(&chunkEmbeddings)) |
| 172 | + case 384: |
| 173 | + e.Embeddings384 = *(*[]ChunkEmbedding384)(unsafe.Pointer(&chunkEmbeddings)) |
| 174 | + case 512: |
| 175 | + e.Embeddings512 = *(*[]ChunkEmbedding512)(unsafe.Pointer(&chunkEmbeddings)) |
| 176 | + case 768: |
| 177 | + e.Embeddings768 = *(*[]ChunkEmbedding768)(unsafe.Pointer(&chunkEmbeddings)) |
| 178 | + case 1024: |
| 179 | + e.Embeddings1024 = *(*[]ChunkEmbedding1024)(unsafe.Pointer(&chunkEmbeddings)) |
| 180 | + case 1536: |
| 181 | + e.Embeddings1536 = *(*[]ChunkEmbedding1536)(unsafe.Pointer(&chunkEmbeddings)) |
| 182 | + case 2048: |
| 183 | + e.Embeddings2048 = *(*[]ChunkEmbedding2048)(unsafe.Pointer(&chunkEmbeddings)) |
| 184 | + case 2560: |
| 185 | + e.Embeddings2560 = *(*[]ChunkEmbedding2560)(unsafe.Pointer(&chunkEmbeddings)) |
| 186 | + case 4096: |
| 187 | + e.Embeddings4096 = *(*[]ChunkEmbedding4096)(unsafe.Pointer(&chunkEmbeddings)) |
| 188 | + default: |
| 189 | + panic(fmt.Sprintf("unsupported embedding dimension: %d\n", e.EmbeddingDimension)) |
| 190 | + } |
| 191 | +} |
| 192 | + |
| 193 | +// Range of this chunk. |
| 194 | +// |
| 195 | +// A chunk contains roughly the same amount of tokens, say 8192 tokens. And |
| 196 | +// thus, a chunk can span many pages if these pages are small, or it is only |
| 197 | +// part of a page if it is big. |
| 198 | +// |
| 199 | +// In the later case, `Start` and `End` will be in format "<page num>-<sub-page num>" |
| 200 | +// that "<sub-page num>" specifies the part of that page. |
| 201 | +type ChunkRange struct { |
| 202 | + // Start page of this chunk. |
| 203 | + Start int `json:"start" elastic_mapping:"start:{type:integer}"` |
| 204 | + // End page of this chuhk. This is **inclusive**. |
| 205 | + End int `json:"end" elastic_mapping:"end:{type:integer}"` |
| 206 | +} |
| 207 | + |
| 208 | +// A `ChunkEmbedding` definition without any tag information. |
| 209 | +// |
| 210 | +// It should have the same memory representation as other `ChunkEmbeddingXxx` |
| 211 | +// variants. |
| 212 | +type ChunkEmbedding struct { |
| 213 | + Range ChunkRange |
| 214 | + Embedding []float32 |
| 215 | +} |
| 216 | + |
| 217 | +type ChunkEmbedding128 struct { |
| 218 | + Range ChunkRange `json:"page_range" elastic_mapping:"page_range:{type:object}"` |
| 219 | + Embedding []float32 `json:"embedding" elastic_mapping:"embedding:{type:knn_dense_float_vector,knn:{dims:128}}"` |
| 220 | +} |
| 221 | + |
| 222 | +type ChunkEmbedding256 struct { |
| 223 | + Range ChunkRange `json:"page_range" elastic_mapping:"page_range:{type:object}"` |
| 224 | + Embedding []float32 `json:"embedding" elastic_mapping:"embedding:{type:knn_dense_float_vector,knn:{dims:256}}"` |
| 225 | +} |
| 226 | + |
| 227 | +type ChunkEmbedding384 struct { |
| 228 | + Range ChunkRange `json:"page_range" elastic_mapping:"page_range:{type:object}"` |
| 229 | + Embedding []float32 `json:"embedding" elastic_mapping:"embedding:{type:knn_dense_float_vector,knn:{dims:384}}"` |
| 230 | +} |
| 231 | + |
| 232 | +type ChunkEmbedding512 struct { |
| 233 | + Range ChunkRange `json:"page_range" elastic_mapping:"page_range:{type:object}"` |
| 234 | + Embedding []float32 `json:"embedding" elastic_mapping:"embedding:{type:knn_dense_float_vector,knn:{dims:512}}"` |
| 235 | +} |
| 236 | + |
| 237 | +type ChunkEmbedding768 struct { |
| 238 | + Range ChunkRange `json:"page_range" elastic_mapping:"page_range:{type:object}"` |
| 239 | + Embedding []float32 `json:"embedding" elastic_mapping:"embedding:{type:knn_dense_float_vector,knn:{dims:768}}"` |
| 240 | +} |
| 241 | + |
| 242 | +type ChunkEmbedding1024 struct { |
| 243 | + Range ChunkRange `json:"page_range" elastic_mapping:"page_range:{type:object}"` |
| 244 | + Embedding []float32 `json:"embedding" elastic_mapping:"embedding:{type:knn_dense_float_vector,knn:{dims:1024}}"` |
| 245 | +} |
| 246 | + |
| 247 | +type ChunkEmbedding1536 struct { |
| 248 | + Range ChunkRange `json:"page_range" elastic_mapping:"page_range:{type:object}"` |
| 249 | + Embedding []float32 `json:"embedding" elastic_mapping:"embedding:{type:knn_dense_float_vector,knn:{dims:1536}}"` |
| 250 | +} |
| 251 | + |
| 252 | +type ChunkEmbedding2048 struct { |
| 253 | + Range ChunkRange `json:"page_range" elastic_mapping:"page_range:{type:object}"` |
| 254 | + Embedding []float32 `json:"embedding" elastic_mapping:"embedding:{type:knn_dense_float_vector,knn:{dims:2048}}"` |
| 255 | +} |
| 256 | + |
| 257 | +type ChunkEmbedding2560 struct { |
| 258 | + Range ChunkRange `json:"page_range" elastic_mapping:"page_range:{type:object}"` |
| 259 | + Embedding []float32 `json:"embedding" elastic_mapping:"embedding:{type:knn_dense_float_vector,knn:{dims:2560}}"` |
| 260 | +} |
| 261 | + |
| 262 | +type ChunkEmbedding4096 struct { |
| 263 | + Range ChunkRange `json:"page_range" elastic_mapping:"page_range:{type:object}"` |
| 264 | + Embedding []float32 `json:"embedding" elastic_mapping:"embedding:{type:knn_dense_float_vector,knn:{dims:4096}}"` |
| 265 | +} |
0 commit comments