|
5 | 5 | package core |
6 | 6 |
|
7 | 7 | import ( |
| 8 | + "fmt" |
8 | 9 | "strings" |
9 | 10 | "time" |
10 | 11 | ) |
@@ -41,8 +42,9 @@ type Document struct { |
41 | 42 | Title string `json:"title,omitempty" elastic_mapping:"title:{type:text,copy_to:combined_fulltext,fields:{keyword: {type: keyword}, pinyin: {type: text, analyzer: pinyin_analyzer}}}"` // Document title |
42 | 43 | Summary string `json:"summary,omitempty" elastic_mapping:"summary:{type:text,copy_to:combined_fulltext}"` // Brief summary or description of the document |
43 | 44 |
|
44 | | - Lang string `json:"lang,omitempty" elastic_mapping:"lang:{type:keyword,copy_to:combined_fulltext}"` // Language code (e.g., "en", "fr") |
45 | | - Content string `json:"content,omitempty" elastic_mapping:"content:{type:text,copy_to:combined_fulltext}"` // Document content for full-text indexing |
| 45 | + Lang string `json:"lang,omitempty" elastic_mapping:"lang:{type:keyword,copy_to:combined_fulltext}"` // Language code (e.g., "en", "fr") |
| 46 | + Content string `json:"content,omitempty" elastic_mapping:"content:{type:text,copy_to:combined_fulltext}"` // Document content for full-text indexing |
| 47 | + Chunks []DocumentChunk `json:"document_chunk,omitempty" elastic_mapping:"document_chunk:{type:nested}"` |
46 | 48 |
|
47 | 49 | Icon string `json:"icon,omitempty" elastic_mapping:"icon:{enabled:false}"` // Icon Key, need work with datasource's assets to get the icon url, if it is a full url, then use it directly |
48 | 50 | Thumbnail string `json:"thumbnail,omitempty" elastic_mapping:"thumbnail:{enabled:false}"` // Thumbnail image URL, for preview purposes |
@@ -113,3 +115,78 @@ type UserInfo struct { |
113 | 115 | UserName string `json:"username,omitempty" elastic_mapping:"username:{type:keyword,copy_to:combined_fulltext}"` // Login of the user |
114 | 116 | UserID string `json:"userid,omitempty" elastic_mapping:"userid:{type:keyword,copy_to:combined_fulltext}"` // Unique identifier for the user |
115 | 117 | } |
| 118 | + |
// DocumentChunk is a single chunk of a Document; Document keeps its chunks
// in the Chunks field, mapped as a nested "document_chunk" field.
//
// None of the JSON tags use omitempty, so all three keys are always emitted
// when a chunk is serialized.
type DocumentChunk struct {
	// Range is the span of pages this chunk covers (see ChunkRange).
	Range ChunkRange `json:"range" elastic_mapping:"range:{type:object}"`
	// Text is the chunk's text, mapped as a full-text field.
	Text string `json:"text" elastic_mapping:"text:{type:text}"`
	// Embedding holds the chunk's embedding vector.
	// NOTE(review): presumably computed from Text — confirm against the
	// code that builds chunks.
	Embedding Embedding `json:"embedding" elastic_mapping:"embedding:{type:object}"`
}
| 124 | + |
// Embedding stores a chunk's embedding vector.
//
// Only one field is populated at a time: SetValue picks the field whose
// dimension equals the length of the vector it is given and clears the rest.
// Unused fields are omitted from the JSON output (omitempty).
//
// Having so many EmbeddingXxx fields is embarrassing, but we have no choice:
// the vector dimension is part of the field's type information and the
// elastic mapping has to be static.
//
// If you add or remove fields, update SetValue and
// SupportedEmbeddingDimensions as well.
//
// NOTE(review): the previous comment pointed at a `Dimension` field "above";
// no such field is visible next to this type — confirm where the embedding
// dimension is configured and link it here.
type Embedding struct {
	Embedding128  []float32 `json:"embedding128,omitempty" elastic_mapping:"embedding128:{type:knn_dense_float_vector,knn:{dims:128,model:lsh,similarity:cosine,L:99,k:1}}"`
	Embedding256  []float32 `json:"embedding256,omitempty" elastic_mapping:"embedding256:{type:knn_dense_float_vector,knn:{dims:256,model:lsh,similarity:cosine,L:99,k:1}}"`
	Embedding384  []float32 `json:"embedding384,omitempty" elastic_mapping:"embedding384:{type:knn_dense_float_vector,knn:{dims:384,model:lsh,similarity:cosine,L:99,k:1}}"`
	Embedding512  []float32 `json:"embedding512,omitempty" elastic_mapping:"embedding512:{type:knn_dense_float_vector,knn:{dims:512,model:lsh,similarity:cosine,L:99,k:1}}"`
	Embedding768  []float32 `json:"embedding768,omitempty" elastic_mapping:"embedding768:{type:knn_dense_float_vector,knn:{dims:768,model:lsh,similarity:cosine,L:99,k:1}}"`
	Embedding1024 []float32 `json:"embedding1024,omitempty" elastic_mapping:"embedding1024:{type:knn_dense_float_vector,knn:{dims:1024,model:lsh,similarity:cosine,L:99,k:1}}"`
	Embedding1536 []float32 `json:"embedding1536,omitempty" elastic_mapping:"embedding1536:{type:knn_dense_float_vector,knn:{dims:1536,model:lsh,similarity:cosine,L:99,k:1}}"`
	Embedding2048 []float32 `json:"embedding2048,omitempty" elastic_mapping:"embedding2048:{type:knn_dense_float_vector,knn:{dims:2048,model:lsh,similarity:cosine,L:99,k:1}}"`
	Embedding2560 []float32 `json:"embedding2560,omitempty" elastic_mapping:"embedding2560:{type:knn_dense_float_vector,knn:{dims:2560,model:lsh,similarity:cosine,L:99,k:1}}"`
	Embedding4096 []float32 `json:"embedding4096,omitempty" elastic_mapping:"embedding4096:{type:knn_dense_float_vector,knn:{dims:4096,model:lsh,similarity:cosine,L:99,k:1}}"`
}

// SetValue stores embedding in the field whose dimension equals
// len(embedding) and clears every other field, so that at most one
// EmbeddingXxx field is ever populated.
//
// The slice is stored as-is (not copied); the caller must not modify it
// afterwards.
//
// SetValue panics if len(embedding) is not one of
// SupportedEmbeddingDimensions. In that case the receiver is left unchanged.
func (e *Embedding) SetValue(embedding []float32) {
	dimension := len(embedding)

	// Build the new value on the side: a vector left over from an earlier
	// call with a different dimension must not survive, and a panic below
	// must not leave the receiver half-updated.
	var next Embedding
	switch dimension {
	case 128:
		next.Embedding128 = embedding
	case 256:
		next.Embedding256 = embedding
	case 384:
		next.Embedding384 = embedding
	case 512:
		next.Embedding512 = embedding
	case 768:
		next.Embedding768 = embedding
	case 1024:
		next.Embedding1024 = embedding
	case 1536:
		next.Embedding1536 = embedding
	case 2048:
		next.Embedding2048 = embedding
	case 2560:
		next.Embedding2560 = embedding
	case 4096:
		next.Embedding4096 = embedding
	default:
		// Report the offending dimension; without it the message gives no
		// hint about what the caller actually passed.
		panic(fmt.Sprintf("embedding's dimension %d is invalid, we accept %v", dimension, SupportedEmbeddingDimensions))
	}
	*e = next
}

// SupportedEmbeddingDimensions lists the embedding dimensions we support.
// It must be kept in sync with the EmbeddingXxx fields of Embedding and
// with the cases handled by (*Embedding).SetValue.
var SupportedEmbeddingDimensions = []int32{128, 256, 384, 512, 768, 1024, 1536, 2048, 2560, 4096}
| 181 | + |
// ChunkRange is the range of pages covered by a chunk.
//
// A chunk contains roughly the same amount of tokens, say 8192 tokens. And
// thus, a chunk can span many pages if these pages are small, or it is only
// part of a page if the page is big.
//
// Both JSON keys are always emitted (no omitempty), so a zero page number
// is serialized as 0 rather than dropped.
type ChunkRange struct {
	// Start page of this chunk.
	Start int `json:"start" elastic_mapping:"start:{type:integer}"`
	// End page of this chunk. This is **inclusive**.
	End int `json:"end" elastic_mapping:"end:{type:integer}"`
}
0 commit comments