sql/004_search_index.sql
1 DROP VIEW IF EXISTS moraine.mv_search_documents_from_normalized;
2 DROP VIEW IF EXISTS moraine.mv_search_documents_from_compacted;
3 DROP VIEW IF EXISTS moraine.mv_search_documents_from_events;
4 DROP VIEW IF EXISTS moraine.mv_search_postings;
5 DROP VIEW IF EXISTS moraine.mv_search_term_stats;
6 DROP VIEW IF EXISTS moraine.mv_search_corpus_stats;
7
8 CREATE TABLE IF NOT EXISTS moraine.search_documents (
9 doc_version UInt64,
10 ingested_at DateTime64(3),
11 event_uid String,
12 compacted_parent_uid String,
13 session_id String,
14 session_date Date,
15 source_name LowCardinality(String) DEFAULT '',
16 provider LowCardinality(String) DEFAULT '',
17 source_file String,
18 source_generation UInt32,
19 source_line_no UInt64,
20 source_offset UInt64,
21 source_ref String,
22 record_ts String,
23 event_class LowCardinality(String),
24 payload_type LowCardinality(String),
25 actor_role LowCardinality(String),
26 name LowCardinality(String),
27 phase LowCardinality(String),
28 text_content String,
29 payload_json String,
30 token_usage_json String,
31 doc_len UInt32 MATERIALIZED toUInt32(length(extractAll(lowerUTF8(text_content), '[a-z0-9_]+')))
32 )
33 ENGINE = ReplacingMergeTree(doc_version)
34 PARTITION BY toYYYYMM(ingested_at)
35 ORDER BY (event_uid);
36
37 ALTER TABLE moraine.search_documents
38 ADD COLUMN IF NOT EXISTS source_name LowCardinality(String) DEFAULT '' AFTER session_date;
39 ALTER TABLE moraine.search_documents
40 ADD COLUMN IF NOT EXISTS provider LowCardinality(String) DEFAULT '' AFTER source_name;
41
42 CREATE MATERIALIZED VIEW IF NOT EXISTS moraine.mv_search_documents_from_events
43 TO moraine.search_documents
44 AS
45 SELECT
46 event_version AS doc_version,
47 ingested_at,
48 event_uid,
49 origin_event_id AS compacted_parent_uid,
50 session_id,
51 session_date,
52 source_name,
53 provider,
54 source_file,
55 source_generation,
56 source_line_no,
57 source_offset,
58 source_ref,
59 record_ts,
60 event_kind AS event_class,
61 payload_type,
62 actor_kind AS actor_role,
63 tool_name AS name,
64 if(tool_phase != '', tool_phase, op_status) AS phase,
65 text_content,
66 payload_json,
67 token_usage_json
68 FROM moraine.events
69 WHERE lengthUTF8(replaceRegexpAll(text_content, '\\s+', '')) > 0;
70
71 CREATE TABLE IF NOT EXISTS moraine.search_postings (
72 post_version UInt64,
73 term String,
74 doc_id String,
75 session_id String,
76 source_name LowCardinality(String) DEFAULT '',
77 provider LowCardinality(String) DEFAULT '',
78 event_class LowCardinality(String),
79 payload_type LowCardinality(String),
80 actor_role LowCardinality(String),
81 name LowCardinality(String),
82 phase LowCardinality(String),
83 source_ref String,
84 doc_len UInt32,
85 tf UInt16
86 )
87 ENGINE = ReplacingMergeTree(post_version)
88 PARTITION BY cityHash64(term) % 32
89 ORDER BY (term, doc_id);
90
91 ALTER TABLE moraine.search_postings
92 ADD COLUMN IF NOT EXISTS source_name LowCardinality(String) DEFAULT '' AFTER session_id;
93 ALTER TABLE moraine.search_postings
94 ADD COLUMN IF NOT EXISTS provider LowCardinality(String) DEFAULT '' AFTER source_name;
95
96 CREATE MATERIALIZED VIEW IF NOT EXISTS moraine.mv_search_postings
97 TO moraine.search_postings
98 AS
99 SELECT
100 d.doc_version AS post_version,
101 d.term,
102 d.event_uid AS doc_id,
103 d.session_id,
104 d.source_name,
105 d.provider,
106 d.event_class,
107 d.payload_type,
108 d.actor_role,
109 d.name,
110 d.phase,
111 d.source_ref,
112 d.doc_len,
113 toUInt16(count()) AS tf
114 FROM
115 (
116 SELECT
117 doc_version,
118 event_uid,
119 session_id,
120 source_name,
121 provider,
122 event_class,
123 payload_type,
124 actor_role,
125 name,
126 phase,
127 source_ref,
128 doc_len,
129 arrayJoin(extractAll(lowerUTF8(text_content), '[a-z0-9_]+')) AS term
130 FROM moraine.search_documents
131 WHERE doc_len > 0
132 ) AS d
133 WHERE lengthUTF8(d.term) BETWEEN 2 AND 64
134 GROUP BY
135 d.doc_version,
136 d.term,
137 d.event_uid,
138 d.session_id,
139 d.source_name,
140 d.provider,
141 d.event_class,
142 d.payload_type,
143 d.actor_role,
144 d.name,
145 d.phase,
146 d.source_ref,
147 d.doc_len;
148
149 DROP TABLE IF EXISTS moraine.search_term_stats;
150
151 CREATE VIEW IF NOT EXISTS moraine.search_term_stats
152 AS
153 SELECT
154 term,
155 toUInt64(count()) AS docs
156 FROM moraine.search_postings FINAL
157 GROUP BY term;
158
159 DROP TABLE IF EXISTS moraine.search_corpus_stats;
160
161 CREATE VIEW IF NOT EXISTS moraine.search_corpus_stats
162 AS
163 SELECT
164 toUInt8(0) AS bucket,
165 toUInt64(count()) AS docs,
166 toUInt64(ifNull(sum(doc_len), 0)) AS total_doc_len
167 FROM moraine.search_documents FINAL
168 WHERE doc_len > 0;
169
170 CREATE TABLE IF NOT EXISTS moraine.search_query_log (
171 ts DateTime64(3) DEFAULT now64(3),
172 query_id String,
173 source LowCardinality(String),
174 session_hint String,
175 raw_query String,
176 normalized_terms Array(String),
177 term_count UInt16,
178 result_limit UInt16,
179 min_should_match UInt16,
180 min_score Float64,
181 include_tool_events UInt8,
182 exclude_codex_mcp UInt8,
183 response_ms UInt32,
184 result_count UInt16,
185 metadata_json String
186 )
187 ENGINE = MergeTree
188 PARTITION BY toYYYYMM(ts)
189 ORDER BY (ts, query_id);
190
191 CREATE TABLE IF NOT EXISTS moraine.search_hit_log (
192 ts DateTime64(3) DEFAULT now64(3),
193 query_id String,
194 rank UInt16,
195 event_uid String,
196 session_id String,
197 source_name LowCardinality(String) DEFAULT '',
198 provider LowCardinality(String) DEFAULT '',
199 score Float64,
200 matched_terms UInt16,
201 doc_len UInt32,
202 event_class LowCardinality(String),
203 payload_type LowCardinality(String),
204 actor_role LowCardinality(String),
205 name LowCardinality(String),
206 source_ref String
207 )
208 ENGINE = MergeTree
209 PARTITION BY toYYYYMM(ts)
210 ORDER BY (query_id, rank, ts);
211
212 ALTER TABLE moraine.search_hit_log
213 ADD COLUMN IF NOT EXISTS source_name LowCardinality(String) DEFAULT '' AFTER session_id;
214 ALTER TABLE moraine.search_hit_log
215 ADD COLUMN IF NOT EXISTS provider LowCardinality(String) DEFAULT '' AFTER source_name;
216
217 CREATE TABLE IF NOT EXISTS moraine.search_interaction_log (
218 ts DateTime64(3) DEFAULT now64(3),
219 interaction_id String,
220 query_id String,
221 event_uid String,
222 action LowCardinality(String),
223 actor String,
224 value Float64,
225 note String,
226 metadata_json String
227 )
228 ENGINE = MergeTree
229 PARTITION BY toYYYYMM(ts)
230 ORDER BY (ts, query_id, event_uid, action);