Skip to content

sql/004_search_index.sql


     1 DROP VIEW IF EXISTS moraine.mv_search_documents_from_normalized;
     2 DROP VIEW IF EXISTS moraine.mv_search_documents_from_compacted;
     3 DROP VIEW IF EXISTS moraine.mv_search_documents_from_events;
     4 DROP VIEW IF EXISTS moraine.mv_search_postings;
     5 DROP VIEW IF EXISTS moraine.mv_search_term_stats;
     6 DROP VIEW IF EXISTS moraine.mv_search_corpus_stats;
     7  
     8 CREATE TABLE IF NOT EXISTS moraine.search_documents (
     9   doc_version UInt64,
    10   ingested_at DateTime64(3),
    11   event_uid String,
    12   compacted_parent_uid String,
    13   session_id String,
    14   session_date Date,
    15   source_name LowCardinality(String) DEFAULT '',
    16   provider LowCardinality(String) DEFAULT '',
    17   source_file String,
    18   source_generation UInt32,
    19   source_line_no UInt64,
    20   source_offset UInt64,
    21   source_ref String,
    22   record_ts String,
    23   event_class LowCardinality(String),
    24   payload_type LowCardinality(String),
    25   actor_role LowCardinality(String),
    26   name LowCardinality(String),
    27   phase LowCardinality(String),
    28   text_content String,
    29   payload_json String,
    30   token_usage_json String,
    31   doc_len UInt32 MATERIALIZED toUInt32(length(extractAll(lowerUTF8(text_content), '[a-z0-9_]+')))
    32 )
    33 ENGINE = ReplacingMergeTree(doc_version)
    34 PARTITION BY toYYYYMM(ingested_at)
    35 ORDER BY (event_uid);
    36  
    37 ALTER TABLE moraine.search_documents
    38   ADD COLUMN IF NOT EXISTS source_name LowCardinality(String) DEFAULT '' AFTER session_date;
    39 ALTER TABLE moraine.search_documents
    40   ADD COLUMN IF NOT EXISTS provider LowCardinality(String) DEFAULT '' AFTER source_name;
    41  
    42 CREATE MATERIALIZED VIEW IF NOT EXISTS moraine.mv_search_documents_from_events
    43 TO moraine.search_documents
    44 AS
    45 SELECT
    46   event_version AS doc_version,
    47   ingested_at,
    48   event_uid,
    49   origin_event_id AS compacted_parent_uid,
    50   session_id,
    51   session_date,
    52   source_name,
    53   provider,
    54   source_file,
    55   source_generation,
    56   source_line_no,
    57   source_offset,
    58   source_ref,
    59   record_ts,
    60   event_kind AS event_class,
    61   payload_type,
    62   actor_kind AS actor_role,
    63   tool_name AS name,
    64   if(tool_phase != '', tool_phase, op_status) AS phase,
    65   text_content,
    66   payload_json,
    67   token_usage_json
    68 FROM moraine.events
    69 WHERE lengthUTF8(replaceRegexpAll(text_content, '\\s+', '')) > 0;
    70  
    71 CREATE TABLE IF NOT EXISTS moraine.search_postings (
    72   post_version UInt64,
    73   term String,
    74   doc_id String,
    75   session_id String,
    76   source_name LowCardinality(String) DEFAULT '',
    77   provider LowCardinality(String) DEFAULT '',
    78   event_class LowCardinality(String),
    79   payload_type LowCardinality(String),
    80   actor_role LowCardinality(String),
    81   name LowCardinality(String),
    82   phase LowCardinality(String),
    83   source_ref String,
    84   doc_len UInt32,
    85   tf UInt16
    86 )
    87 ENGINE = ReplacingMergeTree(post_version)
    88 PARTITION BY cityHash64(term) % 32
    89 ORDER BY (term, doc_id);
    90  
    91 ALTER TABLE moraine.search_postings
    92   ADD COLUMN IF NOT EXISTS source_name LowCardinality(String) DEFAULT '' AFTER session_id;
    93 ALTER TABLE moraine.search_postings
    94   ADD COLUMN IF NOT EXISTS provider LowCardinality(String) DEFAULT '' AFTER source_name;
    95  
    96 CREATE MATERIALIZED VIEW IF NOT EXISTS moraine.mv_search_postings
    97 TO moraine.search_postings
    98 AS
    99 SELECT
   100   d.doc_version AS post_version,
   101   d.term,
   102   d.event_uid AS doc_id,
   103   d.session_id,
   104   d.source_name,
   105   d.provider,
   106   d.event_class,
   107   d.payload_type,
   108   d.actor_role,
   109   d.name,
   110   d.phase,
   111   d.source_ref,
   112   d.doc_len,
   113   toUInt16(count()) AS tf
   114 FROM
   115 (
   116   SELECT
   117     doc_version,
   118     event_uid,
   119     session_id,
   120     source_name,
   121     provider,
   122     event_class,
   123     payload_type,
   124     actor_role,
   125     name,
   126     phase,
   127     source_ref,
   128     doc_len,
   129     arrayJoin(extractAll(lowerUTF8(text_content), '[a-z0-9_]+')) AS term
   130   FROM moraine.search_documents
   131   WHERE doc_len > 0
   132 ) AS d
   133 WHERE lengthUTF8(d.term) BETWEEN 2 AND 64
   134 GROUP BY
   135   d.doc_version,
   136   d.term,
   137   d.event_uid,
   138   d.session_id,
   139   d.source_name,
   140   d.provider,
   141   d.event_class,
   142   d.payload_type,
   143   d.actor_role,
   144   d.name,
   145   d.phase,
   146   d.source_ref,
   147   d.doc_len;
   148  
   149 DROP TABLE IF EXISTS moraine.search_term_stats;
   150  
   151 CREATE VIEW IF NOT EXISTS moraine.search_term_stats
   152 AS
   153 SELECT
   154   term,
   155   toUInt64(count()) AS docs
   156 FROM moraine.search_postings FINAL
   157 GROUP BY term;
   158  
   159 DROP TABLE IF EXISTS moraine.search_corpus_stats;
   160  
   161 CREATE VIEW IF NOT EXISTS moraine.search_corpus_stats
   162 AS
   163 SELECT
   164   toUInt8(0) AS bucket,
   165   toUInt64(count()) AS docs,
   166   toUInt64(ifNull(sum(doc_len), 0)) AS total_doc_len
   167 FROM moraine.search_documents FINAL
   168 WHERE doc_len > 0;
   169  
   170 CREATE TABLE IF NOT EXISTS moraine.search_query_log (
   171   ts DateTime64(3) DEFAULT now64(3),
   172   query_id String,
   173   source LowCardinality(String),
   174   session_hint String,
   175   raw_query String,
   176   normalized_terms Array(String),
   177   term_count UInt16,
   178   result_limit UInt16,
   179   min_should_match UInt16,
   180   min_score Float64,
   181   include_tool_events UInt8,
   182   exclude_codex_mcp UInt8,
   183   response_ms UInt32,
   184   result_count UInt16,
   185   metadata_json String
   186 )
   187 ENGINE = MergeTree
   188 PARTITION BY toYYYYMM(ts)
   189 ORDER BY (ts, query_id);
   190  
   191 CREATE TABLE IF NOT EXISTS moraine.search_hit_log (
   192   ts DateTime64(3) DEFAULT now64(3),
   193   query_id String,
   194   rank UInt16,
   195   event_uid String,
   196   session_id String,
   197   source_name LowCardinality(String) DEFAULT '',
   198   provider LowCardinality(String) DEFAULT '',
   199   score Float64,
   200   matched_terms UInt16,
   201   doc_len UInt32,
   202   event_class LowCardinality(String),
   203   payload_type LowCardinality(String),
   204   actor_role LowCardinality(String),
   205   name LowCardinality(String),
   206   source_ref String
   207 )
   208 ENGINE = MergeTree
   209 PARTITION BY toYYYYMM(ts)
   210 ORDER BY (query_id, rank, ts);
   211  
   212 ALTER TABLE moraine.search_hit_log
   213   ADD COLUMN IF NOT EXISTS source_name LowCardinality(String) DEFAULT '' AFTER session_id;
   214 ALTER TABLE moraine.search_hit_log
   215   ADD COLUMN IF NOT EXISTS provider LowCardinality(String) DEFAULT '' AFTER source_name;
   216  
   217 CREATE TABLE IF NOT EXISTS moraine.search_interaction_log (
   218   ts DateTime64(3) DEFAULT now64(3),
   219   interaction_id String,
   220   query_id String,
   221   event_uid String,
   222   action LowCardinality(String),
   223   actor String,
   224   value Float64,
   225   note String,
   226   metadata_json String
   227 )
   228 ENGINE = MergeTree
   229 PARTITION BY toYYYYMM(ts)
   230 ORDER BY (ts, query_id, event_uid, action);