bin/backfill-search-index
1 #!/usr/bin/env bash
2 set -euo pipefail
3
4 PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
5 CONFIG_PATH="${MORAINE_CONFIG:-$HOME/.moraine/config.toml}"
6 WRITE_CONFIG=0
7
8 usage() {
9 cat <<EOF
10 usage: $(basename "$0") [--config <path>] [--write-config]
11
12 options:
13 --config <path> config file path (default: MORAINE_CONFIG or ~/.moraine/config.toml)
14 --write-config explicitly allow writing a missing config by copying config/moraine.toml
15 EOF
16 }
17
18 while [[ $# -gt 0 ]]; do
19 case "$1" in
20 --config)
21 if [[ $# -lt 2 ]]; then
22 echo "--config requires a value" >&2
23 exit 2
24 fi
25 CONFIG_PATH="$2"
26 shift 2
27 ;;
28 --write-config)
29 WRITE_CONFIG=1
30 shift
31 ;;
32 --help|-h)
33 usage
34 exit 0
35 ;;
36 *)
37 echo "unknown argument: $1" >&2
38 usage >&2
39 exit 2
40 ;;
41 esac
42 done
43
44 if [[ ! -f "$CONFIG_PATH" ]]; then
45 if [[ "$WRITE_CONFIG" -eq 1 ]]; then
46 mkdir -p "$(dirname "$CONFIG_PATH")"
47 cp "$PROJECT_ROOT/config/moraine.toml" "$CONFIG_PATH"
48 echo "wrote default config to $CONFIG_PATH"
49 else
50 echo "config not found: $CONFIG_PATH" >&2
51 echo "refusing to write config without explicit opt-in" >&2
52 echo "rerun with --write-config to copy $PROJECT_ROOT/config/moraine.toml" >&2
53 exit 1
54 fi
55 fi
56
57 config_get() {
58 local key="$1"
59 "$PROJECT_ROOT/bin/moraine" --config "$CONFIG_PATH" config get "$key"
60 }
61
62 CLICKHOUSE_URL="$(config_get "clickhouse.url")"
63 CLICKHOUSE_DB="$(config_get "clickhouse.database")"
64 CLICKHOUSE_USER="$(config_get "clickhouse.username")"
65 CLICKHOUSE_PASSWORD="$(config_get "clickhouse.password")"
66 CLICKHOUSE_URL="${CLICKHOUSE_URL:-http://127.0.0.1:8123}"
67 CLICKHOUSE_DB="${CLICKHOUSE_DB:-moraine}"
68
69 CURL_AUTH_ARGS=()
70 if [[ -n "$CLICKHOUSE_USER" ]]; then
71 CURL_AUTH_ARGS+=(--user "${CLICKHOUSE_USER}:${CLICKHOUSE_PASSWORD}")
72 fi
73
74 clickhouse_curl() {
75 curl -fsS "${CURL_AUTH_ARGS[@]}" "$@"
76 }
77
78 if ! clickhouse_curl "$CLICKHOUSE_URL/?query=SELECT%201" >/dev/null 2>&1; then
79 echo "clickhouse is unavailable at $CLICKHOUSE_URL" >&2
80 exit 1
81 fi
82
83 "$PROJECT_ROOT/bin/moraine" db migrate --config "$CONFIG_PATH"
84
85 run_sql() {
86 local stmt="$1"
87 clickhouse_curl --data-binary "$stmt" "$CLICKHOUSE_URL/?database=$CLICKHOUSE_DB" >/dev/null
88 }
89
90 echo "backfilling search index tables in $CLICKHOUSE_DB"
91
92 run_sql "TRUNCATE TABLE ${CLICKHOUSE_DB}.search_postings"
93 run_sql "TRUNCATE TABLE ${CLICKHOUSE_DB}.search_documents"
94
95 run_sql "INSERT INTO ${CLICKHOUSE_DB}.search_documents (doc_version, ingested_at, event_uid, compacted_parent_uid, session_id, session_date, source_name, provider, source_file, source_generation, source_line_no, source_offset, source_ref, record_ts, event_class, payload_type, actor_role, name, phase, text_content, payload_json, token_usage_json) SELECT event_version, ingested_at, event_uid, origin_event_id, session_id, session_date, source_name, provider, source_file, source_generation, source_line_no, source_offset, source_ref, record_ts, event_kind, payload_type, actor_kind, tool_name, if(tool_phase != '', tool_phase, op_status), text_content, payload_json, token_usage_json FROM ${CLICKHOUSE_DB}.events WHERE lengthUTF8(replaceRegexpAll(text_content, '\\\\s+', '')) > 0"
96
97 DOCS="$(clickhouse_curl "$CLICKHOUSE_URL/?query=SELECT%20count()%20FROM%20${CLICKHOUSE_DB}.search_documents")"
98 POSTINGS="$(clickhouse_curl "$CLICKHOUSE_URL/?query=SELECT%20count()%20FROM%20${CLICKHOUSE_DB}.search_postings")"
99 TERMS="$(clickhouse_curl "$CLICKHOUSE_URL/?query=SELECT%20count()%20FROM%20${CLICKHOUSE_DB}.search_term_stats")"
100 CORPUS_DOCS="$(clickhouse_curl "$CLICKHOUSE_URL/?query=SELECT%20sum(docs)%20FROM%20${CLICKHOUSE_DB}.search_corpus_stats")"
101
102 echo "search_documents: $DOCS"
103 echo "search_postings: $POSTINGS"
104 echo "search_term_stats (terms): $TERMS"
105 echo "search_corpus_stats (docs): $CORPUS_DOCS"