Skip to content

bin/backfill-search-index


     1 #!/usr/bin/env bash
     2 set -euo pipefail
     3  
     4 PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
     5 CONFIG_PATH="${MORAINE_CONFIG:-$HOME/.moraine/config.toml}"
     6 WRITE_CONFIG=0
     7  
     8 usage() {
     9   cat <<EOF
    10 usage: $(basename "$0") [--config <path>] [--write-config]
    11  
    12 options:
    13   --config <path>   config file path (default: MORAINE_CONFIG or ~/.moraine/config.toml)
    14   --write-config    explicitly allow writing a missing config by copying config/moraine.toml
    15 EOF
    16 }
    17  
    18 while [[ $# -gt 0 ]]; do
    19   case "$1" in
    20     --config)
    21       if [[ $# -lt 2 ]]; then
    22         echo "--config requires a value" >&2
    23         exit 2
    24       fi
    25       CONFIG_PATH="$2"
    26       shift 2
    27       ;;
    28     --write-config)
    29       WRITE_CONFIG=1
    30       shift
    31       ;;
    32     --help|-h)
    33       usage
    34       exit 0
    35       ;;
    36     *)
    37       echo "unknown argument: $1" >&2
    38       usage >&2
    39       exit 2
    40       ;;
    41   esac
    42 done
    43  
    44 if [[ ! -f "$CONFIG_PATH" ]]; then
    45   if [[ "$WRITE_CONFIG" -eq 1 ]]; then
    46     mkdir -p "$(dirname "$CONFIG_PATH")"
    47     cp "$PROJECT_ROOT/config/moraine.toml" "$CONFIG_PATH"
    48     echo "wrote default config to $CONFIG_PATH"
    49   else
    50     echo "config not found: $CONFIG_PATH" >&2
    51     echo "refusing to write config without explicit opt-in" >&2
    52     echo "rerun with --write-config to copy $PROJECT_ROOT/config/moraine.toml" >&2
    53     exit 1
    54   fi
    55 fi
    56  
    57 config_get() {
    58   local key="$1"
    59   "$PROJECT_ROOT/bin/moraine" --config "$CONFIG_PATH" config get "$key"
    60 }
    61  
    62 CLICKHOUSE_URL="$(config_get "clickhouse.url")"
    63 CLICKHOUSE_DB="$(config_get "clickhouse.database")"
    64 CLICKHOUSE_USER="$(config_get "clickhouse.username")"
    65 CLICKHOUSE_PASSWORD="$(config_get "clickhouse.password")"
    66 CLICKHOUSE_URL="${CLICKHOUSE_URL:-http://127.0.0.1:8123}"
    67 CLICKHOUSE_DB="${CLICKHOUSE_DB:-moraine}"
    68  
    69 CURL_AUTH_ARGS=()
    70 if [[ -n "$CLICKHOUSE_USER" ]]; then
    71   CURL_AUTH_ARGS+=(--user "${CLICKHOUSE_USER}:${CLICKHOUSE_PASSWORD}")
    72 fi
    73  
    74 clickhouse_curl() {
    75   curl -fsS "${CURL_AUTH_ARGS[@]}" "$@"
    76 }
    77  
    78 if ! clickhouse_curl "$CLICKHOUSE_URL/?query=SELECT%201" >/dev/null 2>&1; then
    79   echo "clickhouse is unavailable at $CLICKHOUSE_URL" >&2
    80   exit 1
    81 fi
    82  
    83 "$PROJECT_ROOT/bin/moraine" db migrate --config "$CONFIG_PATH"
    84  
    85 run_sql() {
    86   local stmt="$1"
    87   clickhouse_curl --data-binary "$stmt" "$CLICKHOUSE_URL/?database=$CLICKHOUSE_DB" >/dev/null
    88 }
    89  
    90 echo "backfilling search index tables in $CLICKHOUSE_DB"
    91  
    92 run_sql "TRUNCATE TABLE ${CLICKHOUSE_DB}.search_postings"
    93 run_sql "TRUNCATE TABLE ${CLICKHOUSE_DB}.search_documents"
    94  
    95 run_sql "INSERT INTO ${CLICKHOUSE_DB}.search_documents (doc_version, ingested_at, event_uid, compacted_parent_uid, session_id, session_date, source_name, provider, source_file, source_generation, source_line_no, source_offset, source_ref, record_ts, event_class, payload_type, actor_role, name, phase, text_content, payload_json, token_usage_json) SELECT event_version, ingested_at, event_uid, origin_event_id, session_id, session_date, source_name, provider, source_file, source_generation, source_line_no, source_offset, source_ref, record_ts, event_kind, payload_type, actor_kind, tool_name, if(tool_phase != '', tool_phase, op_status), text_content, payload_json, token_usage_json FROM ${CLICKHOUSE_DB}.events WHERE lengthUTF8(replaceRegexpAll(text_content, '\\\\s+', '')) > 0"
    96  
    97 DOCS="$(clickhouse_curl "$CLICKHOUSE_URL/?query=SELECT%20count()%20FROM%20${CLICKHOUSE_DB}.search_documents")"
    98 POSTINGS="$(clickhouse_curl "$CLICKHOUSE_URL/?query=SELECT%20count()%20FROM%20${CLICKHOUSE_DB}.search_postings")"
    99 TERMS="$(clickhouse_curl "$CLICKHOUSE_URL/?query=SELECT%20count()%20FROM%20${CLICKHOUSE_DB}.search_term_stats")"
   100 CORPUS_DOCS="$(clickhouse_curl "$CLICKHOUSE_URL/?query=SELECT%20sum(docs)%20FROM%20${CLICKHOUSE_DB}.search_corpus_stats")"
   101  
   102 echo "search_documents: $DOCS"
   103 echo "search_postings: $POSTINGS"
   104 echo "search_term_stats (terms): $TERMS"
   105 echo "search_corpus_stats (docs): $CORPUS_DOCS"