root/usr/src/cmd/svc/configd/backend.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * sqlite is not compatible with _FILE_OFFSET_BITS=64, but we need to
 * be able to statvfs(2) possibly large systems.  This define gives us
 * access to the transitional interfaces.  See lfcompile64(7) for how
 * _LARGEFILE64_SOURCE works.
 */
#define _LARGEFILE64_SOURCE

#include <assert.h>
#include <atomic.h>
#include <door.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <pthread.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <time.h>
#include <unistd.h>
#include <zone.h>
#include <libscf_priv.h>

#include "configd.h"
#include "repcache_protocol.h"

#include <sqlite.h>
#include <sqlite-misc.h>

/*
 * This file has two purposes:
 *
 * 1. It contains the database schema, and the code for setting up our backend
 *    databases, including installing said schema.
 *
 * 2. It provides a simplified interface to the SQL database library, and
 *    synchronizes MT access to the database.
 */

/*
 * True when the backend currently lives on volatile storage, which is
 * recorded by a non-NULL be_ppath (the saved path of the permanent copy).
 */
#define IS_VOLATILE(be)         ((be)->be_ppath != NULL)
/* Number of slots in the flight_recorder[] event array. */
#define MAX_FLIGHT_RECORDER_EVENTS      100

/*
 * Outcome codes for a repository switch.  BACKEND_SWITCH_RO has no explicit
 * initializer and therefore takes the value 1.
 * NOTE(review): presumably these are the results of backend_switch(), which
 * is outside this part of the file -- confirm against that function.
 */
typedef enum backend_switch_results {
        BACKEND_SWITCH_FATAL =  -1,
        BACKEND_SWITCH_OK =     0,
        BACKEND_SWITCH_RO
} backend_switch_results_t;

/*
 * One accumulator of backend usage statistics; updated only through
 * UPDATE_TOTALS_WR().
 */
typedef struct backend_spent {
        uint64_t bs_count;      /* number of operations recorded */
        hrtime_t bs_time;       /* accumulated gethrtime() deltas */
        hrtime_t bs_vtime;      /* accumulated gethrvtime() deltas */
} backend_spent_t;

/*
 * The statistics kept per backend and per access mode (see be_totals[] in
 * sqlite_backend_t).  The member name is passed as the `field' argument of
 * UPDATE_TOTALS() / UPDATE_TOTALS_WR().
 */
typedef struct backend_totals {
        backend_spent_t bt_lock;        /* waiting for lock */
        backend_spent_t bt_exec;        /* time spent executing SQL */
} backend_totals_t;

/*
 * There are times when svcadm asks configd to move the BACKEND_TYPE_NORMAL
 * repository to volatile storage.  See backend_switch().  When the
 * repository is on volatile storage, we save the location of the permanent
 * repository in be_ppath.  We use the saved path when the time comes to
 * move the repository back.  When the repository is on permanent storage,
 * be_ppath is set to NULL.  Also see the definition of IS_VOLATILE() above
 * for testing if the repository is on volatile storage.
 *
 * be_totals[] is indexed by !!writing (see UPDATE_TOTALS_WR()): slot 0
 * accumulates statistics for readers and slot 1 for writers.
 *
 * backend_panic() compares be_thread against pthread_self() to decide
 * which be_lock mutexes the calling thread must drop.
 */
typedef struct sqlite_backend {
        pthread_mutex_t be_lock;
        pthread_t       be_thread;      /* thread holding lock */
        struct sqlite   *be_db;
        const char      *be_path;       /* path to db */
        const char      *be_ppath;      /* saved path to persistent db when */
                                        /* backend is volatile */
        const char      *be_checkpoint; /* path to repository checkpoint */
        int             be_readonly;    /* readonly at start, and still is */
        int             be_writing;     /* held for writing */
        backend_type_t  be_type;        /* type of db */
        hrtime_t        be_lastcheck;   /* time of last read-only check */
        backend_totals_t be_totals[2];  /* one for reading, one for writing */
} sqlite_backend_t;

/*
 * State of one backend transaction.  backend_is_upgraded() compares
 * bt_type against BACKEND_TYPE_NONPERSIST, so it holds a backend type.
 * NOTE(review): bt_readonly presumably marks a read-only transaction; the
 * code that sets it is outside this part of the file -- confirm.
 */
struct backend_tx {
        sqlite_backend_t        *bt_be;
        int                     bt_readonly;
        int                     bt_type;
        int                     bt_full;        /* SQLITE_FULL during tx */
};

/*
 * Charge one completed operation to a backend's statistics.
 *
 *   sb       pointer to the sqlite_backend_t being charged
 *   writing  selects the slot: be_totals[0] when zero, be_totals[1] otherwise
 *   field    backend_totals_t member to update (bt_lock or bt_exec)
 *   ts, vts  gethrtime() and gethrvtime() values sampled when the operation
 *            started; the elapsed deltas are added to bs_time and bs_vtime
 *
 * Every value argument is parenthesized in the expansion so that callers
 * may pass arbitrary expressions.  The expansion is deliberately left as a
 * plain brace block (not do/while) so existing call sites keep compiling
 * unchanged.
 */
#define UPDATE_TOTALS_WR(sb, writing, field, ts, vts) { \
        backend_spent_t *__bsp = &(sb)->be_totals[!!(writing)].field; \
        __bsp->bs_count++;                                              \
        __bsp->bs_time += (gethrtime() - (ts));                         \
        __bsp->bs_vtime += (gethrvtime() - (vts));                      \
}

/* As UPDATE_TOTALS_WR(), with the slot chosen by the backend's be_writing. */
#define UPDATE_TOTALS(sb, field, ts, vts) \
        UPDATE_TOTALS_WR(sb, (sb)->be_writing, field, ts, vts)

/*
 * A query under construction.
 * NOTE(review): presumably bq_buf is the query text and bq_size its buffer
 * size; the code that fills these in is outside this part of the file.
 */
struct backend_query {
        char    *bq_buf;
        size_t  bq_size;
};

/*
 * Description of one table in the schema.  The tbls_* arrays below are
 * built from these and end with a { NULL, NULL } entry.
 */
struct backend_tbl_info {
        const char *bti_name;   /* table name */
        const char *bti_cols;   /* comma-separated column definitions */
};

/*
 * Description of one index in the schema.  The idxs_* arrays below are
 * built from these and end with a { NULL, NULL, NULL } entry.
 */
struct backend_idx_info {
        const char *bxi_tbl;    /* table the index belongs to */
        const char *bxi_idx;    /* name given to the index */
        const char *bxi_cols;   /* comma-separated list of indexed columns */
};

/* Definitions for the flight recorder: */

/*
 * Kinds of event stored in be_flight_event_t.bfe_type.  BE_FLIGHT_EV_NOEVENT
 * is 0, so a slot of the statically allocated flight_recorder[] array that
 * has never been written reads as "no event".
 */
typedef enum be_flight_type {
        BE_FLIGHT_EV_NOEVENT = 0,       /* No event yet recorded. */
        BE_FLIGHT_EV_BACKUP,            /* Information about repo. backup */
        BE_FLIGHT_EV_BACKUP_ENTER,      /* Enter */
                                        /* backend_create_backup_locked() */
        BE_FLIGHT_EV_CHECKPOINT,        /* Request to checkpoint repository */
                                        /* for boot time backup */
        BE_FLIGHT_EV_CHECKPOINT_EXISTS, /* Existing checkpoint detected on */
                                        /* restart */
        BE_FLIGHT_EV_LINGERING_FAST,    /* Use lingering fast repository */
        BE_FLIGHT_EV_NO_BACKUP,         /* Requested backup not made */
        BE_FLIGHT_EV_REPO_CREATE,       /* Main repository created */
        BE_FLIGHT_EV_RESTART,           /* This is a restart of configd */
        BE_FLIGHT_EV_SWITCH,            /* Switch repositories */
        BE_FLIGHT_EV_TRANS_RW           /* Root transitioned to read/write */
} be_flight_type_t;

/*
 * Status or qualifier stored alongside an event in
 * be_flight_event_t.bfe_status.
 */
typedef enum be_flight_status {
        BE_FLIGHT_ST_INFO = 0,          /* No status.  Event is informative */
        BE_FLIGHT_ST_BOOT_BACKUP,       /* Boot time backup */
        BE_FLIGHT_ST_CHECKPOINT_BACKUP, /* Backup from checkpoint */
        BE_FLIGHT_ST_CLIENT,            /* Request from client as opposed to */
                                        /* internal call */
        BE_FLIGHT_ST_DUPLICATE,         /* Backup duplicates existing one */
        BE_FLIGHT_ST_FAIL,              /* Operation failed. */
        BE_FLIGHT_ST_FAST,              /* Fast repository (tmpfs) */
        BE_FLIGHT_ST_MI_BACKUP,         /* Manifest-import backup */
        BE_FLIGHT_ST_NO_SWITCH,         /* Don't switch repositories */
        BE_FLIGHT_ST_OTHER_BACKUP,      /* Other type of backup */
        BE_FLIGHT_ST_PERMANENT,         /* Repository on permanent storage */
        BE_FLIGHT_ST_REPO_BACKUP,       /* Backup from repository */
        BE_FLIGHT_ST_RO,                /* Main repository is read-only */
        BE_FLIGHT_ST_RW,                /* Main repository is read/write */
        BE_FLIGHT_ST_SUCCESS,           /* Operation was successful */
        BE_FLIGHT_ST_SWITCH             /* Switch repository */
} be_flight_status_t;

/*
 * One flight recorder entry, filled in by flight_recorder_event().
 * bfe_time comes from time(NULL); bfe_sequence is taken from the
 * flight_recorder_sequence counter when the slot is claimed.
 */
typedef struct be_flight_event {
        be_flight_type_t        bfe_type;       /* Type of event. */
        be_flight_status_t      bfe_status;     /* Result of the event. */
        time_t                  bfe_time;       /* Time of the event. */
        uint_t                  bfe_sequence;   /* Sequence number. */
} be_flight_event_t;

/*
 * Panic state, used by backend_panic().  backend_panic_lock protects
 * backend_panic_thread, which is zero until some thread starts panicking
 * and is then set to that thread's id.  Threads that call backend_panic()
 * afterwards wait on backend_panic_cv in an endless loop.
 */
static pthread_mutex_t backend_panic_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t backend_panic_cv = PTHREAD_COND_INITIALIZER;
pthread_t backend_panic_thread = 0;

int backend_do_trace = 0;               /* invoke tracing callback */
int backend_print_trace = 0;            /* tracing callback prints SQL */
int backend_panic_abort = 0;            /* abort when panicking */

/*
 * Data for the flight_recorder.  backend_flight_recorder_lock is held while
 * flight_recorder_next and flight_recorder_sequence are read and advanced
 * (see flight_recorder_event()).  flight_recorder_missed counts events that
 * could not be stored and is only updated with atomic_inc_uint().
 */

static pthread_mutex_t backend_flight_recorder_lock = PTHREAD_MUTEX_INITIALIZER;
static be_flight_event_t flight_recorder[MAX_FLIGHT_RECORDER_EVENTS];
static uint_t flight_recorder_next = 0;
static uint_t flight_recorder_missed = 0;
static uint_t flight_recorder_sequence = 0;

/*
 * Interval between read-only checks while starting up: twice NANOSEC,
 * expressed as an hrtime_t (compare be_lastcheck in sqlite_backend_t).
 */
#define BACKEND_READONLY_CHECK_INTERVAL (2 * (hrtime_t)NANOSEC)

/*
 * Any incompatible change to the schema below should bump the version
 * number.  The schema has been changed to support value ordering, but this
 * change is backwards-compatible - i.e. a previous svc.configd can use a
 * repository database with the new schema perfectly well.  As a result,
 * the schema version has not been updated, allowing downgrade of systems
 * without losing repository data.
 */
#define BACKEND_SCHEMA_VERSION          5

/*
 * Tables that exist only in the BACKEND_TYPE_NORMAL repository.  Each entry
 * is { table name, column definitions }; the array ends with { NULL, NULL }.
 */
static struct backend_tbl_info tbls_normal[] = { /* BACKEND_TYPE_NORMAL */
        /*
         * service_tbl holds all services.  svc_id is the identifier of the
         * service.
         */
        {
                "service_tbl",
                "svc_id          INTEGER PRIMARY KEY,"
                "svc_name        CHAR(256) NOT NULL"
        },

        /*
         * instance_tbl holds all of the instances.  The parent service id
         * is instance_svc.
         */
        {
                "instance_tbl",
                "instance_id     INTEGER PRIMARY KEY,"
                "instance_name   CHAR(256) NOT NULL,"
                "instance_svc    INTEGER NOT NULL"
        },

        /*
         * snapshot_lnk_tbl links (instance, snapshot name) with snapshots.
         */
        {
                "snapshot_lnk_tbl",
                "lnk_id          INTEGER PRIMARY KEY,"
                "lnk_inst_id     INTEGER NOT NULL,"
                "lnk_snap_name   CHAR(256) NOT NULL,"
                "lnk_snap_id     INTEGER NOT NULL"
        },

        /*
         * snaplevel_tbl maps a snapshot id to a set of named, ordered
         * snaplevels.
         */
        {
                "snaplevel_tbl",
                "snap_id                 INTEGER NOT NULL,"
                "snap_level_num          INTEGER NOT NULL,"
                "snap_level_id           INTEGER NOT NULL,"
                "snap_level_service_id   INTEGER NOT NULL,"
                "snap_level_service      CHAR(256) NOT NULL,"
                "snap_level_instance_id  INTEGER NULL,"
                "snap_level_instance     CHAR(256) NULL"
        },

        /*
         * snaplevel_lnk_tbl links snaplevels to property groups.
         * snaplvl_pg_* is identical to the original property group,
         * and snaplvl_gen_id overrides the generation number.
         * The service/instance ids are as in the snaplevel.
         */
        {
                "snaplevel_lnk_tbl",
                "snaplvl_level_id INTEGER NOT NULL,"
                "snaplvl_pg_id    INTEGER NOT NULL,"
                "snaplvl_pg_name  CHAR(256) NOT NULL,"
                "snaplvl_pg_type  CHAR(256) NOT NULL,"
                "snaplvl_pg_flags INTEGER NOT NULL,"
                "snaplvl_gen_id   INTEGER NOT NULL"
        },

        { NULL, NULL }
};

/*
 * Indexes on the BACKEND_TYPE_NORMAL-only tables.  Each entry is
 * { table, index name, indexed columns }; the array ends with an all-NULL
 * entry.
 */
static struct backend_idx_info idxs_normal[] = { /* BACKEND_TYPE_NORMAL */
        { "service_tbl",        "name", "svc_name" },
        { "instance_tbl",       "name", "instance_svc, instance_name" },
        { "snapshot_lnk_tbl",   "name", "lnk_inst_id, lnk_snap_name" },
        { "snapshot_lnk_tbl",   "snapid", "lnk_snap_id" },
        { "snaplevel_tbl",      "id",   "snap_id" },
        { "snaplevel_lnk_tbl",  "id",   "snaplvl_pg_id" },
        { "snaplevel_lnk_tbl",  "level", "snaplvl_level_id" },
        { NULL, NULL, NULL }
};

/*
 * The non-persistent backend has no tables or indexes of its own: both
 * lists consist solely of the terminating all-NULL entry.
 */
static struct backend_tbl_info tbls_np[] = { /* BACKEND_TYPE_NONPERSIST */
        { NULL, NULL }
};

static struct backend_idx_info idxs_np[] = {    /* BACKEND_TYPE_NONPERSIST */
        { NULL, NULL, NULL }
};

/*
 * Tables present in every backend type.  Each entry is
 * { table name, column definitions }; the array ends with { NULL, NULL }.
 */
static struct backend_tbl_info tbls_common[] = { /* all backend types */
        /*
         * pg_tbl defines property groups.  They are associated with a single
         * service or instance.  The pg_gen_id links them with the latest
         * "edited" version of its properties.
         */
        {
                "pg_tbl",
                "pg_id           INTEGER PRIMARY KEY,"
                "pg_parent_id    INTEGER NOT NULL,"
                "pg_name         CHAR(256) NOT NULL,"
                "pg_type         CHAR(256) NOT NULL,"
                "pg_flags        INTEGER NOT NULL,"
                "pg_gen_id       INTEGER NOT NULL"
        },

        /*
         * prop_lnk_tbl links a particular pg_id and gen_id to a set of
         * (prop_name, prop_type, val_id) trios.
         */
        {
                "prop_lnk_tbl",
                "lnk_prop_id     INTEGER PRIMARY KEY,"
                "lnk_pg_id       INTEGER NOT NULL,"
                "lnk_gen_id      INTEGER NOT NULL,"
                "lnk_prop_name   CHAR(256) NOT NULL,"
                "lnk_prop_type   CHAR(2) NOT NULL,"
                "lnk_val_id      INTEGER"
        },

        /*
         * value_tbl maps a value_id to a set of values.  For any given
         * value_id, value_type is constant.  The table definition here
         * is repeated in backend_check_upgrade(), and must be kept in sync.
         */
        {
                "value_tbl",
                "value_id        INTEGER NOT NULL,"
                "value_type      CHAR(1) NOT NULL,"
                "value_value     VARCHAR NOT NULL,"
                "value_order     INTEGER DEFAULT 0"
        },

        /*
         * id_tbl has one row per id space
         */
        {
                "id_tbl",
                "id_name         STRING NOT NULL,"
                "id_next         INTEGER NOT NULL"
        },

        /*
         * schema_version has a single row, which contains
         * BACKEND_SCHEMA_VERSION at the time of creation.
         */
        {
                "schema_version",
                "schema_version  INTEGER"
        },
        { NULL, NULL }
};

/*
 * Indexes on the tables shared by all backend types.  Each entry is
 * { table, index name, indexed columns }; the array ends with an all-NULL
 * entry.
 *
 * The indexing of value_tbl is repeated in backend_check_upgrade() and
 * must be kept in sync with the indexing specification here.
 */
static struct backend_idx_info idxs_common[] = { /* all backend types */
        { "pg_tbl",             "parent", "pg_parent_id" },
        { "pg_tbl",             "name", "pg_parent_id, pg_name" },
        { "pg_tbl",             "type", "pg_parent_id, pg_type" },
        { "prop_lnk_tbl",       "base", "lnk_pg_id, lnk_gen_id" },
        { "prop_lnk_tbl",       "val",  "lnk_val_id" },
        { "value_tbl",          "id",   "value_id" },
        { "id_tbl",             "id",   "id_name" },
        { NULL, NULL, NULL }
};

/*
 * Argument block for run_single_int_callback().  The callback stores the
 * parsed integer through rs_out and sets rs_result to REP_PROTOCOL_SUCCESS;
 * it asserts that rs_result is not already REP_PROTOCOL_SUCCESS on entry.
 */
struct run_single_int_info {
        uint32_t        *rs_out;
        int             rs_result;
};

/*
 * Forward declarations of the repository copy helpers, which are called
 * before their definitions (backend_checkpoint_repository() uses
 * backend_copy_repository()).
 */
static rep_protocol_responseid_t backend_copy_repository(const char *,
    const char *, int);
static rep_protocol_responseid_t backend_do_copy(const char *, int,
    const char *, int, size_t *);

/*
 * The flight recorder keeps track of events that happen primarily while
 * the system is booting.  Once the system is up and running, one can take
 * a gcore(1) of configd and examine the events with mdb.  Since we're most
 * interested in early boot events, recording stops once the array is full.
 *
 * A slot index and a sequence number are claimed while holding
 * backend_flight_recorder_lock; the slot itself is filled in after the lock
 * has been dropped.  If the lock cannot be taken, or no slot is left, the
 * event is only counted in flight_recorder_missed.
 */
static void
flight_recorder_event(be_flight_type_t type, be_flight_status_t res)
{
        be_flight_event_t *slot;
        uint_t idx;
        uint_t seq;

        if (pthread_mutex_lock(&backend_flight_recorder_lock) != 0) {
                atomic_inc_uint(&flight_recorder_missed);
                return;
        }

        if (flight_recorder_next >= MAX_FLIGHT_RECORDER_EVENTS) {
                /* Array is filled.  Stop recording events. */
                (void) pthread_mutex_unlock(&backend_flight_recorder_lock);
                atomic_inc_uint(&flight_recorder_missed);
                return;
        }

        idx = flight_recorder_next++;
        seq = flight_recorder_sequence++;
        (void) pthread_mutex_unlock(&backend_flight_recorder_lock);

        /* The slot was reserved above, so it is ours to fill in. */
        slot = &flight_recorder[idx];
        (void) memset(slot, 0, sizeof (*slot));
        slot->bfe_type = type;
        slot->bfe_status = res;
        slot->bfe_sequence = seq;
        slot->bfe_time = time(NULL);
}

/*
 * Row callback for queries that yield a single integer column.  arg is a
 * struct run_single_int_info.  A NULL column value is skipped; any other
 * value must be a complete base-10 number, otherwise we backend_panic().
 * On success the number is stored through rs_out and rs_result becomes
 * REP_PROTOCOL_SUCCESS.  Always returns BACKEND_CALLBACK_CONTINUE.
 *
 * NOTE(review): the strtoul() result is narrowed to uint32_t without a
 * range check, and "%20s" pads rather than truncates (perhaps "%.20s" was
 * intended) -- confirm whether either matters here.
 */
/*ARGSUSED*/
static int
run_single_int_callback(void *arg, int columns, char **vals, char **names)
{
        struct run_single_int_info *info = arg;
        char *text = vals[0];
        char *end = text;
        uint32_t parsed;

        assert(info->rs_result != REP_PROTOCOL_SUCCESS);
        assert(columns == 1);

        if (text != NULL) {
                errno = 0;
                parsed = strtoul(text, &end, 10);

                /* reject conversion errors, trailing junk and empty input */
                if (errno != 0 || *end != 0 || (parsed == 0 && end == text))
                        backend_panic("malformed integer \"%20s\"", text);

                *info->rs_out = parsed;
                info->rs_result = REP_PROTOCOL_SUCCESS;
        }

        return (BACKEND_CALLBACK_CONTINUE);
}

/*
 * Row callback that unconditionally returns BACKEND_CALLBACK_ABORT, ignoring
 * all of its arguments.
 * NOTE(review): presumably used for queries that must match no rows, so
 * that the first row seen aborts the query -- confirm against the callers.
 */
/*ARGSUSED*/
int
backend_fail_if_seen(void *arg, int columns, char **vals, char **names)
{
        return (BACKEND_CALLBACK_ABORT);
}

/*
 * Decide whether the database at path can currently be written.
 *
 * If statvfs64() succeeds and reports the file system as mounted read-only
 * (ST_RDONLY), SQLITE_READONLY is returned straight away.  Otherwise we
 * try to start a transaction containing a no-op UPDATE of schema_version
 * and return the sqlite_exec() result of that attempt; the transaction is
 * always rolled back, and the result of the rollback is ignored.
 */
static int
backend_is_readonly(struct sqlite *db, const char *path)
{
        statvfs64_t vfs;
        int rc;

        if (statvfs64(path, &vfs) == 0 && (vfs.f_flag & ST_RDONLY) != 0)
                return (SQLITE_READONLY);

        rc = sqlite_exec(db,
            "BEGIN TRANSACTION; "
            "UPDATE schema_version SET schema_version = schema_version; ",
            NULL, NULL, NULL);

        (void) sqlite_exec(db, "ROLLBACK TRANSACTION", NULL, NULL, NULL);

        return (rc);
}

/*
 * SQL tracing callback.  arg is the sqlite_backend_t the statement runs
 * against.  When backend_print_trace is set, the backend type and the SQL
 * text are written to stderr; otherwise nothing happens.
 */
static void
backend_trace_sql(void *arg, const char *sql)
{
        sqlite_backend_t *be = arg;

        if (!backend_print_trace)
                return;

        (void) fprintf(stderr, "%d: %s\n", be->be_type, sql);
}

/*
 * Backend bookkeeping, one slot per backend type.  bes[] holds pointers to
 * the backends; backend_panic() skips entries that are still NULL.
 * NOTE(review): presumably be_info[] provides the storage that bes[]
 * points at once a backend is set up -- the code doing so is outside this
 * part of the file.
 */
static sqlite_backend_t be_info[BACKEND_TYPE_TOTAL];
static sqlite_backend_t *bes[BACKEND_TYPE_TOTAL];

/*
 * For a native build, repositories are created from scratch, so upgrade
 * is not an issue.  This variable is implicitly protected by
 * bes[BACKEND_TYPE_NORMAL]->be_lock.
 */
#ifdef NATIVE_BUILD
static boolean_t be_normal_upgraded = B_TRUE;
#else
static boolean_t be_normal_upgraded = B_FALSE;
#endif  /* NATIVE_BUILD */

/*
 * Report whether the backend behind transaction bt has been upgraded.
 * A BACKEND_TYPE_NONPERSIST transaction always yields B_TRUE; for anything
 * else the answer is the current value of be_normal_upgraded.
 */
boolean_t
backend_is_upgraded(backend_tx_t *bt)
{
        return (bt->bt_type == BACKEND_TYPE_NONPERSIST ?
            B_TRUE : be_normal_upgraded);
}

#define BACKEND_PANIC_TIMEOUT   (50 * MILLISEC)
/*
 * backend_panic() -- some kind of database problem or corruption has been hit.
 * We attempt to quiesce the other database users -- all of the backend sql
 * entry points will call backend_panic(NULL) if a panic is in progress, as
 * will any attempt to start a transaction.
 *
 * We give threads holding a backend lock 50ms (BACKEND_PANIC_TIMEOUT) to
 * either drop the lock or call backend_panic().  If they don't respond in
 * time, we'll just exit anyway.
 */
void
backend_panic(const char *format, ...)
{
        int i;
        va_list args;
        int failed = 0;

        (void) pthread_mutex_lock(&backend_panic_lock);
        if (backend_panic_thread != 0) {
                (void) pthread_mutex_unlock(&backend_panic_lock);
                /*
                 * first, drop any backend locks we're holding, then
                 * sleep forever on the panic_cv.
                 */
                for (i = 0; i < BACKEND_TYPE_TOTAL; i++) {
                        if (bes[i] != NULL &&
                            bes[i]->be_thread == pthread_self())
                                (void) pthread_mutex_unlock(&bes[i]->be_lock);
                }
                (void) pthread_mutex_lock(&backend_panic_lock);
                for (;;)
                        (void) pthread_cond_wait(&backend_panic_cv,
                            &backend_panic_lock);
        }
        backend_panic_thread = pthread_self();
        (void) pthread_mutex_unlock(&backend_panic_lock);

        for (i = 0; i < BACKEND_TYPE_TOTAL; i++) {
                if (bes[i] != NULL && bes[i]->be_thread == pthread_self())
                        (void) pthread_mutex_unlock(&bes[i]->be_lock);
        }

        va_start(args, format);
        configd_vcritical(format, args);
        va_end(args);

        for (i = 0; i < BACKEND_TYPE_TOTAL; i++) {
                timespec_t rel;

                rel.tv_sec = 0;
                rel.tv_nsec = BACKEND_PANIC_TIMEOUT;

                if (bes[i] != NULL && bes[i]->be_thread != pthread_self()) {
                        if (pthread_mutex_reltimedlock_np(&bes[i]->be_lock,
                            &rel) != 0)
                                failed++;
                }
        }
        if (failed) {
                configd_critical("unable to quiesce database\n");
        }

        if (backend_panic_abort)
                abort();

        exit(CONFIGD_EXIT_DATABASE_BAD);
}

/*
 * Translate an sqlite result code into a repository protocol response.
 *
 * Returns
 *   _SUCCESS      - error is SQLITE_OK (errmsg is left untouched)
 *   _DONE         - callback aborted query (SQLITE_ABORT)
 *   _NO_RESOURCES - SQLITE_NOMEM, SQLITE_FULL or SQLITE_TOOBIG
 *
 * For the _DONE and _NO_RESOURCES cases errmsg is freed.  Every other
 * error code ends in backend_panic(), reporting be->be_path and errmsg.
 */
static int
backend_error(sqlite_backend_t *be, int error, char *errmsg)
{
        switch (error) {
        case SQLITE_OK:
                return (REP_PROTOCOL_SUCCESS);

        case SQLITE_ABORT:
                free(errmsg);
                return (REP_PROTOCOL_DONE);

        case SQLITE_NOMEM:
        case SQLITE_FULL:
        case SQLITE_TOOBIG:
                free(errmsg);
                return (REP_PROTOCOL_FAIL_NO_RESOURCES);

        default:
                break;
        }

        backend_panic("%s: db error: %s", be->be_path, errmsg);
        /*NOTREACHED*/
}

/*
 * Free an array of out_sz heap-allocated strings, then the array itself.
 * A NULL out_arg with an out_sz of zero (or less) is fine: only
 * free(NULL) is performed.
 */
static void
backend_backup_cleanup(const char **out_arg, ssize_t out_sz)
{
        ssize_t i;

        for (i = 0; i < out_sz; i++)
                free((char *)out_arg[i]);

        free(out_arg);
}

/*
 * Builds an inverse-time-sorted (newest first) array of the names of
 * existing backup files.  The path is a single buffer, and the pointers
 * look like:
 *
 *      /this/is/a/full/path/to/repository-name-YYYYMMDD_HHMMSS
 *      ^pathname               ^              ^(pathname+pathlen)
 *                              basename
 *
 * dirname will either be pathname, or ".".
 *
 * The buffer is modified temporarily (a NUL is written at pathname+pathlen
 * and, around the opendir(), at the start of basename); both bytes are put
 * back before returning.
 *
 * On success *out_arg points to a heap-allocated array of strdup()ed
 * directory entry names (file names only, without the directory part),
 * which the caller must free -- see backend_backup_cleanup().  When no
 * backups are found, *out_arg is NULL.
 *
 * Returns the number of elements in the array, 0 if there are no previous
 * backups, or -1 on error (in which case *out_arg is set to NULL).
 */
static ssize_t
backend_backup_get_prev(char *pathname, size_t pathlen, const char ***out_arg)
{
        char b_start, b_end;
        DIR *dir;
        char **out = NULL;
        char *name, *p;
        char *dirname, *basename;
        char *pathend;
        struct dirent *ent;

        size_t count = 0;
        size_t baselen;

        /*
         * year, month, day, hour, min, sec, plus an '_'.
         */
        const size_t ndigits = 4 + 5*2 + 1;
        const size_t baroffset = 4 + 2*2;       /* index of the '_' */

        size_t idx;

        /* Terminate the path at pathlen, remembering the byte we clobber. */
        pathend = pathname + pathlen;
        b_end = *pathend;
        *pathend = '\0';

        basename = strrchr(pathname, '/');

        if (basename != NULL) {
                assert(pathend > pathname && basename < pathend);
                basename++;
                dirname = pathname;
        } else {
                basename = pathname;
                dirname = ".";
        }

        baselen = strlen(basename);

        /*
         * munge the string temporarily for the opendir(), then restore it.
         */
        b_start = basename[0];

        basename[0] = '\0';
        dir = opendir(dirname);
        basename[0] = b_start;          /* restore path */

        if (dir == NULL)
                goto fail;


        while ((ent = readdir(dir)) != NULL) {
                /*
                 * Must match:
                 *      basename-YYYYMMDD_HHMMSS
                 * or we ignore it.
                 */
                if (strncmp(ent->d_name, basename, baselen) != 0)
                        continue;

                name = ent->d_name;
                if (name[baselen] != '-')
                        continue;

                p = name + baselen + 1;

                /* Stop at the first character that breaks the pattern. */
                for (idx = 0; idx < ndigits; idx++) {
                        char c = p[idx];
                        if (idx == baroffset && c != '_')
                                break;
                        if (idx != baroffset && (c < '0' || c > '9'))
                                break;
                }
                if (idx != ndigits || p[idx] != '\0')
                        continue;

                /*
                 * We have a match.  insertion-sort it into our list.
                 */
                name = strdup(name);
                if (name == NULL)
                        goto fail_closedir;
                p = strrchr(name, '-');

                /*
                 * Walk the list carrying `name'.  Whenever the carried
                 * entry sorts after the one at idx it takes that slot and
                 * the displaced entry is carried onward, shifting the
                 * remainder of the list down by one.
                 */
                for (idx = 0; idx < count; idx++) {
                        char *tmp = out[idx];
                        char *tp = strrchr(tmp, '-');

                        /* Timestamp first, whole name as the tie-breaker. */
                        int cmp = strcmp(p, tp);
                        if (cmp == 0)
                                cmp = strcmp(name, tmp);

                        if (cmp == 0) {
                                /* Already in the list; drop the duplicate. */
                                free(name);
                                name = NULL;
                                break;
                        } else if (cmp > 0) {
                                out[idx] = name;
                                name = tmp;
                                p = tp;
                        }
                }

                if (idx == count) {
                        /* Append whatever is still being carried. */
                        char **new_out = realloc(out,
                            (count + 1) * sizeof (*out));

                        if (new_out == NULL) {
                                free(name);
                                goto fail_closedir;
                        }

                        out = new_out;
                        out[count++] = name;
                } else {
                        assert(name == NULL);
                }
        }
        (void) closedir(dir);

        /* basename + baselen is pathend: put back the byte saved above. */
        basename[baselen] = b_end;

        *out_arg = (const char **)out;
        return (count);

fail_closedir:
        (void) closedir(dir);
fail:
        basename[0] = b_start;
        *pathend = b_end;

        backend_backup_cleanup((const char **)out, count);

        *out_arg = NULL;
        return (-1);
}

/*
 * Writes the base name for backups of be into out, a buffer of out_len
 * bytes: the repository path (the saved permanent path when the backend is
 * volatile) with its extension removed -- for paths of the form
 * /path/to/foo.db the string is cut at the final '.', provided that '.'
 * follows the last '/'.  If name is non-NULL, "-name" is then appended.
 *
 * Returns _SUCCESS, or, only when name is non-NULL:
 *
 *      _TRUNCATED      "-name" will not fit in the buffer
 *      _BAD_REQUEST    name is empty, is REP_PROTOCOL_NAME_LEN or more
 *                      characters long, or is rejected by uu_check_name()
 */
static rep_protocol_responseid_t
backend_backup_base(sqlite_backend_t *be, const char *name,
    char *out, size_t out_len)
{
        const char *repo_path = IS_VOLATILE(be) ? be->be_ppath : be->be_path;
        char *slash, *dot;
        char *tail;
        size_t used, remaining, name_len;

        (void) strlcpy(out, repo_path, out_len);

        slash = strrchr(out, '/');
        dot = strrchr(out, '.');
        if (slash != NULL && dot != NULL && dot > slash)
                *dot = 0;

        if (name == NULL)
                return (REP_PROTOCOL_SUCCESS);

        used = strlen(out);
        assert(used < out_len);

        tail = out + used;
        remaining = out_len - used;

        /* the tag must be non-empty, short enough, and a valid name */
        name_len = strlen(name);
        if (name_len == 0 || name_len >= REP_PROTOCOL_NAME_LEN ||
            uu_check_name(name, UU_NAME_DOMAIN) < 0)
                return (REP_PROTOCOL_FAIL_BAD_REQUEST);

        if (snprintf(tail, remaining, "-%s", name) >= remaining)
                return (REP_PROTOCOL_FAIL_TRUNCATED);

        return (REP_PROTOCOL_SUCCESS);
}

/*
 * Make a checkpoint of the repository, so that we can use it for a backup
 * when the root file system becomes read/write.  The copy is made by
 * backend_copy_repository(), which first copies the repository into a
 * temporary file and then renames it to REPOSITORY_CHECKPOINT, so that a
 * crash of configd in the middle of the copy cannot leave a partial file at
 * REPOSITORY_CHECKPOINT.  Renames are atomic.
 *
 * be must be the read-only BACKEND_TYPE_NORMAL backend and must not have a
 * checkpoint yet.  On success be_checkpoint is set.  The outcome is logged
 * to the flight recorder either way, and the copy result is returned.
 */
static rep_protocol_responseid_t
backend_checkpoint_repository(sqlite_backend_t *be)
{
        rep_protocol_responseid_t r;
        be_flight_status_t outcome;

        assert(be->be_readonly);        /* Only need a checkpoint if / is ro */
        assert(be->be_type == BACKEND_TYPE_NORMAL);
        assert(be->be_checkpoint == NULL); /* Only 1 checkpoint */

        r = backend_copy_repository(be->be_path, REPOSITORY_CHECKPOINT, 0);
        if (r == REP_PROTOCOL_SUCCESS) {
                be->be_checkpoint = REPOSITORY_CHECKPOINT;
                outcome = BE_FLIGHT_ST_SUCCESS;
        } else {
                outcome = BE_FLIGHT_ST_FAIL;
        }

        flight_recorder_event(BE_FLIGHT_EV_CHECKPOINT, outcome);

        return (r);
}

/*
 * Read up to len bytes from fd into buf, retrying interrupted and short
 * reads.  Returns the number of bytes read (less than len only at end of
 * file), or -1 on a read error.
 */
static ssize_t
backup_cmp_read(int fd, char *buf, size_t len)
{
        size_t got = 0;

        while (got < len) {
                ssize_t n = read(fd, buf + got, len - got);

                if (n < 0) {
                        if (errno == EINTR)
                                continue;
                        return (-1);
                }
                if (n == 0)
                        break;          /* end of file */
                got += (size_t)n;
        }

        return ((ssize_t)got);
}

/*
 * See if a backup is needed.  We do a backup unless both files are
 * byte-for-byte identical.
 *
 * Returns 0 when rep_name and backup_name are distinct files with identical
 * contents, and 1 in every other case -- including when either file cannot
 * be opened, stat'ed or read, or when memory for the comparison buffer is
 * not available (i.e. when in doubt, back up).
 *
 * The files are compared with read(2) directly on the descriptors.  Each
 * descriptor therefore has exactly one owner and is closed exactly once;
 * wrapping them in stdio streams and then calling both fclose() and close()
 * would close the same descriptor number twice, which in a multi-threaded
 * process can close a descriptor another thread has just opened.
 */
static int
backend_check_backup_needed(const char *rep_name, const char *backup_name)
{
        const size_t chunk = 8192;
        int repfd = open(rep_name, O_RDONLY);
        int fd = open(backup_name, O_RDONLY);
        struct stat s_rep, s_backup;
        char *buf = NULL;
        int needed = 1;                 /* until proven identical */

        if (repfd < 0 || fd < 0)
                goto out;

        if (fstat(repfd, &s_rep) < 0 || fstat(fd, &s_backup) < 0)
                goto out;

        /*
         * if they are the same file, we need to do a backup to break the
         * hard link or symlink involved.
         */
        if (s_rep.st_ino == s_backup.st_ino && s_rep.st_dev == s_backup.st_dev)
                goto out;

        if (s_rep.st_size != s_backup.st_size)
                goto out;

        /* one allocation, split into a half for each file */
        if ((buf = malloc(2 * chunk)) == NULL)
                goto out;

        for (;;) {
                ssize_t n_rep = backup_cmp_read(repfd, buf, chunk);
                ssize_t n_backup = backup_cmp_read(fd, buf + chunk, chunk);

                if (n_rep < 0 || n_backup < 0 || n_rep != n_backup)
                        break;

                if (n_rep == 0) {
                        /* both files ended together without a difference */
                        needed = 0;
                        break;
                }

                if (memcmp(buf, buf + chunk, (size_t)n_rep) != 0)
                        break;
        }

out:
        free(buf);
        if (repfd >= 0)
                (void) close(repfd);
        if (fd >= 0)
                (void) close(fd);
        return (needed);
}

/*
 * This interface is called to perform the actual copy: everything that can
 * be read from srcfd, starting at its current offset, is written to dstfd.
 * Interrupted (EINTR) reads and writes are retried and short writes are
 * continued.  src and dst are the corresponding path names and are used
 * only in error messages.  If sz is non-NULL, the number of bytes written
 * is stored there on success.  Neither descriptor is closed.
 *
 * Return:
 *      _FAIL_UNKNOWN           read/write fails
 *      _FAIL_NO_RESOURCES      out of memory
 *      _SUCCESS                copy succeeds
 */
static rep_protocol_responseid_t
backend_do_copy(const char *src, int srcfd, const char *dst,
    int dstfd, size_t *sz)
{
        const size_t bufsz = 8192;
        char *buf;
        ssize_t nrd, nwr, n;            /* read(2)/write(2) byte counts */
        off_t r_off = 0, w_off = 0;     /* running offsets for messages */

        if ((buf = malloc(bufsz)) == NULL)
                return (REP_PROTOCOL_FAIL_NO_RESOURCES);

        while ((nrd = read(srcfd, buf, bufsz)) != 0) {
                if (nrd < 0) {
                        if (errno == EINTR)
                                continue;

                        /*
                         * off_t has no conversion specifier of its own, so
                         * the offset is printed as a long long.
                         */
                        configd_critical(
                            "Backend copy failed: fails to read from %s "
                            "at offset %lld: %s\n", src, (long long)r_off,
                            strerror(errno));
                        free(buf);
                        return (REP_PROTOCOL_FAIL_UNKNOWN);
                }

                r_off += nrd;

                nwr = 0;
                do {
                        if ((n = write(dstfd, &buf[nwr], nrd - nwr)) < 0) {
                                /*
                                 * `continue' re-tests nwr < nrd, which
                                 * still holds, so the write is retried.
                                 */
                                if (errno == EINTR)
                                        continue;

                                configd_critical(
                                    "Backend copy failed: fails to write to %s "
                                    "at offset %lld: %s\n", dst,
                                    (long long)w_off, strerror(errno));
                                free(buf);
                                return (REP_PROTOCOL_FAIL_UNKNOWN);
                        }

                        nwr += n;
                        w_off += n;

                } while (nwr < nrd);
        }

        if (sz)
                *sz = w_off;

        free(buf);
        return (REP_PROTOCOL_SUCCESS);
}

/*
 * Create a backup called "name" of the repository behind be.  Both callers
 * visible in this file invoke it with be->be_lock held.
 *
 * The backup base path is obtained from backend_backup_base().  If the
 * source is byte-for-byte identical to what that path currently names,
 * nothing is written.  Otherwise the source is copied to a mkstemp(3C)
 * temporary file, renamed to "<base>-YYYYMMDD_HHMMSS", the base path is
 * re-created as a symlink to the new file, and old backups beyond
 * max_repository_backups are unlinked.
 *
 * For the boot backup the checkpoint file (be->be_checkpoint), if any, is
 * used as the source and is unlinked once the backup has succeeded.
 *
 * Can return:
 *      _BAD_REQUEST            name is not valid
 *      _TRUNCATED              name is too long for current repository path
 *      _UNKNOWN                failed for unknown reason (details written to
 *                              console)
 *      _BACKEND_READONLY       backend is not writable
 *      _NO_RESOURCES           out of memory
 *      _SUCCESS                Backup completed successfully.
 */
static rep_protocol_responseid_t
backend_create_backup_locked(sqlite_backend_t *be, const char *name)
{
        const char **old_list = NULL;
        ssize_t old_sz = 0;
        ssize_t old_max = max_repository_backups;
        ssize_t cur;
        char *finalname;
        char *finalpath;
        char *tmppath;
        int infd, outfd;
        size_t len;
        time_t now;
        struct tm now_tm;
        be_flight_status_t backup_type;
        rep_protocol_responseid_t result;
        const char *src;
        /*
         * Must start out as 0: the "out" label tests it, and it is reached
         * from paths that run before the source has been chosen.
         */
        int use_checkpoint = 0;

        if (strcmp(name, REPOSITORY_BOOT_BACKUP) == 0) {
                backup_type = BE_FLIGHT_ST_BOOT_BACKUP;
        } else if (strcmp(name, "manifest_import") ==  0) {
                backup_type = BE_FLIGHT_ST_MI_BACKUP;
        } else {
                backup_type = BE_FLIGHT_ST_OTHER_BACKUP;
        }
        flight_recorder_event(BE_FLIGHT_EV_BACKUP_ENTER, backup_type);

        if ((finalpath = malloc(PATH_MAX)) == NULL)
                return (REP_PROTOCOL_FAIL_NO_RESOURCES);

        if ((tmppath = malloc(PATH_MAX)) == NULL) {
                free(finalpath);
                return (REP_PROTOCOL_FAIL_NO_RESOURCES);
        }

        if (be->be_readonly) {
                flight_recorder_event(BE_FLIGHT_EV_NO_BACKUP, BE_FLIGHT_ST_RO);
                result = REP_PROTOCOL_FAIL_BACKEND_READONLY;
                goto out;
        }

        result = backend_backup_base(be, name, finalpath, PATH_MAX);
        if (result != REP_PROTOCOL_SUCCESS)
                goto out;

        /*
         * If this is a boot backup and if we made a checkpoint before the
         * root file system became read/write, then we should use the
         * checkpoint as the source.  Otherwise, we'll use the actual
         * repository as the source.
         */
        if (be->be_checkpoint && name &&
            strcmp(REPOSITORY_BOOT_BACKUP, name) == 0) {
                backup_type = BE_FLIGHT_ST_CHECKPOINT_BACKUP;
                use_checkpoint = 1;
                src = be->be_checkpoint;
        } else {
                backup_type = BE_FLIGHT_ST_REPO_BACKUP;
                use_checkpoint = 0;
                src = be->be_path;
        }
        flight_recorder_event(BE_FLIGHT_EV_BACKUP, backup_type);
        if (!backend_check_backup_needed(src, finalpath)) {
                /*
                 * No changes, so there is no need for a backup.
                 */
                flight_recorder_event(BE_FLIGHT_EV_NO_BACKUP,
                    BE_FLIGHT_ST_DUPLICATE);
                result = REP_PROTOCOL_SUCCESS;
                goto out;
        }

        /*
         * remember the original length, and the basename location
         */
        len = strlen(finalpath);
        finalname = strrchr(finalpath, '/');
        if (finalname != NULL)
                finalname++;
        else
                finalname = finalpath;

        /* tmppath is "<base>-tmpXXXXXX", the template for mkstemp(3C). */
        (void) strlcpy(tmppath, finalpath, PATH_MAX);
        if (strlcat(tmppath, "-tmpXXXXXX", PATH_MAX) >= PATH_MAX) {
                result = REP_PROTOCOL_FAIL_TRUNCATED;
                goto out;
        }

        now = time(NULL);
        if (localtime_r(&now, &now_tm) == NULL) {
                configd_critical(
                    "\"%s\" backup failed: localtime(3C) failed: %s\n", name,
                    strerror(errno));
                result = REP_PROTOCOL_FAIL_UNKNOWN;
                goto out;
        }

        /* finalpath becomes "<base>-YYYYMMDD_HHMMSS". */
        if (strftime(finalpath + len, PATH_MAX - len,
            "-%Y""%m""%d""_""%H""%M""%S", &now_tm) >= PATH_MAX - len) {
                result = REP_PROTOCOL_FAIL_TRUNCATED;
                goto out;
        }

        infd = open(src, O_RDONLY);
        if (infd < 0) {
                configd_critical("\"%s\" backup failed: opening %s: %s\n", name,
                    src, strerror(errno));
                result = REP_PROTOCOL_FAIL_UNKNOWN;
                goto out;
        }

        outfd = mkstemp(tmppath);
        if (outfd < 0) {
                configd_critical("\"%s\" backup failed: mkstemp(%s): %s\n",
                    name, tmppath, strerror(errno));
                (void) close(infd);
                result = REP_PROTOCOL_FAIL_UNKNOWN;
                goto out;
        }

        if ((result = backend_do_copy(src, infd, (const char *)tmppath,
            outfd, NULL)) != REP_PROTOCOL_SUCCESS)
                goto fail;

        /*
         * grab the old list before doing our re-name.
         */
        if (old_max > 0)
                old_sz = backend_backup_get_prev(finalpath, len, &old_list);

        if (rename(tmppath, finalpath) < 0) {
                configd_critical(
                    "\"%s\" backup failed: rename(%s, %s): %s\n",
                    name, tmppath, finalpath, strerror(errno));
                /* Don't leak the list of previous backups on this path. */
                if (old_max > 0 && old_sz > 0)
                        backend_backup_cleanup(old_list, old_sz);
                result = REP_PROTOCOL_FAIL_UNKNOWN;
                goto fail;
        }

        tmppath[len] = 0;       /* strip -XXXXXX, for reference symlink */

        (void) unlink(tmppath);
        if (symlink(finalname, tmppath) < 0) {
                configd_critical(
                    "\"%s\" backup completed, but updating "
                    "\"%s\" symlink to \"%s\" failed: %s\n",
                    name, tmppath, finalname, strerror(errno));
        }

        if (old_max > 0 && old_sz > 0) {
                /* unlink all but the first (old_max - 1) files */
                for (cur = old_max - 1; cur < old_sz; cur++) {
                        /* rewrite the basename in place inside finalpath */
                        (void) strlcpy(finalname, old_list[cur],
                            PATH_MAX - (finalname - finalpath));
                        if (unlink(finalpath) < 0)
                                configd_critical(
                                    "\"%s\" backup completed, but removing old "
                                    "file \"%s\" failed: %s\n",
                                    name, finalpath, strerror(errno));
                }

                backend_backup_cleanup(old_list, old_sz);
        }

        result = REP_PROTOCOL_SUCCESS;
        flight_recorder_event(BE_FLIGHT_EV_BACKUP, BE_FLIGHT_ST_SUCCESS);

fail:
        (void) close(infd);
        (void) close(outfd);
        if (result != REP_PROTOCOL_SUCCESS) {
                flight_recorder_event(BE_FLIGHT_EV_BACKUP, BE_FLIGHT_ST_FAIL);
                (void) unlink(tmppath);
        }

out:
        /* Get rid of the checkpoint file now that we've used it. */
        if (use_checkpoint && (result == REP_PROTOCOL_SUCCESS)) {
                (void) unlink(be->be_checkpoint);
                be->be_checkpoint = NULL;
        }
        free(finalpath);
        free(tmppath);

        return (result);
}

/*
 * Determine whether value_tbl in the main database carries the value_order
 * column and record the answer in be_normal_upgraded.  If the column is
 * missing and do_upgrade is true, perform the upgrade first.
 *
 * The version of sqlite used means ALTER TABLE is not available, so we
 * cannot simply use "ALTER TABLE value_tbl ADD COLUMN".  Instead the
 * upgrade copies value_tbl into a temporary table that has the extra
 * column, drops and recreates value_tbl with the extra column, copies the
 * rows back, recreates the index and drops the temporary table.
 *
 * During boot we want to know whether the repository has been upgraded
 * before it is writable, so that property value retrieval can use the
 * appropriate form of SELECT.  That is why the check (do_upgrade false)
 * can be made before the point where the upgrade itself is possible.
 */
void
backend_check_upgrade(sqlite_backend_t *be, boolean_t do_upgrade)
{
        char *errmsg;
        int rc;

        /* Nothing to do once the upgraded schema has been seen. */
        if (be_normal_upgraded)
                return;

        /*
         * Probe for the value_order column; the query fails if the
         * column does not exist.
         */
        rc = sqlite_exec(be->be_db,
            "SELECT value_order FROM value_tbl LIMIT 1;", NULL, NULL, NULL);

        if (do_upgrade && rc == SQLITE_ERROR) {
                /* The column is missing and we are allowed to add it. */
                configd_info("Upgrading SMF repository format...");
                rc = sqlite_exec(be->be_db,
                    "BEGIN TRANSACTION; "
                    "CREATE TABLE value_tbl_tmp ( "
                    "value_id   INTEGER NOT NULL, "
                    "value_type CHAR(1) NOT NULL, "
                    "value_value VARCHAR NOT NULL, "
                    "value_order INTEGER DEFAULT 0); "
                    "INSERT INTO value_tbl_tmp "
                    "(value_id, value_type, value_value) "
                    "SELECT value_id, value_type, value_value FROM value_tbl; "
                    "DROP TABLE value_tbl; "
                    "CREATE TABLE value_tbl( "
                    "value_id   INTEGER NOT NULL, "
                    "value_type CHAR(1) NOT NULL, "
                    "value_value VARCHAR NOT NULL, "
                    "value_order INTEGER DEFAULT 0); "
                    "INSERT INTO value_tbl SELECT * FROM value_tbl_tmp; "
                    "CREATE INDEX value_tbl_id ON value_tbl (value_id); "
                    "DROP TABLE value_tbl_tmp; "
                    "COMMIT TRANSACTION; "
                    "VACUUM; ",
                    NULL, NULL, &errmsg);
                if (rc != SQLITE_OK) {
                        backend_panic("%s: repository upgrade failed: %s",
                            be->be_path, errmsg);
                        /* NOTREACHED */
                } else {
                        configd_info("SMF repository upgrade is complete.");
                }
        }

        /* Upgraded only if the probe, or the upgrade itself, succeeded. */
        be_normal_upgraded = (rc == SQLITE_OK) ? B_TRUE : B_FALSE;
}

/*
 * Called with a backend that is currently marked read-only to find out
 * whether its persistent storage has become writable.  t is the caller's
 * current timestamp, used to rate-limit the check for readers.
 *
 * Returns REP_PROTOCOL_FAIL_BACKEND_READONLY only when the caller wants to
 * write, the storage is still read-only and the repository is not on
 * volatile storage.  Returns REP_PROTOCOL_SUCCESS otherwise.
 */
static int
backend_check_readonly(sqlite_backend_t *be, int writing, hrtime_t t)
{
        const char *probe_path;
        char *errmsg;
        struct sqlite *probe;
        int ro_status;

        assert(be->be_readonly);
        assert(be == bes[BACKEND_TYPE_NORMAL]);

        /*
         * Readers don't *need* the backend to be writable, so for them
         * only look once per check interval.
         */
        if (!writing) {
                uint64_t elapsed = (uint64_t)(t - be->be_lastcheck);

                if (elapsed < BACKEND_READONLY_CHECK_INTERVAL)
                        return (REP_PROTOCOL_SUCCESS);
                be->be_lastcheck = t;
        }

        /*
         * The repository may have been moved to non-persistent storage
         * for performance reasons.  The non-persistent path will always
         * be writable, so in that case it is the persistent path that has
         * to be examined.
         */
        if (IS_VOLATILE(be))
                probe_path = be->be_ppath;
        else
                probe_path = be->be_path;

        probe = sqlite_open(probe_path, 0600, &errmsg);
        if (probe == NULL) {
                backend_panic("reopening %s: %s\n", probe_path, errmsg);
                /*NOTREACHED*/
        }
        ro_status = backend_is_readonly(probe, probe_path);

        if (ro_status != SQLITE_OK) {
                /*
                 * Persistent storage is still read-only, so leave the
                 * state alone and don't touch a checkpointed backup.  A
                 * writer is turned away unless the repository lives on
                 * volatile storage, in which case the write can proceed.
                 */
                sqlite_close(probe);
                if (writing && (IS_VOLATILE(be) == 0))
                        return (REP_PROTOCOL_FAIL_BACKEND_READONLY);
                return (REP_PROTOCOL_SUCCESS);
        }

        /*
         * We can write!  Mark ourself as writable, then swap the db
         * handles unless the repository is on volatile storage, upgrade
         * the repository if necessary and make a backup.
         */
        be->be_readonly = 0;
        flight_recorder_event(BE_FLIGHT_EV_TRANS_RW, BE_FLIGHT_ST_RW);
        if (IS_VOLATILE(be)) {
                /*
                 * Keep using the repository on tmpfs until one of our
                 * clients tells us to move it back.  Clients,
                 * specifically manifest_import, move the repository to
                 * tmpfs for performance reasons, so the handles are not
                 * switched here.
                 */
                flight_recorder_event(BE_FLIGHT_EV_TRANS_RW,
                    BE_FLIGHT_ST_NO_SWITCH);
                sqlite_close(probe);
        } else {
                flight_recorder_event(BE_FLIGHT_EV_TRANS_RW,
                    BE_FLIGHT_ST_SWITCH);
                sqlite_close(be->be_db);
                be->be_db = probe;
        }

        if (be->be_type == BACKEND_TYPE_NORMAL)
                backend_check_upgrade(be, B_TRUE);

        /* A failed boot backup is reported but does not fail the call. */
        if (backend_create_backup_locked(be, REPOSITORY_BOOT_BACKUP) !=
            REP_PROTOCOL_SUCCESS) {
                configd_critical(
                    "unable to create \"%s\" backup of \"%s\"\n",
                    REPOSITORY_BOOT_BACKUP, be->be_path);
        }

        return (REP_PROTOCOL_SUCCESS);
}

/*
 * Lock backend t and hand it back through *bep, which is set to NULL on
 * failure.  On success the caller owns be_lock.
 *
 * If t is not BACKEND_TYPE_NORMAL, can fail with
 *   _BACKEND_ACCESS - backend does not exist
 *
 * If writing is nonzero, can also fail with
 *   _BACKEND_READONLY - backend is read-only
 */
static int
backend_lock(backend_type_t t, int writing, sqlite_backend_t **bep)
{
        sqlite_backend_t *be;
        hrtime_t ts, vts;

        *bep = NULL;

        assert(t == BACKEND_TYPE_NORMAL ||
            t == BACKEND_TYPE_NONPERSIST);

        be = bes[t];

        /* the normal backend should always be there */
        assert(t != BACKEND_TYPE_NORMAL || be != NULL);

        if (be == NULL)
                return (REP_PROTOCOL_FAIL_BACKEND_ACCESS);

        if (backend_panic_thread != 0)
                backend_panic(NULL);            /* don't proceed */

        /* Timestamps taken before blocking feed the lock statistics. */
        ts = gethrtime();
        vts = gethrvtime();
        (void) pthread_mutex_lock(&be->be_lock);
        UPDATE_TOTALS_WR(be, writing, bt_lock, ts, vts);

        /* Check again now that we hold the lock. */
        if (backend_panic_thread != 0) {
                (void) pthread_mutex_unlock(&be->be_lock);
                backend_panic(NULL);            /* don't proceed */
        }
        be->be_thread = pthread_self();

        if (be->be_readonly) {
                int rc;

                assert(t == BACKEND_TYPE_NORMAL);

                rc = backend_check_readonly(be, writing, ts);
                if (rc != REP_PROTOCOL_SUCCESS) {
                        /* Undo the ownership marking and bail out. */
                        be->be_thread = 0;
                        (void) pthread_mutex_unlock(&be->be_lock);
                        return (rc);
                }
        }

        /* Install or remove the SQL trace hook per the current setting. */
        if (!backend_do_trace)
                (void) sqlite_trace(be->be_db, NULL, NULL);
        else
                (void) sqlite_trace(be->be_db, backend_trace_sql, be);

        be->be_writing = writing;
        *bep = be;
        return (REP_PROTOCOL_SUCCESS);
}

/*
 * Release a backend obtained from backend_lock(): clear the be_writing and
 * be_thread fields that backend_lock() set, then drop be_lock.
 */
static void
backend_unlock(sqlite_backend_t *be)
{
        /* Clear the per-holder state while the lock is still held. */
        be->be_writing = 0;
        be->be_thread = 0;
        (void) pthread_mutex_unlock(&be->be_lock);
}

/*
 * Tear down a backend: close its database handle if one is open, clear
 * be_thread, then unlock and destroy be_lock.
 *
 * NOTE(review): the unlock implies the caller holds be_lock on entry;
 * confirm against the callers.
 */
static void
backend_destroy(sqlite_backend_t *be)
{
        if (be->be_db != NULL) {
                sqlite_close(be->be_db);
                be->be_db = NULL;       /* don't leave a stale handle */
        }
        be->be_thread = 0;
        (void) pthread_mutex_unlock(&be->be_lock);
        (void) pthread_mutex_destroy(&be->be_lock);
}

/*
 * Publish a backend in the bes[] table, where backend_lock() looks it up,
 * and release its lock.  The caller must hold be->be_lock, and be must be
 * the be_info[] slot belonging to backend_id.
 */
static void
backend_create_finish(backend_type_t backend_id, sqlite_backend_t *be)
{
        assert(MUTEX_HELD(&be->be_lock));
        assert(be == &be_info[backend_id]);

        /* Publish the backend before dropping the lock. */
        bes[backend_id] = be;
        (void) pthread_mutex_unlock(&be->be_lock);
}

/*
 * Write the NUL-terminated string mess (without its terminator) to fd,
 * continuing after short writes until the whole string has been written.
 * A write(2) interrupted by a signal (EINTR) is retried, as
 * backend_do_copy() does.
 *
 * Returns 0 on success, or -1 if write(2) fails for any other reason, in
 * which case errno is as set by write(2).
 */
static int
backend_fd_write(int fd, const char *mess)
{
        size_t len = strlen(mess);
        ssize_t written;

        while (len > 0) {
                written = write(fd, mess, len);
                if (written < 0) {
                        if (errno == EINTR)
                                continue;
                        return (-1);
                }
                mess += written;
                len -= (size_t)written;
        }
        return (0);
}

/*
 * Client entry point for creating a backup called "name" of the normal
 * repository.  Takes the backend lock (as a reader), performs the backup
 * with backend_create_backup_locked() and drops the lock again.
 *
 * Can return:
 *      _BAD_REQUEST            name is not valid
 *      _TRUNCATED              name is too long for current repository path
 *      _UNKNOWN                failed for unknown reason (details written to
 *                              console)
 *      _BACKEND_READONLY       backend is not writable
 *      _NO_RESOURCES           out of memory
 *      _SUCCESS                Backup completed successfully.
 */
rep_protocol_responseid_t
backend_create_backup(const char *name)
{
        sqlite_backend_t *normal_be;
        rep_protocol_responseid_t rc;

        flight_recorder_event(BE_FLIGHT_EV_BACKUP, BE_FLIGHT_ST_CLIENT);

        /* Locking the normal backend for reading is expected to succeed. */
        rc = backend_lock(BACKEND_TYPE_NORMAL, 0, &normal_be);
        assert(rc == REP_PROTOCOL_SUCCESS);

        rc = backend_create_backup_locked(normal_be, name);
        backend_unlock(normal_be);

        return (rc);
}

/*
 * This function makes a copy of the repository at src, placing the copy at
 * dst.  It is used to copy a repository on permanent storage to volatile
 * storage or vice versa.  The data is first written to a mkstemp(3C)
 * temporary file next to dst, checked against the size of src, and then
 * renamed over dst.
 *
 * If the source file is on volatile storage, it is often times desirable
 * to delete it after the copy has been made and verified.  To remove the
 * source repository, set remove_src to 1.  The source is removed only when
 * the copy succeeded; after a failure it is left in place so that the
 * only good copy of the data is not destroyed.
 *
 * Can return:
 *
 *      REP_PROTOCOL_SUCCESS            successful copy and rename
 *      REP_PROTOCOL_FAIL_UNKNOWN       file operation error
 *      REP_PROTOCOL_FAIL_NO_RESOURCES  out of memory
 */
static rep_protocol_responseid_t
backend_copy_repository(const char *src, const char *dst, int remove_src)
{
        int srcfd = -1, dstfd = -1;
        char *tmppath = malloc(PATH_MAX);
        rep_protocol_responseid_t res = REP_PROTOCOL_SUCCESS;
        struct stat s_buf;
        size_t cpsz, sz;

        if (tmppath == NULL) {
                res = REP_PROTOCOL_FAIL_NO_RESOURCES;
                goto out;
        }

        /*
         * Create and open the related db files
         */
        (void) strlcpy(tmppath, dst, PATH_MAX);
        sz = strlcat(tmppath, "-XXXXXX", PATH_MAX);
        assert(sz < PATH_MAX);
        if (sz >= PATH_MAX) {
                /* Still fatal when assertions are compiled out. */
                configd_critical(
                    "Backend copy failed: strlcat %s: overflow\n", tmppath);
                abort();
        }

        if ((dstfd = mkstemp(tmppath)) < 0) {
                configd_critical("Backend copy failed: mkstemp %s: %s\n",
                    tmppath, strerror(errno));
                res = REP_PROTOCOL_FAIL_UNKNOWN;
                goto out;
        }

        if ((srcfd = open(src, O_RDONLY)) < 0) {
                configd_critical("Backend copy failed: opening %s: %s\n",
                    src, strerror(errno));
                res = REP_PROTOCOL_FAIL_UNKNOWN;
                goto errexit;
        }

        /*
         * fstat the backend before copy for sanity check.
         */
        if (fstat(srcfd, &s_buf) < 0) {
                configd_critical("Backend copy failed: fstat %s: %s\n",
                    src, strerror(errno));
                res = REP_PROTOCOL_FAIL_UNKNOWN;
                goto errexit;
        }

        if ((res = backend_do_copy(src, srcfd, dst, dstfd, &cpsz)) !=
            REP_PROTOCOL_SUCCESS)
                goto errexit;

        /* The number of bytes copied must match the size seen above. */
        if (cpsz != s_buf.st_size) {
                configd_critical("Backend copy failed: incomplete copy\n");
                res = REP_PROTOCOL_FAIL_UNKNOWN;
                goto errexit;
        }

        /*
         * Rename tmppath to dst
         */
        if (rename(tmppath, dst) < 0) {
                configd_critical(
                    "Backend copy failed: rename %s to %s: %s\n",
                    tmppath, dst, strerror(errno));
                res = REP_PROTOCOL_FAIL_UNKNOWN;
        }

errexit:
        /* On failure, don't leave the temporary file behind. */
        if (res != REP_PROTOCOL_SUCCESS && unlink(tmppath) < 0)
                configd_critical(
                    "Backend copy failed: remove %s: %s\n",
                    tmppath, strerror(errno));

        /* srcfd is still -1 if opening the source is what failed. */
        if (srcfd >= 0)
                (void) close(srcfd);
        (void) close(dstfd);

out:
        free(tmppath);
        /* Only a verified copy entitles us to remove the source. */
        if (remove_src && res == REP_PROTOCOL_SUCCESS) {
                if (unlink(src) < 0)
                        configd_critical(
                            "Backend copy failed: remove %s: %s\n",
                            src, strerror(errno));
        }

        return (res);
}

/*
 * Perform sanity check on the repository.
 * Return 0 if check succeeds or -1 if fails.
 */
static int
backend_switch_check(struct sqlite *be_db, char **errp)
{
        struct run_single_int_info info;
        uint32_t val = -1UL;
        int r;

        info.rs_out = &val;
        info.rs_result = REP_PROTOCOL_FAIL_NOT_FOUND;

        r = sqlite_exec(be_db,
            "SELECT schema_version FROM schema_version;",
            run_single_int_callback, &info, errp);

        if (r == SQLITE_OK &&
            info.rs_result != REP_PROTOCOL_FAIL_NOT_FOUND &&
            val == BACKEND_SCHEMA_VERSION)
                return (0);
        else
                return (-1);
}

/*
 * backend_switch() implements the REP_PROTOCOL_SWITCH request from
 * clients.  First, it blocks all other clients from accessing the
 * repository by calling backend_lock to lock the repository.  It either
 * copies the repository from its permanent storage location
 * (REPOSITORY_DB) to its fast volatile location (FAST_REPOSITORY_DB), or
 * vice versa.  dir determines the direction of the copy.
 *
 *      dir = 0 Copy from permanent location to volatile location.
 *      dir = 1 Copy from volatile location to permanent location.
 *
 * After the copy, the new file is opened and sanity checked with
 * backend_switch_check().  Only if that succeeds are the backend's path
 * and database handle switched over to it.  If the backend already lives
 * at the requested location, nothing is done and the switch succeeds.
 *
 * Can return:
 *      REP_PROTOCOL_SUCCESS                    successful switch
 *      REP_PROTOCOL_FAIL_BACKEND_ACCESS        backend access fails
 *      REP_PROTOCOL_FAIL_BACKEND_READONLY      backend is not writable
 *      REP_PROTOCOL_FAIL_UNKNOWN               file operation error
 *      REP_PROTOCOL_FAIL_NO_RESOURCES          out of memory
 */
rep_protocol_responseid_t
backend_switch(int dir)
{
        rep_protocol_responseid_t result;
        sqlite_backend_t *be;
        struct sqlite *new;
        char *errp = NULL;
        char *newpath;
        const char *dst;

        flight_recorder_event(BE_FLIGHT_EV_SWITCH, BE_FLIGHT_ST_CLIENT);

        /*
         * If switching back to the main repository, lock for writing.
         * Otherwise, lock for reading.
         */
        result = backend_lock(BACKEND_TYPE_NORMAL, dir ? 1 : 0,
            &be);
        if (result != REP_PROTOCOL_SUCCESS)
                return (result);

        if (dir) {
                flight_recorder_event(BE_FLIGHT_EV_SWITCH,
                    BE_FLIGHT_ST_PERMANENT);
                dst = REPOSITORY_DB;
        } else {
                flight_recorder_event(BE_FLIGHT_EV_SWITCH,
                    BE_FLIGHT_ST_FAST);
                dst = FAST_REPOSITORY_DB;
        }

        /*
         * Do the actual copy and rename
         */
        if (strcmp(be->be_path, dst) == 0) {
                /* Already at the requested location. */
                flight_recorder_event(BE_FLIGHT_EV_SWITCH,
                    BE_FLIGHT_ST_DUPLICATE);
                result = REP_PROTOCOL_SUCCESS;
                goto errout;
        }

        result = backend_copy_repository(be->be_path, dst, dir);
        if (result != REP_PROTOCOL_SUCCESS) {
                goto errout;
        }

        /*
         * Do the backend sanity check and switch
         */
        new = sqlite_open(dst, 0600, &errp);
        if (new == NULL) {
                configd_critical("Backend switch failed: sqlite_open %s: %s\n",
                    dst, errp == NULL ? "" : errp);
                result = REP_PROTOCOL_FAIL_BACKEND_ACCESS;
                goto errout;
        }

        /*
         * Sanity check
         */
        if (backend_switch_check(new, &errp) != 0) {
                /*
                 * The check can fail without an sqlite error message
                 * (e.g. on a schema version mismatch), so guard against
                 * a NULL errp.
                 */
                configd_critical(
                    "Backend switch failed: integrity check %s: %s\n",
                    dst, errp == NULL ? "" : errp);
                /* We are not going to use this handle; don't leak it. */
                sqlite_close(new);
                result = REP_PROTOCOL_FAIL_BACKEND_ACCESS;
                goto errout;
        }

        /*
         * Duplicate the new path before letting go of the old one, so
         * that be->be_path remains valid if we run out of memory.
         */
        newpath = strdup(dst);
        if (newpath == NULL) {
                configd_critical(
                    "Backend switch failed: strdup %s: %s\n",
                    dst, strerror(errno));
                result = REP_PROTOCOL_FAIL_NO_RESOURCES;
                sqlite_close(new);
                goto errout;
        }

        free((char *)be->be_path);
        be->be_path = newpath;

        sqlite_close(be->be_db);
        be->be_db = new;
        if (dir) {
                /* We're back on permanent storage. */
                be->be_ppath = NULL;
        } else {
                /*
                 * Repository is now on volatile storage.  Save the
                 * location of the persistent repository.
                 */
                be->be_ppath = REPOSITORY_DB;
        }

errout:
        /* Release any sqlite error message; free(NULL) is harmless. */
        free(errp);
        if (result == REP_PROTOCOL_SUCCESS) {
                flight_recorder_event(BE_FLIGHT_EV_SWITCH,
                    BE_FLIGHT_ST_SUCCESS);
        } else {
                flight_recorder_event(BE_FLIGHT_EV_SWITCH, BE_FLIGHT_ST_FAIL);
        }
        backend_unlock(be);
        return (result);
}

/*
 * This routine is called to attempt the recovery of the most recent valid
 * repository, if possible, when configd has been restarted for some reason
 * or when the system crashed during a switch operation.  The repository
 * databases referenced here are indicators of successful switch
 * operations.
 *
 * Returns BACKEND_SWITCH_OK when there is nothing to recover or recovery
 * did not fail, BACKEND_SWITCH_RO when the persistent repository is
 * read-only, and BACKEND_SWITCH_FATAL on any other failure.
 */
static backend_switch_results_t
backend_switch_recovery(void)
{
        const char *fast_db = FAST_REPOSITORY_DB;
        backend_switch_results_t res = BACKEND_SWITCH_OK;
        struct sqlite *db;
        struct stat sb;
        char *errmsg = NULL;
        int ro_status;

        /*
         * If svc.configd crashed during a switch operation, a good
         * transient db holding the most recent data may still exist.
         * When there is none, there is nothing to recover.
         */
        if (stat(fast_db, &sb) < 0)
                return (BACKEND_SWITCH_OK);

        /* Determine if persistent repository is read-only */
        db = sqlite_open(REPOSITORY_DB, 0600, &errmsg);
        if (db == NULL) {
                configd_critical("Unable to open \"%s\".  %s\n",
                    REPOSITORY_DB, errmsg == NULL ? "" : errmsg);
                free(errmsg);
                return (BACKEND_SWITCH_FATAL);
        }
        ro_status = backend_is_readonly(db, REPOSITORY_DB);
        sqlite_close(db);
        if (ro_status != SQLITE_OK) {
                return (ro_status == SQLITE_READONLY ?
                    BACKEND_SWITCH_RO : BACKEND_SWITCH_FATAL);
        }

        /*
         * Check the integrity of the transient db and, if it passes,
         * copy it over the persistent repository.
         */
        db = sqlite_open(fast_db, 0600, &errmsg);
        if (db != NULL) {
                if (backend_switch_check(db, &errmsg) == 0 &&
                    backend_copy_repository(fast_db, REPOSITORY_DB, 1) !=
                    REP_PROTOCOL_SUCCESS)
                        res = BACKEND_SWITCH_FATAL;
                sqlite_close(db);
        }
        free(errmsg);

        /*
         * If we get to this point, the fast_db has either been copied or
         * it is useless.  Either way, get rid of it.
         */
        (void) unlink(fast_db);

        return (res);
}

/*
 * Row callback that gathers integrity-check complaints.  private points at
 * a char * that is either NULL or a heap string.  Every column value that
 * is neither NULL nor "ok" is appended to that string, followed by a
 * newline, growing it with realloc() as needed.
 *
 * Returns BACKEND_CALLBACK_ABORT if memory runs out (the string gathered
 * so far is left in place), BACKEND_CALLBACK_CONTINUE otherwise.
 */
/*ARGSUSED*/
static int
backend_integrity_callback(void *private, int narg, char **vals, char **cols)
{
        char **resultp = private;
        char *accum = *resultp;
        int i;

        for (i = 0; i < narg; i++) {
                const char *msg = vals[i];
                char *grown;
                size_t need;

                if (msg == NULL || strcmp(msg, "ok") == 0)
                        continue;

                /* existing text + msg + '\n' + '\0' */
                need = strlen(msg) + 2;
                if (accum != NULL)
                        need += strlen(accum);

                grown = realloc(accum, need);
                if (grown == NULL)
                        return (BACKEND_CALLBACK_ABORT);

                /* A fresh buffer must start out as an empty string. */
                if (accum == NULL)
                        grown[0] = 0;
                accum = *resultp = grown;

                (void) strlcat(grown, msg, need);
                (void) strlcat(grown, "\n", need);
        }
        return (BACKEND_CALLBACK_CONTINUE);
}

/*
 * BACKEND_CREATE_* status codes.
 * NOTE(review): judging by the prefix these are presumably the return
 * values of backend_create() below; confirm against its body.
 */
#define BACKEND_CREATE_LOCKED           -2
#define BACKEND_CREATE_FAIL             -1
#define BACKEND_CREATE_SUCCESS          0
#define BACKEND_CREATE_READONLY         1
#define BACKEND_CREATE_NEED_INIT        2
static int
backend_create(backend_type_t backend_id, const char *db_file,
    sqlite_backend_t **bep)
{
        char *errp;
        char *integrity_results = NULL;
        sqlite_backend_t *be;
        int r;
        uint32_t val = -1UL;
        struct run_single_int_info info;
        int fd;

        assert(backend_id >= 0 && backend_id < BACKEND_TYPE_TOTAL);

        be = &be_info[backend_id];

        assert(be->be_db == NULL);

        (void) pthread_mutex_init(&be->be_lock, NULL);
        (void) pthread_mutex_lock(&be->be_lock);

        be->be_type = backend_id;
        be->be_path = strdup(db_file);
        if (be->be_path == NULL) {
                perror("malloc");
                goto fail;
        }

        be->be_db = sqlite_open(be->be_path, 0600, &errp);

        if (be->be_db == NULL) {
                if (strstr(errp, "out of memory") != NULL) {
                        configd_critical("%s: %s\n", db_file, errp);
                        free(errp);

                        goto fail;
                }

                /* report it as an integrity failure */
                integrity_results = errp;
                errp = NULL;
                goto integrity_fail;
        }

        /*
         * check if we are inited and of the correct schema version
         *
         */
        info.rs_out = &val;
        info.rs_result = REP_PROTOCOL_FAIL_NOT_FOUND;

        r = sqlite_exec(be->be_db, "SELECT schema_version FROM schema_version;",
            run_single_int_callback, &info, &errp);
        if (r == SQLITE_ERROR &&
            strcmp("no such table: schema_version", errp) == 0) {
                free(errp);
                /*
                 * Could be an empty repository, could be pre-schema_version
                 * schema.  Check for id_tbl, which has always been there.
                 */
                r = sqlite_exec(be->be_db, "SELECT count() FROM id_tbl;",
                    NULL, NULL, &errp);
                if (r == SQLITE_ERROR &&
                    strcmp("no such table: id_tbl", errp) == 0) {
                        free(errp);
                        *bep = be;
                        return (BACKEND_CREATE_NEED_INIT);
                }

                configd_critical("%s: schema version mismatch\n", db_file);
                goto fail;
        }
        if (r == SQLITE_BUSY || r == SQLITE_LOCKED) {
                free(errp);
                *bep = NULL;
                backend_destroy(be);
                return (BACKEND_CREATE_LOCKED);
        }
        if (r == SQLITE_OK) {
                if (info.rs_result == REP_PROTOCOL_FAIL_NOT_FOUND ||
                    val != BACKEND_SCHEMA_VERSION) {
                        configd_critical("%s: schema version mismatch\n",
                            db_file);
                        goto fail;
                }
        }

        /*
         * pull in the whole database sequentially.
         */
        if ((fd = open(db_file, O_RDONLY)) >= 0) {
                size_t sz = 64 * 1024;
                char *buffer = malloc(sz);
                if (buffer != NULL) {
                        while (read(fd, buffer, sz) > 0)
                                ;
                        free(buffer);
                }
                (void) close(fd);
        }

        /*
         * run an integrity check
         */
        r = sqlite_exec(be->be_db, "PRAGMA integrity_check;",
            backend_integrity_callback, &integrity_results, &errp);

        if (r == SQLITE_BUSY || r == SQLITE_LOCKED) {
                free(errp);
                *bep = NULL;
                backend_destroy(be);
                return (BACKEND_CREATE_LOCKED);
        }
        if (r == SQLITE_ABORT) {
                free(errp);
                errp = NULL;
                integrity_results = "out of memory running integrity check\n";
        } else if (r != SQLITE_OK && integrity_results == NULL) {
                integrity_results = errp;
                errp = NULL;
        }

integrity_fail:
        if (integrity_results != NULL) {
                const char *fname = "/etc/svc/volatile/db_errors";
                if ((fd = open(fname, O_CREAT|O_WRONLY|O_APPEND, 0600)) < 0) {
                        fname = NULL;
                } else {
                        if (backend_fd_write(fd, "\n\n") < 0 ||
                            backend_fd_write(fd, db_file) < 0 ||
                            backend_fd_write(fd,
                            ": PRAGMA integrity_check; failed.  Results:\n") <
                            0 || backend_fd_write(fd, integrity_results) < 0 ||
                            backend_fd_write(fd, "\n\n") < 0) {
                                fname = NULL;
                        }
                        (void) close(fd);
                }

                if (!is_main_repository ||
                    backend_id == BACKEND_TYPE_NONPERSIST) {
                        if (fname != NULL)
                                configd_critical(
                                    "%s: integrity check failed. Details in "
                                    "%s\n", db_file, fname);
                        else
                                configd_critical(
                                    "%s: integrity check failed.\n",
                                    db_file);
                } else {
                        (void) fprintf(stderr,
"\n"
"svc.configd: smf(7) database integrity check of:\n"
"\n"
"    %s\n"
"\n"
"  failed. The database might be damaged or a media error might have\n"
"  prevented it from being verified.  Additional information useful to\n"
"  your service provider%s%s\n"
"\n"
"  The system will not be able to boot until you have restored a working\n"
"  database.  svc.startd(8) will provide a sulogin(8) prompt for recovery\n"
"  purposes.  The command:\n"
"\n"
"    /lib/svc/bin/restore_repository\n"
"\n"
"  can be run to restore a backup version of your repository.  See\n"
"  http://illumos.org/msg/SMF-8000-MY for more information.\n"
"\n",
                            db_file,
                            (fname == NULL)? ":\n\n" : " is in:\n\n    ",
                            (fname == NULL)? integrity_results : fname);
                }
                free(errp);
                goto fail;
        }

        /*
         * Simply do check if backend has been upgraded.  We do not wish
         * to actually carry out upgrade here - the main repository may
         * not be writable at this point.  Actual upgrade is carried out
         * via backend_check_readonly().  This check is done so that
         * we determine repository state - upgraded or not - and then
         * the appropriate SELECT statement (value-ordered or not)
         * can be used when retrieving property values early in boot.
         */
        if (backend_id == BACKEND_TYPE_NORMAL)
                backend_check_upgrade(be, B_FALSE);
        /*
         * check if we are writable
         */
        r = backend_is_readonly(be->be_db, be->be_path);

        if (r == SQLITE_BUSY || r == SQLITE_LOCKED) {
                free(errp);
                *bep = NULL;
                backend_destroy(be);
                return (BACKEND_CREATE_LOCKED);
        }
        if (r != SQLITE_OK && r != SQLITE_FULL) {
                free(errp);
                be->be_readonly = 1;
                *bep = be;
                return (BACKEND_CREATE_READONLY);
        }

        *bep = be;
        return (BACKEND_CREATE_SUCCESS);

fail:
        *bep = NULL;
        backend_destroy(be);
        return (BACKEND_CREATE_FAIL);
}

/*
 * Round arg up to the nearest power of two (arg itself if it already is
 * one).  In twos-complement arithmetic (arg & -arg) isolates the lowest set
 * bit of arg; adding that bit back in carries upward, and repeating this
 * eventually leaves a single bit set.
 */
static size_t
round_up_to_p2(size_t arg)
{
        size_t lowbit;

        /*
         * Don't allow a zero result.
         */
        assert(arg > 0 && ((ssize_t)arg > 0));

        for (;;) {
                lowbit = arg & -arg;
                if (arg == lowbit)
                        break;          /* only one bit left */
                arg += lowbit;
        }

        return (arg);
}

/*
 * Run query q against backend t outside of any transaction, handing each
 * result row to cb together with data.  The backend is locked (with the
 * writable flag clear) for the duration of the query.
 *
 * Returns
 *   _NO_RESOURCES - out of memory
 *   _BACKEND_ACCESS - backend type t (other than _NORMAL) doesn't exist
 *   _DONE - callback aborted query
 *   _SUCCESS
 */
int
backend_run(backend_type_t t, backend_query_t *q,
    backend_run_callback_f *cb, void *data)
{
        sqlite_backend_t *be;
        hrtime_t ts, vts;
        char *errmsg = NULL;
        int rc;

        /* a query that could not be allocated or grown has no buffer */
        if (q == NULL || q->bq_buf == NULL)
                return (REP_PROTOCOL_FAIL_NO_RESOURCES);

        rc = backend_lock(t, 0, &be);
        if (rc != REP_PROTOCOL_SUCCESS)
                return (rc);

        ts = gethrtime();
        vts = gethrvtime();
        rc = sqlite_exec(be->be_db, q->bq_buf, cb, data, &errmsg);
        UPDATE_TOTALS(be, bt_exec, ts, vts);

        /* translate the sqlite result into a REP_PROTOCOL_* code */
        rc = backend_error(be, rc, errmsg);
        backend_unlock(be);

        return (rc);
}

/*
 * Common setup for read-only and writable transactions: allocates a
 * transaction handle, locks backend t (passing writable through to
 * backend_lock()) and returns the handle in *txp.  *txp is NULL on failure.
 *
 * Fails with
 *   _NO_RESOURCES - out of memory
 *
 * If t is not _NORMAL, can also fail with
 *   _BACKEND_ACCESS - backend does not exist
 *
 * If writable is true, can also fail with
 *   _BACKEND_READONLY
 */
static int
backend_tx_begin_common(backend_type_t t, backend_tx_t **txp, int writable)
{
        sqlite_backend_t *be;
        backend_tx_t *tx;
        int rc;

        *txp = NULL;

        if ((tx = uu_zalloc(sizeof (*tx))) == NULL)
                return (REP_PROTOCOL_FAIL_NO_RESOURCES);

        rc = backend_lock(t, writable, &be);
        if (rc != REP_PROTOCOL_SUCCESS) {
                uu_free(tx);
                return (rc);
        }

        tx->bt_type = t;
        tx->bt_be = be;
        tx->bt_readonly = !writable;
        tx->bt_full = 0;

        *txp = tx;
        return (REP_PROTOCOL_SUCCESS);
}

/*
 * Begin a read-only transaction on backend t.  The result, including the
 * possible failures, is whatever backend_tx_begin_common() returns for a
 * non-writable request.
 */
int
backend_tx_begin_ro(backend_type_t t, backend_tx_t **txp)
{
        const int writable = 0;

        return (backend_tx_begin_common(t, txp, writable));
}

/*
 * Finish transaction tx: unlock its backend and free the handle.  If
 * SQLITE_FULL was noted during the transaction (bt_full), first try to
 * replace the backend's database handle with a freshly opened one.
 */
static void
backend_tx_end(backend_tx_t *tx)
{
        sqlite_backend_t *be = tx->bt_be;

        if (tx->bt_full) {
                /*
                 * sqlite tends to be sticky with SQLITE_FULL, so we try
                 * to get a fresh database handle if we got a FULL warning
                 * along the way.  If that fails, no harm done.
                 */
                struct sqlite *fresh = sqlite_open(be->be_path, 0600, NULL);

                if (fresh != NULL) {
                        sqlite_close(be->be_db);
                        be->be_db = fresh;
                }
        }

        backend_unlock(be);
        tx->bt_be = NULL;
        uu_free(tx);
}

/*
 * End a read-only transaction.  tx must have bt_readonly set (asserted);
 * the actual teardown is done by backend_tx_end().
 */
void
backend_tx_end_ro(backend_tx_t *tx)
{
        assert(tx->bt_readonly);
        backend_tx_end(tx);
}

/*
 * Begin a writable transaction on backend t: set up the handle with
 * backend_tx_begin_common() and issue "BEGIN TRANSACTION".  On failure the
 * transaction is rolled back and torn down, and *txp is NULL.
 *
 * Fails with
 *   _NO_RESOURCES - out of memory
 *   _BACKEND_ACCESS
 *   _BACKEND_READONLY
 */
int
backend_tx_begin(backend_type_t t, backend_tx_t **txp)
{
        backend_tx_t *tx;
        sqlite_backend_t *be;
        hrtime_t ts, vts;
        char *errmsg;
        int r;

        r = backend_tx_begin_common(t, txp, 1);
        if (r != REP_PROTOCOL_SUCCESS)
                return (r);

        tx = *txp;
        be = tx->bt_be;

        ts = gethrtime();
        vts = gethrvtime();
        r = sqlite_exec(be->be_db, "BEGIN TRANSACTION", NULL, NULL, &errmsg);
        UPDATE_TOTALS(be, bt_exec, ts, vts);
        if (r == SQLITE_FULL)
                tx->bt_full = 1;
        r = backend_error(be, r, errmsg);

        if (r == REP_PROTOCOL_SUCCESS) {
                tx->bt_readonly = 0;
                return (REP_PROTOCOL_SUCCESS);
        }

        /* BEGIN failed: undo whatever was started and drop the handle */
        assert(r != REP_PROTOCOL_DONE);
        (void) sqlite_exec(be->be_db, "ROLLBACK TRANSACTION", NULL, NULL,
            NULL);
        backend_tx_end(tx);
        *txp = NULL;

        return (r);
}

/*
 * Roll back the writable transaction tx and end it.  The outcome of the
 * ROLLBACK is passed through backend_error() but not reported to the caller.
 */
void
backend_tx_rollback(backend_tx_t *tx)
{
        sqlite_backend_t *be;
        hrtime_t ts, vts;
        char *errmsg;
        int status;

        assert(tx != NULL && tx->bt_be != NULL && !tx->bt_readonly);
        be = tx->bt_be;

        ts = gethrtime();
        vts = gethrvtime();
        status = sqlite_exec(be->be_db, "ROLLBACK TRANSACTION", NULL, NULL,
            &errmsg);
        UPDATE_TOTALS(be, bt_exec, ts, vts);

        /* remember a FULL condition so backend_tx_end() reopens the db */
        if (status == SQLITE_FULL)
                tx->bt_full = 1;

        /* the translated result is deliberately ignored */
        (void) backend_error(be, status, errmsg);

        backend_tx_end(tx);
}

/*
 * Commit the writable transaction tx and end it.  If the COMMIT fails the
 * transaction is rolled back instead; a failure of that rollback is fatal
 * (backend_panic()).  The transaction is ended in either case.
 *
 * Fails with
 *   _NO_RESOURCES - out of memory
 */
int
backend_tx_commit(backend_tx_t *tx)
{
        sqlite_backend_t *be;
        hrtime_t ts, vts;
        char *errmsg;
        int r;

        assert(tx != NULL && tx->bt_be != NULL && !tx->bt_readonly);
        be = tx->bt_be;

        ts = gethrtime();
        vts = gethrvtime();
        r = sqlite_exec(be->be_db, "COMMIT TRANSACTION", NULL, NULL,
            &errmsg);
        UPDATE_TOTALS(be, bt_exec, ts, vts);
        if (r == SQLITE_FULL)
                tx->bt_full = 1;

        r = backend_error(be, r, errmsg);
        assert(r != REP_PROTOCOL_DONE);

        if (r != REP_PROTOCOL_SUCCESS) {
                int undo;

                undo = sqlite_exec(be->be_db, "ROLLBACK TRANSACTION", NULL,
                    NULL, &errmsg);
                if (backend_error(be, undo, errmsg) != REP_PROTOCOL_SUCCESS)
                        backend_panic("cannot rollback failed commit");
        }

        /* r is REP_PROTOCOL_SUCCESS unless the commit itself failed */
        backend_tx_end(tx);
        return (r);
}

/*
 * Map an id space to the name of its row in id_tbl.  An id that is not one
 * of the known spaces aborts the process.
 */
static const char *
id_space_to_name(enum id_space id)
{
        const char *name = NULL;

        switch (id) {
        case BACKEND_ID_SERVICE_INSTANCE:
                name = "SI";
                break;
        case BACKEND_ID_PROPERTYGRP:
                name = "PG";
                break;
        case BACKEND_ID_GENERATION:
                name = "GEN";
                break;
        case BACKEND_ID_PROPERTY:
                name = "PROP";
                break;
        case BACKEND_ID_VALUE:
                name = "VAL";
                break;
        case BACKEND_ID_SNAPNAME:
                name = "SNAME";
                break;
        case BACKEND_ID_SNAPSHOT:
                name = "SHOT";
                break;
        case BACKEND_ID_SNAPLEVEL:
                name = "SLVL";
                break;
        default:
                break;
        }

        if (name == NULL)
                abort();

        return (name);
}

/*
 * Allocate the next id in id space id, within the writable transaction tx:
 * reads id_next from the matching id_tbl row and then increments it.
 *
 * Returns the id read, or 0 if the query fails or yields no value.  An id
 * space that id_space_to_name() does not know aborts the process.
 */
uint32_t
backend_new_id(backend_tx_t *tx, enum id_space id)
{
        const char *name = id_space_to_name(id);
        struct run_single_int_info info;
        sqlite_backend_t *be;
        hrtime_t ts, vts;
        uint32_t next = 0;
        char *errmsg;
        int rc;

        assert(tx != NULL && tx->bt_be != NULL && !tx->bt_readonly);
        be = tx->bt_be;

        info.rs_result = REP_PROTOCOL_FAIL_NOT_FOUND;
        info.rs_out = &next;

        ts = gethrtime();
        vts = gethrvtime();
        rc = sqlite_exec_printf(be->be_db,
            "SELECT id_next FROM id_tbl WHERE (id_name = '%q');"
            "UPDATE id_tbl SET id_next = id_next + 1 WHERE (id_name = '%q');",
            run_single_int_callback, &info, &errmsg, name, name);
        UPDATE_TOTALS(be, bt_exec, ts, vts);
        if (rc == SQLITE_FULL)
                tx->bt_full = 1;

        if (backend_error(be, rc, errmsg) != REP_PROTOCOL_SUCCESS)
                return (0);

        return (next);
}

/*
 * Run query q inside transaction tx, handing each result row to cb together
 * with data.  An SQLITE_FULL result is recorded in the transaction.
 *
 * Returns
 *   _NO_RESOURCES - out of memory
 *   _DONE - callback aborted query
 *   _SUCCESS
 */
int
backend_tx_run(backend_tx_t *tx, backend_query_t *q,
    backend_run_callback_f *cb, void *data)
{
        sqlite_backend_t *be;
        hrtime_t ts, vts;
        char *errmsg = NULL;
        int rc;

        assert(tx != NULL && tx->bt_be != NULL);
        be = tx->bt_be;

        /* a query that could not be allocated or grown has no buffer */
        if (q == NULL || q->bq_buf == NULL)
                return (REP_PROTOCOL_FAIL_NO_RESOURCES);

        ts = gethrtime();
        vts = gethrvtime();
        rc = sqlite_exec(be->be_db, q->bq_buf, cb, data, &errmsg);
        UPDATE_TOTALS(be, bt_exec, ts, vts);

        if (rc == SQLITE_FULL)
                tx->bt_full = 1;

        return (backend_error(be, rc, errmsg));
}

/*
 * Run query q inside transaction tx, using run_single_int_callback() to
 * store the integer it produces in *buf.
 *
 * Returns
 *   _NO_RESOURCES - out of memory
 *   _NOT_FOUND - the query returned no results
 *   _SUCCESS - the query returned a single integer
 */
int
backend_tx_run_single_int(backend_tx_t *tx, backend_query_t *q, uint32_t *buf)
{
        struct run_single_int_info info;
        int ret;

        /* stays _NOT_FOUND unless the callback updates it */
        info.rs_result = REP_PROTOCOL_FAIL_NOT_FOUND;
        info.rs_out = buf;

        ret = backend_tx_run(tx, q, run_single_int_callback, &info);
        assert(ret != REP_PROTOCOL_DONE);

        return ((ret == REP_PROTOCOL_SUCCESS)? info.rs_result : ret);
}

/*
 * Format an SQL statement (sqlite printf-style format plus arguments) and
 * execute it, without a row callback, inside the writable transaction tx.
 *
 * Fails with
 *   _NO_RESOURCES - out of memory
 */
int
backend_tx_run_update(backend_tx_t *tx, const char *format, ...)
{
        sqlite_backend_t *be;
        hrtime_t ts, vts;
        va_list ap;
        char *errmsg;
        int ret;

        assert(tx != NULL && tx->bt_be != NULL && !tx->bt_readonly);
        be = tx->bt_be;

        va_start(ap, format);
        ts = gethrtime();
        vts = gethrvtime();
        ret = sqlite_exec_vprintf(be->be_db, format, NULL, NULL, &errmsg, ap);
        UPDATE_TOTALS(be, bt_exec, ts, vts);
        va_end(ap);

        if (ret == SQLITE_FULL)
                tx->bt_full = 1;

        ret = backend_error(be, ret, errmsg);
        assert(ret != REP_PROTOCOL_DONE);

        return (ret);
}

/*
 * Format an SQL statement (sqlite printf-style format plus arguments) and
 * execute it, without a row callback, inside the writable transaction tx.
 * Returns the sqlite result as translated by backend_error().
 *
 * NOTE(review): this function has historically been described as returning
 * REP_PROTOCOL_FAIL_NOT_FOUND if no changes occurred, but nothing in the
 * body examines a change count.  Confirm whether backend_error() or the
 * callers account for that before relying on it.
 */
int
backend_tx_run_update_changed(backend_tx_t *tx, const char *format, ...)
{
        sqlite_backend_t *be;
        hrtime_t ts, vts;
        va_list ap;
        char *errmsg;
        int rc;

        assert(tx != NULL && tx->bt_be != NULL && !tx->bt_readonly);
        be = tx->bt_be;

        va_start(ap, format);
        ts = gethrtime();
        vts = gethrvtime();
        rc = sqlite_exec_vprintf(be->be_db, format, NULL, NULL, &errmsg, ap);
        UPDATE_TOTALS(be, bt_exec, ts, vts);
        va_end(ap);

        if (rc == SQLITE_FULL)
                tx->bt_full = 1;

        return (backend_error(be, rc, errmsg));
}

/*
 * Convenience wrapper for backend_add_schema() which derives the element
 * counts with sizeof; tbls and idxs must therefore be arrays, not pointers.
 */
#define BACKEND_ADD_SCHEMA(be, file, tbls, idxs) \
        (backend_add_schema((be), (file), \
            (tbls), sizeof (tbls) / sizeof (*(tbls)), \
            (idxs), sizeof (idxs) / sizeof (*(idxs))))

/*
 * Create the tables described by tbls and then the indices described by
 * idxs in be's database.  Either array may end early with an entry whose
 * name (bti_name / bxi_tbl) is NULL; such an entry must be the last one.
 * file is only used in error messages.
 *
 * Returns 0 on success, or -1 after logging the first statement that fails.
 */
static int
backend_add_schema(sqlite_backend_t *be, const char *file,
    struct backend_tbl_info *tbls, int tbl_count,
    struct backend_idx_info *idxs, int idx_count)
{
        char *errmsg;
        int i;

        /*
         * Create the tables.
         */
        for (i = 0; i < tbl_count; i++) {
                struct backend_tbl_info *tbl = &tbls[i];

                if (tbl->bti_name == NULL) {
                        assert(i + 1 == tbl_count);
                        break;
                }

                if (sqlite_exec_printf(be->be_db, "CREATE TABLE %s (%s);\n",
                    NULL, NULL, &errmsg, tbl->bti_name, tbl->bti_cols) ==
                    SQLITE_OK)
                        continue;

                configd_critical("%s: %s table creation fails: %s\n", file,
                    tbl->bti_name, errmsg);
                free(errmsg);
                return (-1);
        }

        /*
         * Make indices on key tables and columns.
         */
        for (i = 0; i < idx_count; i++) {
                struct backend_idx_info *idx = &idxs[i];

                if (idx->bxi_tbl == NULL) {
                        assert(i + 1 == idx_count);
                        break;
                }

                if (sqlite_exec_printf(be->be_db,
                    "CREATE INDEX %s_%s ON %s (%s);\n",
                    NULL, NULL, &errmsg, idx->bxi_tbl, idx->bxi_idx,
                    idx->bxi_tbl, idx->bxi_cols) == SQLITE_OK)
                        continue;

                configd_critical("%s: %s_%s index creation fails: %s\n", file,
                    idx->bxi_tbl, idx->bxi_idx, errmsg);
                free(errmsg);
                return (-1);
        }

        return (0);
}

/*
 * Install the schema into the freshly created database behind be: the
 * type-specific tables and indices, the common ones, the schema version
 * row, the initial id_tbl rows and the synchronous pragmas.  db_file is
 * only used in error messages.
 *
 * Returns 0 on success and a negative value as soon as any step fails; the
 * failure has been logged by then.
 */
static int
backend_init_schema(sqlite_backend_t *be, const char *db_file, backend_type_t t)
{
        int i;
        char *errmsg;
        int ret;

        assert(t == BACKEND_TYPE_NORMAL || t == BACKEND_TYPE_NONPERSIST);

        if (t == BACKEND_TYPE_NORMAL) {
                ret = BACKEND_ADD_SCHEMA(be, db_file, tbls_normal, idxs_normal);
        } else if (t == BACKEND_TYPE_NONPERSIST) {
                ret = BACKEND_ADD_SCHEMA(be, db_file, tbls_np, idxs_np);
        } else {
                abort();                /* can't happen */
        }

        if (ret < 0) {
                return (ret);
        }

        ret = BACKEND_ADD_SCHEMA(be, db_file, tbls_common, idxs_common);
        if (ret < 0) {
                return (ret);
        }

        /*
         * Add the schema version to the table
         */
        ret = sqlite_exec_printf(be->be_db,
            "INSERT INTO schema_version (schema_version) VALUES (%d)",
            NULL, NULL, &errmsg, BACKEND_SCHEMA_VERSION);
        if (ret != SQLITE_OK) {
                configd_critical(
                    "setting schema version fails: %s\n", errmsg);
                free(errmsg);
                /*
                 * Like every other step here, this is fatal: a repository
                 * without a schema version row must not be reported as
                 * successfully initialized.
                 */
                return (-1);
        }

        /*
         * Populate id_tbl with initial IDs.
         */
        for (i = 0; i < BACKEND_ID_INVALID; i++) {
                const char *name = id_space_to_name(i);

                ret = sqlite_exec_printf(be->be_db,
                    "INSERT INTO id_tbl (id_name, id_next) "
                    "VALUES ('%q', %d);", NULL, NULL, &errmsg, name, 1);
                if (ret != SQLITE_OK) {
                        configd_critical(
                            "id insertion for %s fails: %s\n", name, errmsg);
                        free(errmsg);
                        return (-1);
                }
        }
        /*
         * Set the persistence of the database.  The normal database is marked
         * "synchronous", so that all writes are synchronized to stable storage
         * before proceeding.
         */
        ret = sqlite_exec_printf(be->be_db,
            "PRAGMA default_synchronous = %s; PRAGMA synchronous = %s;",
            NULL, NULL, &errmsg,
            (t == BACKEND_TYPE_NORMAL)? "ON" : "OFF",
            (t == BACKEND_TYPE_NORMAL)? "ON" : "OFF");
        if (ret != SQLITE_OK) {
                configd_critical("pragma setting fails: %s\n", errmsg);
                free(errmsg);
                return (-1);
        }

        return (0);
}

/*
 * Open and, where necessary, initialize the persistent repository (db_file,
 * defaulting to REPOSITORY_DB) and, if have_np is set, the non-persistent
 * repository (npdb_file, defaulting to NONPERSIST_DB).  Also recovers a
 * leftover transient repository, and creates the boot time backup or
 * checkpoint of the persistent repository as described below.
 *
 * Returns
 *   CONFIGD_EXIT_OKAY - success
 *   CONFIGD_EXIT_DATABASE_LOCKED - a database is locked
 *   CONFIGD_EXIT_INIT_FAILED - out of memory while switching to the
 *      transient repository
 *   CONFIGD_EXIT_DATABASE_INIT_FAILED - any other failure
 */
int
backend_init(const char *db_file, const char *npdb_file, int have_np)
{
        sqlite_backend_t *be;
        char *errp;
        struct sqlite *fast_db;
        int r;
        backend_switch_results_t switch_result = BACKEND_SWITCH_OK;
        int writable_persist = 1;

        /* set up our temporary directory */
        sqlite_temp_directory = "/etc/svc/volatile";

        if (strcmp(SQLITE_VERSION, sqlite_version) != 0) {
                configd_critical("Mismatched link!  (%s should be %s)\n",
                    sqlite_version, SQLITE_VERSION);
                return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
        }

        if (db_file == NULL)
                db_file = REPOSITORY_DB;
        if (strcmp(db_file, REPOSITORY_DB) != 0) {
                is_main_repository = 0;
        }

        /*
         * If the svc.configd crashed, there might be a leftover transient
         * database at FAST_REPOSITORY_DB, which contains useful
         * information.  Both early manifest import and late manifest
         * import use svcadm to copy the repository to FAST_REPOSITORY_DB.
         * One reason for doing this is that it improves the performance of
         * manifest import.  The other reason is that the repository may be
         * on read-only root in the case of early manifest import.
         *
         * If FAST_REPOSITORY_DB exists, it is an indication that
         * svc.configd has been restarted for some reason.  Since we have
         * no way of knowing where we are in the boot process, the safe
         * thing to do is to move the repository back to its non-transient
         * location, REPOSITORY_DB.  This may slow manifest import
         * performance, but it avoids the problem of missing the command to
         * move the repository to permanent storage.
         *
         * There is a caveat, though.  If root is read-only, we'll need to
         * leave the repository at FAST_REPOSITORY_DB.  If root is
         * read-only, late manifest import has not yet run, so it will move
         * the repository back to permanent storage when it runs.
         */
        if (is_main_repository)
                switch_result = backend_switch_recovery();

        r = backend_create(BACKEND_TYPE_NORMAL, db_file, &be);
        switch (r) {
        case BACKEND_CREATE_FAIL:
                return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
        case BACKEND_CREATE_LOCKED:
                return (CONFIGD_EXIT_DATABASE_LOCKED);
        case BACKEND_CREATE_SUCCESS:
                break;          /* success */
        case BACKEND_CREATE_READONLY:
                writable_persist = 0;
                break;
        case BACKEND_CREATE_NEED_INIT:
                if (backend_init_schema(be, db_file, BACKEND_TYPE_NORMAL)) {
                        backend_destroy(be);
                        return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
                }
                break;
        default:
                abort();
                /*NOTREACHED*/
        }
        backend_create_finish(BACKEND_TYPE_NORMAL, be);
        flight_recorder_event(BE_FLIGHT_EV_REPO_CREATE,
            writable_persist == 1 ? BE_FLIGHT_ST_RW : BE_FLIGHT_ST_RO);
        /*
         * If there was a transient repository that could not be copied
         * back because the root file system was read-only, switch over to
         * using the transient repository.
         */
        if (switch_result == BACKEND_SWITCH_RO) {
                char *db_name_copy = NULL;

                fast_db = sqlite_open(FAST_REPOSITORY_DB, 0600, &errp);
                if (fast_db == NULL) {
                        /* Can't open fast repository.  Stick with permanent. */
                        configd_critical("Cannot open \"%s\".  %s\n",
                            FAST_REPOSITORY_DB, errp == NULL ? "" : errp);
                        free(errp);
                } else {
                        db_name_copy = strdup(FAST_REPOSITORY_DB);
                        if (db_name_copy == NULL) {
                                configd_critical("backend_init: out of "
                                    "memory.\n");
                                sqlite_close(fast_db);
                                return (CONFIGD_EXIT_INIT_FAILED);
                        } else {
                                flight_recorder_event(
                                    BE_FLIGHT_EV_LINGERING_FAST,
                                    BE_FLIGHT_ST_RO);
                                sqlite_close(be->be_db);
                                be->be_db = fast_db;
                                /* remember the permanent path in be_ppath */
                                be->be_ppath = be->be_path;
                                be->be_path = db_name_copy;
                        }
                }
        }

        if (have_np) {
                if (npdb_file == NULL)
                        npdb_file = NONPERSIST_DB;

                r = backend_create(BACKEND_TYPE_NONPERSIST, npdb_file, &be);
                switch (r) {
                case BACKEND_CREATE_SUCCESS:
                        break;          /* success */
                case BACKEND_CREATE_FAIL:
                        return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
                case BACKEND_CREATE_LOCKED:
                        return (CONFIGD_EXIT_DATABASE_LOCKED);
                case BACKEND_CREATE_READONLY:
                        configd_critical("%s: unable to write\n", npdb_file);
                        return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
                case BACKEND_CREATE_NEED_INIT:
                        /*
                         * Pass the non-persistent file name so that any
                         * schema errors are reported against the right file.
                         */
                        if (backend_init_schema(be, npdb_file,
                            BACKEND_TYPE_NONPERSIST)) {
                                backend_destroy(be);
                                return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
                        }
                        break;
                default:
                        abort();
                        /*NOTREACHED*/
                }
                backend_create_finish(BACKEND_TYPE_NONPERSIST, be);

                if (r != BACKEND_CREATE_NEED_INIT) {
                        flight_recorder_event(BE_FLIGHT_EV_RESTART,
                            BE_FLIGHT_ST_INFO);
                }

                /*
                 * If we started up with a writable filesystem, but the
                 * non-persistent database needed initialization, we are
                 * booting a non-global zone or a system with a writable
                 * root (ZFS), so do a backup.  Checking to see if the
                 * non-persistent database needed initialization also keeps
                 * us from making additional backups if configd gets
                 * restarted.
                 */
                if (r == BACKEND_CREATE_NEED_INIT && writable_persist &&
                    backend_lock(BACKEND_TYPE_NORMAL, 0, &be) ==
                    REP_PROTOCOL_SUCCESS) {
                        if (backend_create_backup_locked(be,
                            REPOSITORY_BOOT_BACKUP) != REP_PROTOCOL_SUCCESS) {
                                configd_critical(
                                    "unable to create \"%s\" backup of "
                                    "\"%s\"\n", REPOSITORY_BOOT_BACKUP,
                                    be->be_path);
                        }
                        backend_unlock(be);
                }

                /*
                 * On the other hand if we started with a read-only file
                 * system and the non-persistent database needed
                 * initialization, then we need to take a checkpoint of the
                 * repository.  We grab the checkpoint now before Early
                 * Manifest Import starts modifying the repository.  Then
                 * when the file system becomes writable, the checkpoint
                 * can be used to create the boot time backup of the
                 * repository.  Checking that the non-persistent database
                 * needed initialization, keeps us from making additional
                 * checkpoints if configd gets restarted.
                 */
                if (r == BACKEND_CREATE_NEED_INIT && writable_persist == 0 &&
                    backend_lock(BACKEND_TYPE_NORMAL, 0, &be) ==
                    REP_PROTOCOL_SUCCESS) {
                        /*
                         * Keep the checkpoint result out of r: r must still
                         * hold the backend_create() result for the restart
                         * test below.
                         */
                        int cr = backend_checkpoint_repository(be);

                        if (cr != REP_PROTOCOL_SUCCESS) {
                                configd_critical("unable to create checkpoint "
                                    "of \"%s\"\n", be->be_path);
                        }
                        backend_unlock(be);
                }

                /*
                 * If the non-persistent database did not need
                 * initialization, svc.configd has been restarted.  See if
                 * the boot time checkpoint exists.  If it does, use it to
                 * make a backup if root is writable.
                 */
                if (r != BACKEND_CREATE_NEED_INIT &&
                    backend_lock(BACKEND_TYPE_NORMAL, 0, &be) ==
                    REP_PROTOCOL_SUCCESS) {
                        struct stat sb;

                        if ((stat(REPOSITORY_CHECKPOINT, &sb) == 0) &&
                            (sb.st_size > 0) && S_ISREG(sb.st_mode)) {
                                be->be_checkpoint = REPOSITORY_CHECKPOINT;
                                flight_recorder_event(
                                    BE_FLIGHT_EV_CHECKPOINT_EXISTS,
                                    BE_FLIGHT_ST_INFO);
                        }

                        /*
                         * If we have a checkpoint and root is writable,
                         * make the backup now.
                         */
                        if (be->be_checkpoint && writable_persist) {
                                if (backend_create_backup_locked(be,
                                    REPOSITORY_BOOT_BACKUP) !=
                                    REP_PROTOCOL_SUCCESS) {
                                        configd_critical(
                                            "unable to create \"%s\" backup of "
                                            "\"%s\"\n", REPOSITORY_BOOT_BACKUP,
                                            be->be_path);
                                }
                        }
                        backend_unlock(be);
                }
        }

        /*
         * If the persistent backend is writable at this point, upgrade it.
         * This can occur in a few cases, most notably on UFS roots if
         * we are operating on the backend from another root, as is the case
         * during alternate-root BFU.
         *
         * Otherwise, upgrade will occur via backend_check_readonly() when
         * the repository is re-opened read-write.
         */
        if (writable_persist) {
                r = backend_lock(BACKEND_TYPE_NORMAL, 1, &be);
                assert(r == REP_PROTOCOL_SUCCESS);
                backend_check_upgrade(be, B_TRUE);
                backend_unlock(be);
        }

        return (CONFIGD_EXIT_OKAY);
}

/*
 * quiesce all database activity prior to exiting
 *
 * Takes the lock of both backends with the writable flag set and returns
 * without releasing either of them; presumably this is what keeps any
 * further database work from starting.  The results of backend_lock() are
 * deliberately ignored.
 */
void
backend_fini(void)
{
        sqlite_backend_t *be_normal, *be_np;

        (void) backend_lock(BACKEND_TYPE_NORMAL, 1, &be_normal);
        (void) backend_lock(BACKEND_TYPE_NONPERSIST, 1, &be_np);
}

#define QUERY_BASE      128
/*
 * Allocate a new, empty query object whose text buffer starts out at
 * QUERY_BASE bytes (zero-filled, so it holds the empty string).
 *
 * Returns NULL if the query object itself cannot be allocated.  If only
 * the text buffer allocation fails, the object is still returned, with
 * bq_buf == NULL and bq_size == 0.
 */
backend_query_t *
backend_query_alloc(void)
{
        backend_query_t *query = calloc(1, sizeof (*query));

        if (query == NULL)
                return (NULL);

        query->bq_buf = calloc(1, QUERY_BASE);
        query->bq_size = (query->bq_buf != NULL) ? QUERY_BASE : 0;

        return (query);
}

/*
 * Append the NUL-terminated string `value' to the query text held in `q',
 * growing the buffer when the result would not fit.
 *
 * A NULL `q', or a `q' whose buffer is already NULL, is left untouched.
 * If the buffer cannot be grown, it is freed, q->bq_buf is set to NULL
 * and q->bq_size is reset to 0 (matching what backend_query_alloc() does
 * when it has no buffer).  Nothing is returned; a failure is visible to
 * later code only through q->bq_buf being NULL.
 *
 * All length arithmetic is done in size_t so that long strings are not
 * narrowed through an int.
 */
void
backend_query_append(backend_query_t *q, const char *value)
{
        size_t cur_len, add_len, needed;

        if (q == NULL || q->bq_buf == NULL) {
                /* We'll discover the error when we try to run the query. */
                return;
        }

        cur_len = strlen(q->bq_buf);
        add_len = strlen(value);
        needed = cur_len + add_len + 1;         /* room for the NUL */

        if (needed > q->bq_size) {
                size_t new_size = round_up_to_p2(needed);
                char *grown;

                /* The rounded size must hold the result, so it must grow. */
                assert(new_size >= needed);
                assert(new_size > q->bq_size);

                grown = realloc(q->bq_buf, new_size);
                if (grown == NULL) {
                        /* can't grow: drop the buffer to record the error */
                        free(q->bq_buf);
                        q->bq_buf = NULL;
                        q->bq_size = 0;
                        return;
                }

                q->bq_buf = grown;
                q->bq_size = new_size;
        }

        /* Copy the new text, including its terminator, onto the old end. */
        (void) memcpy(q->bq_buf + cur_len, value, add_len + 1);
}

/*
 * Format text with sqlite_vmprintf(), using `format' and the variable
 * arguments that follow it, and append the result to the query `q' via
 * backend_query_append().
 *
 * Nothing is done if `q' is NULL or its buffer is already NULL.  If the
 * formatted string cannot be produced, the query's buffer is freed and
 * q->bq_buf is set to NULL.
 */
void
backend_query_add(backend_query_t *q, const char *format, ...)
{
        va_list ap;
        char *fragment;

        if (q == NULL || q->bq_buf == NULL)
                return;

        va_start(ap, format);
        fragment = sqlite_vmprintf(format, ap);
        va_end(ap);

        if (fragment != NULL) {
                backend_query_append(q, fragment);
                free(fragment);
        } else {
                /* Formatting failed: discard the buffer to mark the error. */
                free(q->bq_buf);
                q->bq_buf = NULL;
        }
}

/*
 * Release a query object together with its text buffer.  A NULL `q' is
 * accepted and ignored, as is a query whose buffer is already NULL.
 */
void
backend_query_free(backend_query_t *q)
{
        if (q == NULL)
                return;

        /* free(NULL) is a no-op, so no separate check of bq_buf is needed. */
        free(q->bq_buf);
        free(q);
}