MySQL 5.6.14 Source Code Document
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
row0log.cc
Go to the documentation of this file.
1 /*****************************************************************************
2 
3 Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify it under
6 the terms of the GNU General Public License as published by the Free Software
7 Foundation; version 2 of the License.
8 
9 This program is distributed in the hope that it will be useful, but WITHOUT
10 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12 
13 You should have received a copy of the GNU General Public License along with
14 this program; if not, write to the Free Software Foundation, Inc.,
15 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
16 
17 *****************************************************************************/
18 
19 /**************************************************/
26 #include "row0log.h"
27 
28 #ifdef UNIV_NONINL
29 #include "row0log.ic"
30 #endif
31 
32 #include "row0row.h"
33 #include "row0ins.h"
34 #include "row0upd.h"
35 #include "row0merge.h"
36 #include "row0ext.h"
37 #include "data0data.h"
38 #include "que0que.h"
39 #include "handler0alter.h"
40 
41 #include<map>
42 
/* Op codes for records in the online table-rebuild modification log.
Only ROW_T_INSERT survives in this extraction; enumerators on the
missing source lines 48-51 (presumably ROW_T_UPDATE / ROW_T_DELETE,
used below) were lost in the page dump -- confirm against the
original row0log.cc. */
45 enum row_tab_op {
47  ROW_T_INSERT = 0x41,
52 };
53 
/* Op codes for records in the online secondary-index-creation log.
Only ROW_OP_INSERT is visible; the enumerator on missing source
lines 58-59 (ROW_OP_DELETE, referenced in the logging function
below) was lost in the page dump. */
55 enum row_op {
57  ROW_OP_INSERT = 0x61,
60 };
61 
62 #ifdef UNIV_DEBUG
63 
64 # define ROW_LOG_APPLY_PRINT
65 #endif /* UNIV_DEBUG */
66 
67 #ifdef ROW_LOG_APPLY_PRINT
68 
69 static bool row_log_apply_print;
70 #endif /* ROW_LOG_APPLY_PRINT */
71 
73 #define ROW_LOG_HEADER_SIZE 2/*op, extra_size*/
74 
/* One buffer (head or tail) of an online-DDL modification log.
NOTE(review): the member on missing source lines 78-79 and 83-85
(the small spill buffer `buf`, referenced as log->tail.buf below)
was lost in the extraction. */
76 struct row_log_buf_t {
77  byte* block; /* log block of srv_sort_buf_size bytes */
80  ulint blocks; /* number of blocks already written to the log file */
81  ulint bytes; /* bytes of the current incomplete block in `block` */
82  ulonglong total; /* logical byte position: total bytes logged so far */
86 };
87 
/* Interior of class row_log_table_blob_t (the class-header line,
source line ~88, is missing from this extraction).  Tracks the
free/reallocate state of one BLOB page during online table rebuild,
as a position (byte offset) in the modification log.  A page whose
offset is BLOB_FREED has been freed and must not be dereferenced
when applying log records written before a later reallocation. */
90 public:
93 #ifdef UNIV_DEBUG
/* Debug constructor: records where in the log the page was freed.
The non-debug constructor (missing source line 98) takes no
argument; both start in the "freed" state. */
94  row_log_table_blob_t(ulonglong offset_arg) :
95  old_offset (0), free_offset (offset_arg),
96  offset (BLOB_FREED) {}
97 #else /* UNIV_DEBUG */
99  offset (BLOB_FREED) {}
100 #endif /* UNIV_DEBUG */
101 
/* Mark the BLOB page freed at log offset offset_arg.  Requires the
page to be currently allocated (offset != BLOB_FREED). */
104 #ifdef UNIV_DEBUG
105  void blob_free(ulonglong offset_arg)
106 #else /* UNIV_DEBUG */
107  void blob_free()
108 #endif /* UNIV_DEBUG */
109  {
110  ut_ad(offset < offset_arg);
111  ut_ad(offset != BLOB_FREED);
112  ut_d(old_offset = offset);
113  ut_d(free_offset = offset_arg);
114  offset = BLOB_FREED;
115  }
/* Mark the BLOB page (re)allocated at log offset offset_arg.
Allocation must not precede the recorded free position. */
118  void blob_alloc(ulonglong offset_arg) {
119  ut_ad(free_offset <= offset_arg);
120  ut_d(old_offset = offset);
121  offset = offset_arg;
122  }
/* Return whether the BLOB page was freed as of the log record
ending at offset_arg: true while offset_arg precedes the (re)alloc
position, including the BLOB_FREED sentinel state. */
126  bool is_freed(ulonglong offset_arg) const {
127  /* This is supposed to be the offset at the end of the
128  current log record. */
129  ut_ad(offset_arg > 0);
130  /* We should never get anywhere close the magic value. */
131  ut_ad(offset_arg < BLOB_FREED);
132  return(offset_arg < offset);
133  }
134 private:
/* Sentinel offset meaning "currently freed". */
136  static const ulonglong BLOB_FREED = ~0ULL;
137 #ifdef UNIV_DEBUG
138 
139  ulonglong old_offset; /* previous value of `offset` (debug trace) */
141  ulonglong free_offset; /* log offset of the most recent free (debug) */
142 #endif /* UNIV_DEBUG */
143 
/* Log offset of the latest allocation, or BLOB_FREED. */
144  ulonglong offset;
145 };
146 
154 typedef std::map<ulint, row_log_table_blob_t> page_no_map;
155 
/* Online-DDL modification log descriptor.  NOTE(review): this
extraction dropped most members (source lines 170-195): at least
`mutex`, `table`, `same_pk`'s siblings, `error`, `max_trx`, and the
`tail`/`head` row_log_buf_t members are referenced by the functions
below but not visible here. */
168 struct row_log_t {
169  int fd; /* file descriptor of the temporary log file */
172  page_no_map* blobs; /* page_no -> BLOB free/alloc state, or NULL */
179  bool same_pk; /* whether the PRIMARY KEY is unchanged in the rebuild */
181  const dtuple_t* add_cols; /* default values of added columns, or NULL */
183  const ulint* col_map; /* old column no -> new column no mapping */
196  ulint size;
197 };
198 
199 /******************************************************/
/* Logs an insert (trx_id != 0) or delete (trx_id == 0) operation on
a secondary index that is being created online.  The function-name
line (source line ~203) is missing from this extraction; this is
presumably row_log_online_op() -- confirm against the original.
Caller holds the index lock in S or X mode (see UNIV_SYNC_DEBUG
assertion).  On log-file overflow the index is marked corrupted. */
201 UNIV_INTERN
202 void
204 /*==============*/
206  const dtuple_t* tuple,
207  trx_id_t trx_id)
209 {
210  byte* b;
211  ulint extra_size;
212  ulint size;
213  ulint mrec_size;
214  ulint avail_size;
215  row_log_t* log;
216 
217  ut_ad(dtuple_validate(tuple));
219 #ifdef UNIV_SYNC_DEBUG
220  ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_SHARED)
221  || rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
222 #endif /* UNIV_SYNC_DEBUG */
223 
224  if (dict_index_is_corrupted(index)) {
225  return;
226  }
227 
229 
230  /* Compute the size of the record. This differs from
231  row_merge_buf_encode(), because here we do not encode
232  extra_size+1 (and reserve 0 as the end-of-chunk marker). */
233 
/* NOTE(review): the call assigning `size` (source line 234,
presumably rec_get_converted_size_temp) is missing from this dump. */
235  index, tuple->fields, tuple->n_fields, &extra_size);
236  ut_ad(size >= extra_size);
237  ut_ad(size <= sizeof log->tail.buf);
238 
/* 1 byte op code + 1 (or 2, if extra_size >= 0x80) bytes of
extra_size + encoded record + trx_id for inserts only. */
239  mrec_size = ROW_LOG_HEADER_SIZE
240  + (extra_size >= 0x80) + size
241  + (trx_id ? DATA_TRX_ID_LEN : 0);
242 
243  log = index->online_log;
244  mutex_enter(&log->mutex);
245 
246  if (trx_id > log->max_trx) {
247  log->max_trx = trx_id;
248  }
249 
250  UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf);
251 
253  avail_size = srv_sort_buf_size - log->tail.bytes;
254 
/* If the record does not fit in the current block, assemble it in
the small spill buffer first; it is split across blocks below. */
255  if (mrec_size > avail_size) {
256  b = log->tail.buf;
257  } else {
258  b = log->tail.block + log->tail.bytes;
259  }
260 
261  if (trx_id != 0) {
262  *b++ = ROW_OP_INSERT;
263  trx_write_trx_id(b, trx_id);
264  b += DATA_TRX_ID_LEN;
265  } else {
266  *b++ = ROW_OP_DELETE;
267  }
268 
269  if (extra_size < 0x80) {
270  *b++ = (byte) extra_size;
271  } else {
272  ut_ad(extra_size < 0x8000);
273  *b++ = (byte) (0x80 | (extra_size >> 8));
274  *b++ = (byte) extra_size;
275  }
276 
/* NOTE(review): the record-encoding call (source line 277,
presumably rec_convert_dtuple_to_temp) is missing from this dump. */
278  b + extra_size, index, tuple->fields, tuple->n_fields);
279  b += size;
280 
/* ">=": mrec_size == avail_size exactly fills the block, which must
also be flushed to the log file now. */
281  if (mrec_size >= avail_size) {
282  const os_offset_t byte_offset
283  = (os_offset_t) log->tail.blocks
285  ibool ret;
286 
287  if (byte_offset + srv_sort_buf_size >= srv_online_max_size) {
288  goto write_failed;
289  }
290 
291  if (mrec_size == avail_size) {
292  ut_ad(b == &log->tail.block[srv_sort_buf_size]);
293  } else {
294  ut_ad(b == log->tail.buf + mrec_size);
295  memcpy(log->tail.block + log->tail.bytes,
296  log->tail.buf, avail_size);
297  }
298  UNIV_MEM_ASSERT_RW(log->tail.block, srv_sort_buf_size);
299  ret = os_file_write(
300  "(modification log)",
301  OS_FILE_FROM_FD(log->fd),
302  log->tail.block, byte_offset, srv_sort_buf_size);
303  log->tail.blocks++;
304  if (!ret) {
305 write_failed:
306  /* We set the flag directly instead of invoking
307  dict_set_corrupted_index_cache_only(index) here,
308  because the index is not "public" yet. */
309  index->type |= DICT_CORRUPT;
310  }
311  UNIV_MEM_INVALID(log->tail.block, srv_sort_buf_size);
/* Carry the remainder of the spilled record into the new block. */
312  memcpy(log->tail.block, log->tail.buf + avail_size,
313  mrec_size - avail_size);
314  log->tail.bytes = mrec_size - avail_size;
315  } else {
316  log->tail.bytes += mrec_size;
317  ut_ad(b == log->tail.block + log->tail.bytes);
318  }
319 
320  UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf);
321  mutex_exit(&log->mutex);
322 }
323 
324 /******************************************************/
/* Returns the latest error of the online table-rebuild log for a
clustered index.  The function-name line (source line ~329) is
missing; presumably row_log_table_get_error(). */
327 UNIV_INTERN
328 dberr_t
330 /*====================*/
331  const dict_index_t* index)
333 {
334  ut_ad(dict_index_is_clust(index));
336  return(index->online_log->error);
337 }
338 
339 /******************************************************/
/* Reserves space of `size` bytes in the table-rebuild log buffer.
Returns a pointer to write the record at (the spill buffer if the
record does not fit in the current block), or NULL on a previously
recorded log error.  IMPORTANT: on success, log->mutex is left held
and must be released by row_log_table_close(); on NULL return the
mutex has already been released. */
342 static __attribute__((nonnull, warn_unused_result))
343 byte*
344 row_log_table_open(
345 /*===============*/
346  row_log_t* log,
347  ulint size,
348  ulint* avail)
349 {
350  mutex_enter(&log->mutex);
351 
352  UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf);
353 
354  if (log->error != DB_SUCCESS) {
355  mutex_exit(&log->mutex);
356  return(NULL);
357  }
358 
359  ut_ad(log->tail.bytes < srv_sort_buf_size);
360  *avail = srv_sort_buf_size - log->tail.bytes;
361 
362  if (size > *avail) {
363  return(log->tail.buf);
364  } else {
365  return(log->tail.block + log->tail.bytes);
366  }
367 }
368 
369 /******************************************************/
/* Finishes the record started by row_log_table_open(): flushes a
completed block to the log file (splitting a spilled record across
blocks), advances tail.total, and releases log->mutex.  On log-file
overflow or write failure, records DB_ONLINE_LOG_TOO_BIG in
log->error.  The `b` parameter exists only in debug builds, to
assert that exactly `size` bytes were written. */
371 static __attribute__((nonnull))
372 void
373 row_log_table_close_func(
374 /*=====================*/
375  row_log_t* log,
376 #ifdef UNIV_DEBUG
377  const byte* b,
378 #endif /* UNIV_DEBUG */
379  ulint size,
380  ulint avail)
381 {
382  ut_ad(mutex_own(&log->mutex));
383 
384  if (size >= avail) {
385  const os_offset_t byte_offset
386  = (os_offset_t) log->tail.blocks
388  ibool ret;
389 
390  if (byte_offset + srv_sort_buf_size >= srv_online_max_size) {
391  goto write_failed;
392  }
393 
394  if (size == avail) {
395  ut_ad(b == &log->tail.block[srv_sort_buf_size]);
396  } else {
397  ut_ad(b == log->tail.buf + size);
398  memcpy(log->tail.block + log->tail.bytes,
399  log->tail.buf, avail);
400  }
401  UNIV_MEM_ASSERT_RW(log->tail.block, srv_sort_buf_size);
402  ret = os_file_write(
403  "(modification log)",
404  OS_FILE_FROM_FD(log->fd),
405  log->tail.block, byte_offset, srv_sort_buf_size);
406  log->tail.blocks++;
407  if (!ret) {
408 write_failed:
409  log->error = DB_ONLINE_LOG_TOO_BIG;
410  }
411  UNIV_MEM_INVALID(log->tail.block, srv_sort_buf_size);
412  memcpy(log->tail.block, log->tail.buf + avail, size - avail);
413  log->tail.bytes = size - avail;
414  } else {
415  log->tail.bytes += size;
416  ut_ad(b == log->tail.block + log->tail.bytes);
417  }
418 
419  log->tail.total += size;
420  UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf);
421  mutex_exit(&log->mutex);
422 }
423 
/* Wrapper macro dropping the debug-only `b` argument in release
builds. */
424 #ifdef UNIV_DEBUG
425 # define row_log_table_close(log, b, size, avail) \
426  row_log_table_close_func(log, b, size, avail)
427 #else /* UNIV_DEBUG */
428 # define row_log_table_close(log, b, size, avail) \
429  row_log_table_close_func(log, size, avail)
430 #endif /* UNIV_DEBUG */
431 
432 /******************************************************/
/* Logs a ROW_T_DELETE during online table rebuild: records the old
PRIMARY KEY (plus DB_TRX_ID) of the deleted clustered-index record,
and, when BLOBs may be purged, a cache of off-page column prefixes.
The function-name line (source line ~437) is missing; presumably
row_log_table_delete(). */
435 UNIV_INTERN
436 void
438 /*=================*/
439  const rec_t* rec,
443  const ulint* offsets,
444  bool purge,
445  trx_id_t trx_id)
447 {
448  ulint old_pk_extra_size;
449  ulint old_pk_size;
450  ulint ext_size = 0;
451  ulint mrec_size;
452  ulint avail_size;
453  mem_heap_t* heap = NULL;
454  const dtuple_t* old_pk;
455  row_ext_t* ext;
456 
457  ut_ad(dict_index_is_clust(index));
458  ut_ad(rec_offs_validate(rec, index, offsets));
459  ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index));
460  ut_ad(rec_offs_size(offsets) <= sizeof index->online_log->tail.buf);
461 #ifdef UNIV_SYNC_DEBUG
462  ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED)
463  || rw_lock_own(&index->lock, RW_LOCK_EX));
464 #endif /* UNIV_SYNC_DEBUG */
465 
466  if (dict_index_is_corrupted(index)
467  || !dict_index_is_online_ddl(index)
468  || index->online_log->error != DB_SUCCESS) {
469  return;
470  }
471 
472  dict_table_t* new_table = index->online_log->table;
473  dict_index_t* new_index = dict_table_get_first_index(new_table);
474 
475  ut_ad(dict_index_is_clust(new_index));
476  ut_ad(!dict_index_is_online_ddl(new_index));
477 
478  /* Create the tuple PRIMARY KEY, DB_TRX_ID in the new_table. */
479  if (index->online_log->same_pk) {
480  byte* db_trx_id;
481  dtuple_t* tuple;
482  ut_ad(new_index->n_uniq == index->n_uniq);
483 
484  /* The PRIMARY KEY and DB_TRX_ID are in the first
485  fields of the record. */
486  heap = mem_heap_create(
487  DATA_TRX_ID_LEN
488  + DTUPLE_EST_ALLOC(new_index->n_uniq + 1));
489  old_pk = tuple = dtuple_create(heap, new_index->n_uniq + 1);
490  dict_index_copy_types(tuple, new_index, tuple->n_fields);
491  dtuple_set_n_fields_cmp(tuple, new_index->n_uniq);
492 
493  for (ulint i = 0; i < new_index->n_uniq; i++) {
494  ulint len;
495  const void* field = rec_get_nth_field(
496  rec, offsets, i, &len);
497  dfield_t* dfield = dtuple_get_nth_field(
498  tuple, i);
499  ut_ad(len != UNIV_SQL_NULL);
500  ut_ad(!rec_offs_nth_extern(offsets, i));
501  dfield_set_data(dfield, field, len);
502  }
503 
504  db_trx_id = static_cast<byte*>(
505  mem_heap_alloc(heap, DATA_TRX_ID_LEN));
506  trx_write_trx_id(db_trx_id, trx_id);
507 
508  dfield_set_data(dtuple_get_nth_field(tuple, new_index->n_uniq),
509  db_trx_id, DATA_TRX_ID_LEN);
510  } else {
511  /* The PRIMARY KEY has changed. Translate the tuple. */
512  dfield_t* dfield;
513 
514  old_pk = row_log_table_get_pk(rec, index, offsets, &heap);
515 
516  if (!old_pk) {
517  ut_ad(index->online_log->error != DB_SUCCESS);
518  return;
519  }
520 
521  /* Remove DB_ROLL_PTR. */
/* NOTE(review): the first half of this assertion (source line 522)
is missing from this extraction. */
523  == dict_index_get_n_unique(new_index));
524  ut_ad(dtuple_get_n_fields(old_pk)
525  == dict_index_get_n_unique(new_index) + 2);
526  const_cast<ulint&>(old_pk->n_fields)--;
527 
528  /* Overwrite DB_TRX_ID with the old trx_id. */
529  dfield = dtuple_get_nth_field(old_pk, new_index->n_uniq);
530  ut_ad(dfield_get_type(dfield)->mtype == DATA_SYS);
531  ut_ad(dfield_get_type(dfield)->prtype
532  == (DATA_NOT_NULL | DATA_TRX_ID));
533  ut_ad(dfield_get_len(dfield) == DATA_TRX_ID_LEN);
534  dfield_dup(dfield, heap);
535  trx_write_trx_id(static_cast<byte*>(dfield->data), trx_id);
536  }
537 
538  ut_ad(dtuple_get_n_fields(old_pk) > 1);
539  ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field(
540  old_pk, old_pk->n_fields - 1)->len);
541  old_pk_size = rec_get_converted_size_temp(
542  new_index, old_pk->fields, old_pk->n_fields,
543  &old_pk_extra_size);
544  ut_ad(old_pk_extra_size < 0x100);
545 
/* 1 byte op + 1 byte old_pk_extra_size + 2 bytes ext_size. */
546  mrec_size = 4 + old_pk_size;
547 
548  /* Log enough prefix of the BLOB unless both the
549  old and new table are in COMPACT or REDUNDANT format,
550  which store the prefix in the clustered index record. */
551  if (purge && rec_offs_any_extern(offsets)
552  && (dict_table_get_format(index->table) >= UNIV_FORMAT_B
553  || dict_table_get_format(new_table) >= UNIV_FORMAT_B)) {
554 
555  /* Build a cache of those off-page column prefixes
556  that are referenced by secondary indexes. It can be
557  that none of the off-page columns are needed. */
558  row_build(ROW_COPY_DATA, index, rec,
559  offsets, NULL, NULL, NULL, &ext, heap);
560  if (ext) {
561  /* Log the row_ext_t, ext->ext and ext->buf */
562  ext_size = ext->n_ext * ext->max_len
563  + sizeof(*ext)
564  + ext->n_ext * sizeof(ulint)
565  + (ext->n_ext - 1) * sizeof ext->len;
566  mrec_size += ext_size;
567  }
568  }
569 
570  if (byte* b = row_log_table_open(index->online_log,
571  mrec_size, &avail_size)) {
572  *b++ = ROW_T_DELETE;
573  *b++ = static_cast<byte>(old_pk_extra_size);
574 
575  /* Log the size of external prefix we saved */
576  mach_write_to_2(b, ext_size);
577  b += 2;
578 
/* NOTE(review): the encoding call (source line 579, presumably
rec_convert_dtuple_to_temp) is missing from this extraction. */
580  b + old_pk_extra_size, new_index,
581  old_pk->fields, old_pk->n_fields);
582 
583  b += old_pk_size;
584 
585  if (ext_size) {
586  ulint cur_ext_size = sizeof(*ext)
587  + (ext->n_ext - 1) * sizeof ext->len;
588 
589  memcpy(b, ext, cur_ext_size);
590  b += cur_ext_size;
591 
592  /* Check if we need to col_map to adjust the column
593  number. If columns were added/removed/reordered,
594  adjust the column number. */
595  if (const ulint* col_map =
596  index->online_log->col_map) {
597  for (ulint i = 0; i < ext->n_ext; i++) {
598  const_cast<ulint&>(ext->ext[i]) =
599  col_map[ext->ext[i]];
600  }
601  }
602 
603  memcpy(b, ext->ext, ext->n_ext * sizeof(*ext->ext));
604  b += ext->n_ext * sizeof(*ext->ext);
605 
606  ext_size -= cur_ext_size
607  + ext->n_ext * sizeof(*ext->ext);
608  memcpy(b, ext->buf, ext_size);
609  b += ext_size;
610  }
611 
612  row_log_table_close(
613  index->online_log, b, mrec_size, avail_size);
614  }
615 
616  mem_heap_free(heap);
617 }
618 
619 /******************************************************/
/* Logs a ROW_T_INSERT or ROW_T_UPDATE of a REDUNDANT-format
clustered-index record during online table rebuild.  The record is
first converted to a dtuple (redundant records cannot be copied
byte-wise like compact ones in row_log_table_low). */
621 static
622 void
623 row_log_table_low_redundant(
624 /*========================*/
625  const rec_t* rec,
628  dict_index_t* index,
630  bool insert,
632  const dtuple_t* old_pk,
635  const dict_index_t* new_index)
638 {
639  ulint old_pk_size;
640  ulint old_pk_extra_size;
641  ulint size;
642  ulint extra_size;
643  ulint mrec_size;
644  ulint avail_size;
645  mem_heap_t* heap = NULL;
646  dtuple_t* tuple;
647 
648  ut_ad(!page_is_comp(page_align(rec)));
650  ut_ad(dict_tf_is_valid(index->table->flags));
651  ut_ad(!dict_table_is_comp(index->table)); /* redundant row format */
652  ut_ad(dict_index_is_clust(new_index));
653 
654  heap = mem_heap_create(DTUPLE_EST_ALLOC(index->n_fields));
655  tuple = dtuple_create(heap, index->n_fields);
656  dict_index_copy_types(tuple, index, index->n_fields);
658 
/* 1-byte-offsets records cannot have externally stored columns;
2-byte-offsets records may, hence the extra dfield_set_ext below. */
659  if (rec_get_1byte_offs_flag(rec)) {
660  for (ulint i = 0; i < index->n_fields; i++) {
661  dfield_t* dfield;
662  ulint len;
663  const void* field;
664 
665  dfield = dtuple_get_nth_field(tuple, i);
666  field = rec_get_nth_field_old(rec, i, &len);
667 
668  dfield_set_data(dfield, field, len);
669  }
670  } else {
671  for (ulint i = 0; i < index->n_fields; i++) {
672  dfield_t* dfield;
673  ulint len;
674  const void* field;
675 
676  dfield = dtuple_get_nth_field(tuple, i);
677  field = rec_get_nth_field_old(rec, i, &len);
678 
679  dfield_set_data(dfield, field, len);
680 
681  if (rec_2_is_field_extern(rec, i)) {
682  dfield_set_ext(dfield);
683  }
684  }
685  }
686 
/* NOTE(review): the call assigning `size` (source line 687,
presumably rec_get_converted_size_temp) is missing from this dump. */
688  index, tuple->fields, tuple->n_fields, &extra_size);
689 
690  mrec_size = ROW_LOG_HEADER_SIZE + size + (extra_size >= 0x80);
691 
/* The old PK is logged only for updates that change the PK. */
692  if (insert || index->online_log->same_pk) {
693  ut_ad(!old_pk);
694  old_pk_extra_size = old_pk_size = 0;
695  } else {
696  ut_ad(old_pk);
697  ut_ad(old_pk->n_fields == 2 + old_pk->n_fields_cmp);
698  ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field(
699  old_pk, old_pk->n_fields - 2)->len);
700  ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field(
701  old_pk, old_pk->n_fields - 1)->len);
702 
703  old_pk_size = rec_get_converted_size_temp(
704  new_index, old_pk->fields, old_pk->n_fields,
705  &old_pk_extra_size);
706  ut_ad(old_pk_extra_size < 0x100);
707  mrec_size += 1/*old_pk_extra_size*/ + old_pk_size;
708  }
709 
710  if (byte* b = row_log_table_open(index->online_log,
711  mrec_size, &avail_size)) {
712  *b++ = insert ? ROW_T_INSERT : ROW_T_UPDATE;
713 
714  if (old_pk_size) {
715  *b++ = static_cast<byte>(old_pk_extra_size);
716 
/* NOTE(review): encoding call (source line 717) missing. */
718  b + old_pk_extra_size, new_index,
719  old_pk->fields, old_pk->n_fields);
720  b += old_pk_size;
721  }
722 
723  if (extra_size < 0x80) {
724  *b++ = static_cast<byte>(extra_size);
725  } else {
726  ut_ad(extra_size < 0x8000);
727  *b++ = static_cast<byte>(0x80 | (extra_size >> 8));
728  *b++ = static_cast<byte>(extra_size);
729  }
730 
/* NOTE(review): encoding call (source line 731) missing. */
732  b + extra_size, index, tuple->fields, tuple->n_fields);
733  b += size;
734 
735  row_log_table_close(
736  index->online_log, b, mrec_size, avail_size);
737  }
738 
739  mem_heap_free(heap);
740 }
741 
742 /******************************************************/
/* Logs a ROW_T_INSERT or ROW_T_UPDATE of a clustered-index record
during online table rebuild.  COMPACT-format records are copied
byte-wise (minus the REC_N_NEW_EXTRA_BYTES header); REDUNDANT
records are delegated to row_log_table_low_redundant(). */
744 static __attribute__((nonnull(1,2,3)))
745 void
746 row_log_table_low(
747 /*==============*/
748  const rec_t* rec,
750  dict_index_t* index,
752  const ulint* offsets,
753  bool insert,
754  const dtuple_t* old_pk)
756 {
757  ulint omit_size;
758  ulint old_pk_size;
759  ulint old_pk_extra_size;
760  ulint extra_size;
761  ulint mrec_size;
762  ulint avail_size;
763  const dict_index_t* new_index = dict_table_get_first_index(
764  index->online_log->table);
765  ut_ad(dict_index_is_clust(index));
766  ut_ad(dict_index_is_clust(new_index));
767  ut_ad(!dict_index_is_online_ddl(new_index));
768  ut_ad(rec_offs_validate(rec, index, offsets));
769  ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index));
770  ut_ad(rec_offs_size(offsets) <= sizeof index->online_log->tail.buf);
771 #ifdef UNIV_SYNC_DEBUG
772  ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED)
773  || rw_lock_own(&index->lock, RW_LOCK_EX));
774 #endif /* UNIV_SYNC_DEBUG */
777  ut_ad(!page_is_comp(page_align(rec)) == !rec_offs_comp(offsets));
778 
779  if (dict_index_is_corrupted(index)
780  || !dict_index_is_online_ddl(index)
781  || index->online_log->error != DB_SUCCESS) {
782  return;
783  }
784 
785  if (!rec_offs_comp(offsets)) {
786  row_log_table_low_redundant(
787  rec, index, insert, old_pk, new_index);
788  return;
789  }
790 
792  ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY);
793 
/* The compact-record header is re-derivable on apply; omit it. */
794  omit_size = REC_N_NEW_EXTRA_BYTES;
795 
796  extra_size = rec_offs_extra_size(offsets) - omit_size;
797 
798  mrec_size = ROW_LOG_HEADER_SIZE
799  + (extra_size >= 0x80) + rec_offs_size(offsets) - omit_size;
800 
/* The old PK is logged only for updates that change the PK. */
801  if (insert || index->online_log->same_pk) {
802  ut_ad(!old_pk);
803  old_pk_extra_size = old_pk_size = 0;
804  } else {
805  ut_ad(old_pk);
806  ut_ad(old_pk->n_fields == 2 + old_pk->n_fields_cmp);
807  ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field(
808  old_pk, old_pk->n_fields - 2)->len);
809  ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field(
810  old_pk, old_pk->n_fields - 1)->len);
811 
812  old_pk_size = rec_get_converted_size_temp(
813  new_index, old_pk->fields, old_pk->n_fields,
814  &old_pk_extra_size);
815  ut_ad(old_pk_extra_size < 0x100);
816  mrec_size += 1/*old_pk_extra_size*/ + old_pk_size;
817  }
818 
819  if (byte* b = row_log_table_open(index->online_log,
820  mrec_size, &avail_size)) {
821  *b++ = insert ? ROW_T_INSERT : ROW_T_UPDATE;
822 
823  if (old_pk_size) {
824  *b++ = static_cast<byte>(old_pk_extra_size);
825 
/* NOTE(review): encoding call (source line 826) missing from this
extraction. */
827  b + old_pk_extra_size, new_index,
828  old_pk->fields, old_pk->n_fields);
829  b += old_pk_size;
830  }
831 
832  if (extra_size < 0x80) {
833  *b++ = static_cast<byte>(extra_size);
834  } else {
835  ut_ad(extra_size < 0x8000);
836  *b++ = static_cast<byte>(0x80 | (extra_size >> 8));
837  *b++ = static_cast<byte>(extra_size);
838  }
839 
/* Copy record header (minus omit_size) then record payload. */
840  memcpy(b, rec - rec_offs_extra_size(offsets), extra_size);
841  b += extra_size;
842  memcpy(b, rec, rec_offs_data_size(offsets));
843  b += rec_offs_data_size(offsets);
844 
845  row_log_table_close(
846  index->online_log, b, mrec_size, avail_size);
847  }
848 }
849 
850 /******************************************************/
/* Logs an update of a clustered-index record during online table
rebuild.  The function-name line (source line ~855) is missing;
presumably row_log_table_update(), a thin wrapper over
row_log_table_low() with insert=false. */
853 UNIV_INTERN
854 void
856 /*=================*/
857  const rec_t* rec,
859  dict_index_t* index,
861  const ulint* offsets,
862  const dtuple_t* old_pk)
864 {
865  row_log_table_low(rec, index, offsets, false, old_pk);
866 }
867 
/* Looks up, in the OLD table, the column that maps to column number
col_no of the new (rebuilt) table, by inverting col_map.  Returns
NULL when no old column maps there (i.e. col_no is an added
column). */
873 static
874 const dict_col_t*
875 row_log_table_get_pk_old_col(
876 /*=========================*/
877  const dict_table_t* table,
878  const ulint* col_map,
879  ulint col_no)
880 {
881  for (ulint i = 0; i < table->n_cols; i++) {
882  if (col_no == col_map[i]) {
883  return(dict_table_get_nth_col(table, i));
884  }
885  }
886 
887  return(NULL);
888 }
889 
/* Copies one PRIMARY KEY column value from the old clustered-index
record `rec` into `dfield`, allocating from `heap`.  For externally
stored columns, fetches the needed prefix (DB_TOO_BIG_INDEX_COL if
it exceeds max_len).  Returns DB_SUCCESS, DB_INVALID_NULL for a
NULL value, or DB_TOO_BIG_INDEX_COL. */
902 static
903 dberr_t
904 row_log_table_get_pk_col(
905 /*=====================*/
906  const dict_col_t* col,
907  const dict_field_t* ifield,
908  dfield_t* dfield,
909  mem_heap_t* heap,
910  const rec_t* rec,
911  const ulint* offsets,
912  ulint i,
913  ulint zip_size,
914  ulint max_len)
915 {
916  const byte* field;
917  ulint len;
918 
919  ut_ad(ut_is_2pow(zip_size));
920 
921  field = rec_get_nth_field(rec, offsets, i, &len);
922 
923  if (len == UNIV_SQL_NULL) {
924  return(DB_INVALID_NULL);
925  }
926 
927  if (rec_offs_nth_extern(offsets, i)) {
928  ulint field_len = ifield->prefix_len;
929  byte* blob_field;
930 
/* No index prefix: fetch up to max_len + 1 bytes so that an
over-long value can be detected below. */
931  if (!field_len) {
932  field_len = ifield->fixed_len;
933  if (!field_len) {
934  field_len = max_len + 1;
935  }
936  }
937 
938  blob_field = static_cast<byte*>(
939  mem_heap_alloc(heap, field_len));
940 
/* NOTE(review): the call assigning `len` (source line 941,
presumably btr_copy_externally_stored_field_prefix) is missing from
this extraction. */
942  blob_field, field_len, zip_size, field, len);
943  if (len >= max_len + 1) {
944  return(DB_TOO_BIG_INDEX_COL);
945  }
946 
947  dfield_set_data(dfield, blob_field, len);
948  } else {
949  dfield_set_data(dfield, mem_heap_dup(heap, field, len), len);
950  }
951 
952  return(DB_SUCCESS);
953 }
954 
955 /******************************************************/
/* Constructs the PRIMARY KEY (+ DB_TRX_ID, DB_ROLL_PTR) tuple of
the new table corresponding to clustered-index record `rec` of the
old table.  Returns NULL if the PK is unchanged (same_pk) or on a
logged error.  The function-name line (source line ~962) is
missing; presumably row_log_table_get_pk().  May allocate *heap. */
960 UNIV_INTERN
961 const dtuple_t*
963 /*=================*/
964  const rec_t* rec,
966  dict_index_t* index,
968  const ulint* offsets,
969  mem_heap_t** heap)
970 {
971  dtuple_t* tuple = NULL;
972  row_log_t* log = index->online_log;
973 
974  ut_ad(dict_index_is_clust(index));
976  ut_ad(!offsets || rec_offs_validate(rec, index, offsets));
977 #ifdef UNIV_SYNC_DEBUG
978  ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED)
979  || rw_lock_own(&index->lock, RW_LOCK_EX));
980 #endif /* UNIV_SYNC_DEBUG */
981 
982  ut_ad(log);
983  ut_ad(log->table);
984 
985  if (log->same_pk) {
986  /* The PRIMARY KEY columns are unchanged. */
987  return(NULL);
988  }
989 
990  mutex_enter(&log->mutex);
991 
992  /* log->error is protected by log->mutex. */
993  if (log->error == DB_SUCCESS) {
994  dict_table_t* new_table = log->table;
995  dict_index_t* new_index
996  = dict_table_get_first_index(new_table);
997  const ulint new_n_uniq
998  = dict_index_get_n_unique(new_index);
999 
/* Size the heap for the offsets array (if needed), the tuple, and
the minimum sizes of the PK column values. */
1000  if (!*heap) {
1001  ulint size = 0;
1002 
1003  if (!offsets) {
1004  size += (1 + REC_OFFS_HEADER_SIZE
1005  + index->n_fields)
1006  * sizeof *offsets;
1007  }
1008 
1009  for (ulint i = 0; i < new_n_uniq; i++) {
1010  size += dict_col_get_min_size(
1011  dict_index_get_nth_col(new_index, i));
1012  }
1013 
1014  *heap = mem_heap_create(
1015  DTUPLE_EST_ALLOC(new_n_uniq + 2) + size);
1016  }
1017 
1018  if (!offsets) {
1019  offsets = rec_get_offsets(rec, index, NULL,
1020  ULINT_UNDEFINED, heap);
1021  }
1022 
/* PK columns + DB_TRX_ID + DB_ROLL_PTR. */
1023  tuple = dtuple_create(*heap, new_n_uniq + 2);
1024  dict_index_copy_types(tuple, new_index, tuple->n_fields);
1025  dtuple_set_n_fields_cmp(tuple, new_n_uniq);
1026 
1027  const ulint max_len = DICT_MAX_FIELD_LEN_BY_FORMAT(new_table);
1028  const ulint zip_size = dict_table_zip_size(index->table);
1029 
1030  for (ulint new_i = 0; new_i < new_n_uniq; new_i++) {
1031  dict_field_t* ifield;
1032  dfield_t* dfield;
1033  ulint prtype;
1034  ulint mbminmaxlen;
1035 
1036  ifield = dict_index_get_nth_field(new_index, new_i);
1037  dfield = dtuple_get_nth_field(tuple, new_i);
1038 
1039  const ulint col_no
1040  = dict_field_get_col(ifield)->ind;
1041 
1042  if (const dict_col_t* col
1043  = row_log_table_get_pk_old_col(
1044  index->table, log->col_map, col_no)) {
1045  ulint i = dict_col_get_clust_pos(col, index);
1046 
1047  if (i == ULINT_UNDEFINED) {
1048  ut_ad(0);
1049  log->error = DB_CORRUPTION;
1050  goto err_exit;
1051  }
1052 
1053  log->error = row_log_table_get_pk_col(
1054  col, ifield, dfield, *heap,
1055  rec, offsets, i, zip_size, max_len);
1056 
1057  if (log->error != DB_SUCCESS) {
1058 err_exit:
1059  tuple = NULL;
1060  goto func_exit;
1061  }
1062 
1063  mbminmaxlen = col->mbminmaxlen;
1064  prtype = col->prtype;
1065  } else {
1066  /* No matching column was found in the old
1067  table, so this must be an added column.
1068  Copy the default value. */
1069  ut_ad(log->add_cols);
1070 
1071  dfield_copy(dfield, dtuple_get_nth_field(
1072  log->add_cols, col_no));
1073  mbminmaxlen = dfield->type.mbminmaxlen;
1074  prtype = dfield->type.prtype;
1075  }
1076 
1077  ut_ad(!dfield_is_ext(dfield));
1078  ut_ad(!dfield_is_null(dfield));
1079 
/* Trim the value to the index prefix length in characters. */
1080  if (ifield->prefix_len) {
1081  ulint len = dtype_get_at_most_n_mbchars(
1082  prtype, mbminmaxlen,
1083  ifield->prefix_len,
1084  dfield_get_len(dfield),
1085  static_cast<const char*>(
1086  dfield_get_data(dfield)));
1087 
1088  ut_ad(len <= dfield_get_len(dfield));
1089  dfield_set_len(dfield, len);
1090  }
1091  }
1092 
/* DB_TRX_ID and DB_ROLL_PTR are adjacent in the record; point at
them in place rather than copying. */
1093  const byte* trx_roll = rec
1094  + row_get_trx_id_offset(index, offsets);
1095 
1096  dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq),
1097  trx_roll, DATA_TRX_ID_LEN);
1098  dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq + 1),
1099  trx_roll + DATA_TRX_ID_LEN, DATA_ROLL_PTR_LEN);
1100  }
1101 
1102 func_exit:
1103  mutex_exit(&log->mutex);
1104  return(tuple);
1105 }
1106 
1107 /******************************************************/
/* Logs an insert of a clustered-index record during online table
rebuild.  The function-name line (source line ~1112) is missing;
presumably row_log_table_insert(), a thin wrapper over
row_log_table_low() with insert=true. */
1110 UNIV_INTERN
1111 void
1113 /*=================*/
1114  const rec_t* rec,
1116  dict_index_t* index,
1118  const ulint* offsets)
1119 {
1120  row_log_table_low(rec, index, offsets, true, NULL);
1121 }
1122 
1123 /******************************************************/
/* Notes that a BLOB page was freed during online table rebuild, at
the current log position (tail.total), so that later log apply can
avoid dereferencing it.  The function-name line (source line ~1127)
is missing; presumably row_log_table_blob_free().  Caller holds the
index X-lock. */
1125 UNIV_INTERN
1126 void
1128 /*====================*/
1129  dict_index_t* index,
1130  ulint page_no)
1131 {
1132  ut_ad(dict_index_is_clust(index));
1134 #ifdef UNIV_SYNC_DEBUG
1135  ut_ad(rw_lock_own(&index->lock, RW_LOCK_EX));
1136 #endif /* UNIV_SYNC_DEBUG */
1137  ut_ad(page_no != FIL_NULL);
1138 
1139  if (index->online_log->error != DB_SUCCESS) {
1140  return;
1141  }
1142 
/* Lazily create the page-tracking map on first free. */
1143  page_no_map* blobs = index->online_log->blobs;
1144 
1145  if (!blobs) {
1146  index->online_log->blobs = blobs = new page_no_map();
1147  }
1148 
1149 #ifdef UNIV_DEBUG
1150  const ulonglong log_pos = index->online_log->tail.total;
1151 #else
1152 # define log_pos /* empty */
1153 #endif /* UNIV_DEBUG */
1154 
1155  const page_no_map::value_type v(page_no,
1156  row_log_table_blob_t(log_pos));
1157 
1158  std::pair<page_no_map::iterator,bool> p = blobs->insert(v);
1159 
1160  if (!p.second) {
1161  /* Update the existing mapping. */
1162  ut_ad(p.first->first == page_no);
1163  p.first->second.blob_free(log_pos);
1164  }
1165 #undef log_pos
1166 }
1167 
1168 /******************************************************/
/* Notes that a previously freed BLOB page was reallocated during
online table rebuild.  The function-name line (source line ~1172)
is missing; presumably row_log_table_blob_alloc().  Caller holds
the index X-lock. */
1170 UNIV_INTERN
1171 void
1173 /*=====================*/
1174  dict_index_t* index,
1175  ulint page_no)
1176 {
1177  ut_ad(dict_index_is_clust(index));
1179 #ifdef UNIV_SYNC_DEBUG
1180  ut_ad(rw_lock_own(&index->lock, RW_LOCK_EX));
1181 #endif /* UNIV_SYNC_DEBUG */
1182  ut_ad(page_no != FIL_NULL);
1183 
1184  if (index->online_log->error != DB_SUCCESS) {
1185  return;
1186  }
1187 
1188  /* Only track allocations if the same page has been freed
1189  earlier. Double allocation without a free is not allowed. */
1190  if (page_no_map* blobs = index->online_log->blobs) {
1191  page_no_map::iterator p = blobs->find(page_no);
1192 
1193  if (p != blobs->end()) {
1194  ut_ad(p->first == page_no);
1195  p->second.blob_alloc(index->online_log->tail.total);
1196  }
1197  }
1198 }
1199 
1200 /******************************************************/
/* Converts a logged record (mrec) of the old table into a row
tuple of the new table when applying the rebuild log.  Returns the
row, or NULL if a referenced BLOB page had been freed (a later log
record will supersede this one) or on *error != DB_SUCCESS (e.g.
DB_INVALID_NULL when a NULL arrives for a newly NOT NULL column). */
1204 static __attribute__((nonnull, warn_unused_result))
1205 const dtuple_t*
1206 row_log_table_apply_convert_mrec(
1207 /*=============================*/
1208  const mrec_t* mrec,
1209  dict_index_t* index,
1210  const ulint* offsets,
1211  const row_log_t* log,
1212  mem_heap_t* heap,
1213  trx_id_t trx_id,
1214  dberr_t* error)
1216 {
1217  dtuple_t* row;
1218 
1219  /* This is based on row_build(). */
1220  if (log->add_cols) {
1221  row = dtuple_copy(log->add_cols, heap);
1222  /* dict_table_copy_types() would set the fields to NULL */
1223  for (ulint i = 0; i < dict_table_get_n_cols(log->table); i++) {
/* NOTE(review): the call these arguments belong to (source line
1224, presumably dict_col_copy_type) is missing from this dump. */
1225  dict_table_get_nth_col(log->table, i),
1226  dfield_get_type(dtuple_get_nth_field(row, i)));
1227  }
1228  } else {
1229  row = dtuple_create(heap, dict_table_get_n_cols(log->table));
1230  dict_table_copy_types(row, log->table);
1231  }
1232 
1233  for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) {
1234  const dict_field_t* ind_field
1235  = dict_index_get_nth_field(index, i);
1236 
1237  if (ind_field->prefix_len) {
1238  /* Column prefixes can only occur in key
1239  fields, which cannot be stored externally. For
1240  a column prefix, there should also be the full
1241  field in the clustered index tuple. The row
1242  tuple comprises full fields, not prefixes. */
1243  ut_ad(!rec_offs_nth_extern(offsets, i));
1244  continue;
1245  }
1246 
1247  const dict_col_t* col
1248  = dict_field_get_col(ind_field);
1249  ulint col_no
1250  = log->col_map[dict_col_get_no(col)];
1251 
1252  if (col_no == ULINT_UNDEFINED) {
1253  /* dropped column */
1254  continue;
1255  }
1256 
1257  dfield_t* dfield
1258  = dtuple_get_nth_field(row, col_no);
1259  ulint len;
1260  const byte* data= NULL;
1261 
1262  if (rec_offs_nth_extern(offsets, i)) {
1263  ut_ad(rec_offs_any_extern(offsets));
/* X-latch the index to get a consistent view of log->blobs while
checking whether the BLOB page is still allocated. */
1264  rw_lock_x_lock(dict_index_get_lock(index));
1265 
1266  if (const page_no_map* blobs = log->blobs) {
1267  data = rec_get_nth_field(
1268  mrec, offsets, i, &len);
1270 
/* The BLOB pointer's page number sits at a fixed offset from the
end of the 20-byte external field reference. */
1271  ulint page_no = mach_read_from_4(
1272  data + len - (BTR_EXTERN_FIELD_REF_SIZE
1273  - BTR_EXTERN_PAGE_NO));
1274  page_no_map::const_iterator p = blobs->find(
1275  page_no);
1276  if (p != blobs->end()
1277  && p->second.is_freed(log->head.total)) {
1278  /* This BLOB has been freed.
1279  We must not access the row. */
1280  row = NULL;
1281  }
1282  }
1283 
1284  if (row) {
/* NOTE(review): the call assigning `data` (source line 1285,
presumably btr_rec_copy_externally_stored_field) is missing from
this extraction. */
1286  mrec, offsets,
1287  dict_table_zip_size(index->table),
1288  i, &len, heap);
1289  ut_a(data);
1290  }
1291 
1292  rw_lock_x_unlock(dict_index_get_lock(index));
1293 
1294  if (!row) {
1295  goto func_exit;
1296  }
1297  } else {
1298  data = rec_get_nth_field(mrec, offsets, i, &len);
1299  }
1300 
1301  dfield_set_data(dfield, data, len);
1302 
1303  /* See if any columns were changed to NULL or NOT NULL. */
1304  const dict_col_t* new_col
1305  = dict_table_get_nth_col(log->table, col_no);
1306  ut_ad(new_col->mtype == col->mtype);
1307 
1308  /* Assert that prtype matches except for nullability. */
1309  ut_ad(!((new_col->prtype ^ col->prtype) & ~DATA_NOT_NULL));
1310  ut_ad(!((new_col->prtype ^ dfield_get_type(dfield)->prtype)
1311  & ~DATA_NOT_NULL));
1312 
1313  if (new_col->prtype == col->prtype) {
1314  continue;
1315  }
1316 
1317  if ((new_col->prtype & DATA_NOT_NULL)
1318  && dfield_is_null(dfield)) {
1319  /* We got a NULL value for a NOT NULL column. */
1320  *error = DB_INVALID_NULL;
1321  return(NULL);
1322  }
1323 
1324  /* Adjust the DATA_NOT_NULL flag in the parsed row. */
1325  dfield_get_type(dfield)->prtype = new_col->prtype;
1326 
1327  ut_ad(dict_col_type_assert_equal(new_col,
1328  dfield_get_type(dfield)));
1329  }
1330 
1331 func_exit:
1332  *error = DB_SUCCESS;
1333  return(row);
1334 }
1335 
1336 /******************************************************/
/* NOTE(review): this listing is a Doxygen-HTML extraction. The embedded
numbers (1336, 1339, ...) are the ORIGINAL source line numbers; gaps in
that numbering (e.g. 1337-1338, 1344-1345, 1378, 1401 below) mark lines
lost during extraction -- typically Doxygen hyperlink lines such as the
function's doc comment, parameter annotations, and function-call lines.
Only comments are added here; all surviving code tokens are unchanged. */
/* Purpose: applies one logged insert to the table being rebuilt by
online ALTER TABLE.  The converted row is inserted into the clustered
index of the new table first; on success the loop below inserts the
corresponding entry into every subsequent index, skipping FULLTEXT
(DICT_FTS) indexes.  DB_SUCCESS_LOCKED_REC from the clustered-index
insert means the row had already been copied and is reported as
DB_SUCCESS. */
1339 static __attribute__((nonnull, warn_unused_result))
1340 dberr_t
1341 row_log_table_apply_insert_low(
1342 /*===========================*/
1343  que_thr_t* thr,
1346  const dtuple_t* row,
1348  trx_id_t trx_id,
1349  mem_heap_t* heap,
1350  row_merge_dup_t* dup)
1352 {
1353  dberr_t error;
1354  dtuple_t* entry;
1355  const row_log_t*log = dup->index->online_log;
1356  dict_index_t* index = dict_table_get_first_index(log->table);
1357 
1358  ut_ad(dtuple_validate(row));
1359  ut_ad(trx_id);
1360 
1361 #ifdef ROW_LOG_APPLY_PRINT
1362  if (row_log_apply_print) {
1363  fprintf(stderr, "table apply insert "
1364  IB_ID_FMT " " IB_ID_FMT "\n",
1365  index->table->id, index->id);
1366  dtuple_print(stderr, row);
1367  }
1368 #endif /* ROW_LOG_APPLY_PRINT */
1369 
 /* NOTE(review): original lines 1372-1373 (two flag constants between
 BTR_CREATE_FLAG and BTR_KEEP_SYS_FLAG) were lost in extraction. */
1370  static const ulint flags
1371  = (BTR_CREATE_FLAG
1374  | BTR_KEEP_SYS_FLAG);
1375 
1376  entry = row_build_index_entry(row, NULL, index, heap);
1377 
 /* NOTE(review): original line 1378 -- the clustered-index insert call
 whose result is assigned to `error` -- was lost in extraction; only
 its argument list survives below. */
1379  flags, BTR_MODIFY_TREE, index, index->n_uniq, entry, 0, thr);
1380 
1381  switch (error) {
1382  case DB_SUCCESS:
1383  break;
1384  case DB_SUCCESS_LOCKED_REC:
1385  /* The row had already been copied to the table. */
1386  return(DB_SUCCESS);
1387  default:
1388  return(error);
1389  }
1390 
1391  do {
1392  if (!(index = dict_table_get_next_index(index))) {
1393  break;
1394  }
1395 
1396  if (index->type & DICT_FTS) {
 /* FULLTEXT indexes are not maintained through this path. */
1397  continue;
1398  }
1399 
1400  entry = row_build_index_entry(row, NULL, index, heap);
 /* NOTE(review): original line 1401 -- the secondary-index insert
 call assigned to `error` -- was lost in extraction. */
1402  flags, BTR_MODIFY_TREE,
1403  index, offsets_heap, heap, entry, trx_id, thr);
1404  } while (error == DB_SUCCESS);
1405 
1406  return(error);
1407 }
1408 
1409 /******************************************************/
/* NOTE(review): Doxygen-HTML extraction; embedded numbers are original
source lines, and gaps (1410-1411, 1420, 1423) mark lines lost in
extraction (doc comment and parameter annotations).  Code unchanged. */
/* Purpose: converts one logged ROW_T_INSERT record (`mrec`) into a row
in the new table format via row_log_table_apply_convert_mrec(), then
hands it to row_log_table_apply_insert_low().  A NULL `row` with
DB_SUCCESS means the row need not (or cannot) be applied -- e.g. its
BLOBs were already freed -- and is silently skipped.  On failure the
offending row is reported to MySQL via innobase_row_to_mysql(). */
1412 static __attribute__((nonnull, warn_unused_result))
1413 dberr_t
1414 row_log_table_apply_insert(
1415 /*=======================*/
1416  que_thr_t* thr,
1417  const mrec_t* mrec,
1418  const ulint* offsets,
1419  mem_heap_t* offsets_heap,
1421  mem_heap_t* heap,
1422  row_merge_dup_t* dup,
1424  trx_id_t trx_id)
1425 {
1426  const row_log_t*log = dup->index->online_log;
1427  dberr_t error;
1428  const dtuple_t* row = row_log_table_apply_convert_mrec(
1429  mrec, dup->index, offsets, log, heap, trx_id, &error);
1430 
1431  ut_ad(error == DB_SUCCESS || !row);
1432  /* Handling of duplicate key error requires storing
1433  of offending key in a record buffer. */
1434  ut_ad(error != DB_DUPLICATE_KEY);
1435 
1436  if (error != DB_SUCCESS)
1437  return(error);
1438 
1439  if (row) {
1440  error = row_log_table_apply_insert_low(
1441  thr, row, trx_id, offsets_heap, heap, dup);
1442  if (error != DB_SUCCESS) {
1443  /* Report the erroneous row using the new
1444  version of the table. */
1445  innobase_row_to_mysql(dup->table, log->table, row);
1446  }
1447  }
1448  return(error);
1449 }
1450 
1451 /******************************************************/
/* NOTE(review): Doxygen-HTML extraction; embedded numbers are original
source lines, and gaps (1452-1453, 1459, 1462, 1465, 1518) mark lines
lost in extraction.  Code unchanged; comments only. */
/* Purpose: deletes the clustered-index record that `pcur` is positioned
on (in the table being rebuilt), then removes the matching entry from
every subsequent non-FTS index.  If secondary indexes exist, a row
template is first built from the record so that their entries can be
constructed; `save_ext` supplies externally-stored-column prefixes when
the caller already reconstructed them from the log.  `mtr` must hold the
clustered-index leaf latched; it is committed inside this function. */
1454 static __attribute__((nonnull(1, 2, 4, 5), warn_unused_result))
1455 dberr_t
1456 row_log_table_apply_delete_low(
1457 /*===========================*/
1458  btr_pcur_t* pcur,
1460  const ulint* offsets,
1461  const row_ext_t* save_ext,
1463  mem_heap_t* heap,
1464  mtr_t* mtr)
1466 {
1467  dberr_t error;
1468  row_ext_t* ext;
1469  dtuple_t* row;
1470  dict_index_t* index = btr_pcur_get_btr_cur(pcur)->index;
1471 
1472  ut_ad(dict_index_is_clust(index));
1473 
1474 #ifdef ROW_LOG_APPLY_PRINT
1475  if (row_log_apply_print) {
1476  fprintf(stderr, "table apply delete "
1477  IB_ID_FMT " " IB_ID_FMT "\n",
1478  index->table->id, index->id);
1479  rec_print_new(stderr, btr_pcur_get_rec(pcur), offsets);
1480  }
1481 #endif /* ROW_LOG_APPLY_PRINT */
1482  if (dict_table_get_next_index(index)) {
1483  /* Build a row template for purging secondary index entries. */
1484  row = row_build(
1485  ROW_COPY_DATA, index, btr_pcur_get_rec(pcur),
1486  offsets, NULL, NULL, NULL,
1487  save_ext ? NULL : &ext, heap);
1488  if (!save_ext) {
1489  save_ext = ext;
1490  }
1491  } else {
1492  row = NULL;
1493  }
1494 
 /* Remove the clustered index record first. */
1495  btr_cur_pessimistic_delete(&error, FALSE, btr_pcur_get_btr_cur(pcur),
1496  BTR_CREATE_FLAG, RB_NONE, mtr);
1497  mtr_commit(mtr);
1498 
1499  if (error != DB_SUCCESS) {
1500  return(error);
1501  }
1502 
1503  while ((index = dict_table_get_next_index(index)) != NULL) {
1504  if (index->type & DICT_FTS) {
1505  continue;
1506  }
1507 
1508  const dtuple_t* entry = row_build_index_entry(
1509  row, save_ext, index, heap);
1510  mtr_start(mtr);
1511  btr_pcur_open(index, entry, PAGE_CUR_LE,
1512  BTR_MODIFY_TREE, pcur, mtr);
1513 #ifdef UNIV_DEBUG
 /* Debug-only sanity check on the cursor outcome.  NOTE(review):
 original line 1518 (presumably one more case label) was lost in
 extraction; the buffering cases fall through to ut_ad(0). */
1514  switch (btr_pcur_get_btr_cur(pcur)->flag) {
1515  case BTR_CUR_DELETE_REF:
1516  case BTR_CUR_DEL_MARK_IBUF:
1517  case BTR_CUR_DELETE_IBUF:
1519  /* We did not request buffering. */
1520  break;
1521  case BTR_CUR_HASH:
1522  case BTR_CUR_HASH_FAIL:
1523  case BTR_CUR_BINARY:
1524  goto flag_ok;
1525  }
1526  ut_ad(0);
1527 flag_ok:
1528 #endif /* UNIV_DEBUG */
1529 
1530  if (page_rec_is_infimum(btr_pcur_get_rec(pcur))
1531  || btr_pcur_get_low_match(pcur) < index->n_uniq) {
1532  /* All secondary index entries should be
1533  found, because new_table is being modified by
1534  this thread only, and all indexes should be
1535  updated in sync. */
1536  mtr_commit(mtr);
1537  return(DB_INDEX_CORRUPT);
1538  }
1539 
1540  btr_cur_pessimistic_delete(&error, FALSE,
1541  btr_pcur_get_btr_cur(pcur),
1542  BTR_CREATE_FLAG, RB_NONE, mtr);
1543  mtr_commit(mtr);
1544  }
1545 
1546  return(error);
1547 }
1548 
1549 /******************************************************/
/* NOTE(review): Doxygen-HTML extraction; embedded numbers are original
source lines, and gaps (1550-1551, 1558-1559, 1563, 1567, 1601) mark
lines lost in extraction (doc comment, parameter annotations, and
probably a switch case label at 1601).  Code unchanged; comments only. */
/* Purpose: applies one logged ROW_T_DELETE record.  Builds a search
tuple from the logged PRIMARY KEY (+ DB_TRX_ID) fields, positions a
persistent cursor on the clustered index of the rebuilt table, and --
only if the record's DB_TRX_ID matches the buffered one, i.e. the row
version is the same one that was logged -- delegates the actual removal
to row_log_table_apply_delete_low().  A record that is not found, or
whose DB_TRX_ID differs, is treated as already handled (DB_SUCCESS). */
1552 static __attribute__((nonnull(1, 3, 4, 5, 6, 7), warn_unused_result))
1553 dberr_t
1554 row_log_table_apply_delete(
1555 /*=======================*/
1556  que_thr_t* thr,
1557  ulint trx_id_col,
1560  const mrec_t* mrec,
1561  const ulint* moffsets,
1562  mem_heap_t* offsets_heap,
1564  mem_heap_t* heap,
1565  dict_table_t* new_table,
1566  const row_ext_t* save_ext)
1568 {
1569  dict_index_t* index = dict_table_get_first_index(new_table);
1570  dtuple_t* old_pk;
1571  mtr_t mtr;
1572  btr_pcur_t pcur;
1573  ulint* offsets;
1574 
1575  ut_ad(rec_offs_n_fields(moffsets)
1576  == dict_index_get_n_unique(index) + 1);
1577  ut_ad(!rec_offs_any_extern(moffsets));
1578 
1579  /* Convert the row to a search tuple. */
1580  old_pk = dtuple_create(heap, index->n_uniq + 1);
1581  dict_index_copy_types(old_pk, index, old_pk->n_fields);
1582  dtuple_set_n_fields_cmp(old_pk, index->n_uniq);
1583 
1584  for (ulint i = 0; i <= index->n_uniq; i++) {
1585  ulint len;
1586  const void* field;
1587  field = rec_get_nth_field(mrec, moffsets, i, &len);
1588  ut_ad(len != UNIV_SQL_NULL);
1589  dfield_set_data(dtuple_get_nth_field(old_pk, i),
1590  field, len);
1591  }
1592 
1593  mtr_start(&mtr);
1594  btr_pcur_open(index, old_pk, PAGE_CUR_LE,
1595  BTR_MODIFY_TREE, &pcur, &mtr);
1596 #ifdef UNIV_DEBUG
 /* Debug-only sanity check: the cursor must not have been diverted
 to insert/delete buffering. */
1597  switch (btr_pcur_get_btr_cur(&pcur)->flag) {
1598  case BTR_CUR_DELETE_REF:
1599  case BTR_CUR_DEL_MARK_IBUF:
1600  case BTR_CUR_DELETE_IBUF:
1602  /* We did not request buffering. */
1603  break;
1604  case BTR_CUR_HASH:
1605  case BTR_CUR_HASH_FAIL:
1606  case BTR_CUR_BINARY:
1607  goto flag_ok;
1608  }
1609  ut_ad(0);
1610 flag_ok:
1611 #endif /* UNIV_DEBUG */
1612 
1613  if (page_rec_is_infimum(btr_pcur_get_rec(&pcur))
1614  || btr_pcur_get_low_match(&pcur) < index->n_uniq) {
1615 all_done:
1616  mtr_commit(&mtr);
1617  /* The record was not found. All done. */
1618  return(DB_SUCCESS);
1619  }
1620 
1621  offsets = rec_get_offsets(btr_pcur_get_rec(&pcur), index, NULL,
1622  ULINT_UNDEFINED, &offsets_heap);
1623 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
1624  ut_a(!rec_offs_any_null_extern(btr_pcur_get_rec(&pcur), offsets));
1625 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
1626 
1627  /* Only remove the record if DB_TRX_ID matches what was
1628  buffered. */
1629 
1630  {
1631  ulint len;
1632  const void* mrec_trx_id
1633  = rec_get_nth_field(mrec, moffsets, trx_id_col, &len);
1634  ut_ad(len == DATA_TRX_ID_LEN);
1635  const void* rec_trx_id
1636  = rec_get_nth_field(btr_pcur_get_rec(&pcur), offsets,
1637  trx_id_col, &len);
1638  ut_ad(len == DATA_TRX_ID_LEN);
1639  if (memcmp(mrec_trx_id, rec_trx_id, DATA_TRX_ID_LEN)) {
 /* Different row version: a later log record covers it. */
1640  goto all_done;
1641  }
1642  }
1643 
1644  return(row_log_table_apply_delete_low(&pcur, offsets, save_ext,
1645  heap, &mtr));
1646 }
1647 
1648 /******************************************************/
/* NOTE(review): Doxygen-HTML extraction; embedded numbers are original
source lines.  Gaps in that numbering mark lines lost in extraction --
notably the doc comment (1649-1650), parameter annotations, and several
function-call lines: 1682 (first half of a ut_ad), 1740 (the
row_build_index_entry call defining `entry`), 1845-1849 (the
btr_cur_pessimistic_update call assigned to `error`), 1856 (the
BLOB-store call), 1889 (the secondary-index search call), 1896 (the
btr_cur_pessimistic_delete call), and 1907-1909 (the secondary-index
insert call).  Code unchanged; comments only. */
/* Purpose: applies one logged ROW_T_UPDATE record.  Converts the logged
new row image, looks up the old row by `old_pk` in the rebuilt table,
and then either (a) inserts the row if it was not found, (b) performs a
delete+insert when externally stored columns or a PRIMARY KEY change
are involved, or (c) updates the clustered index record in place and
fixes up each affected non-FTS secondary index. */
1651 static __attribute__((nonnull, warn_unused_result))
1652 dberr_t
1653 row_log_table_apply_update(
1654 /*=======================*/
1655  que_thr_t* thr,
1656  ulint trx_id_col,
1659  ulint new_trx_id_col,
1662  const mrec_t* mrec,
1663  const ulint* offsets,
1664  mem_heap_t* offsets_heap,
1666  mem_heap_t* heap,
1667  row_merge_dup_t* dup,
1669  trx_id_t trx_id,
1670  const dtuple_t* old_pk)
1674 {
1675  const row_log_t*log = dup->index->online_log;
1676  const dtuple_t* row;
1677  dict_index_t* index = dict_table_get_first_index(log->table);
1678  mtr_t mtr;
1679  btr_pcur_t pcur;
1680  dberr_t error;
1681 
 /* NOTE(review): first half of this assertion (original line 1682)
 was lost in extraction. */
1683  == dict_index_get_n_unique(index));
1684  ut_ad(dtuple_get_n_fields(old_pk)
1685  == dict_index_get_n_unique(index)
1686  + (dup->index->online_log->same_pk ? 0 : 2));
1687 
1688  row = row_log_table_apply_convert_mrec(
1689  mrec, dup->index, offsets, log, heap, trx_id, &error);
1690 
1691  ut_ad(error == DB_SUCCESS || !row);
1692  /* Handling of duplicate key error requires storing
1693  of offending key in a record buffer. */
1694  ut_ad(error != DB_DUPLICATE_KEY);
1695 
1696  if (!row) {
1697  return(error);
1698  }
1699 
1700  mtr_start(&mtr);
1701  btr_pcur_open(index, old_pk, PAGE_CUR_LE,
1702  BTR_MODIFY_TREE, &pcur, &mtr);
1703 #ifdef UNIV_DEBUG
1704  switch (btr_pcur_get_btr_cur(&pcur)->flag) {
1705  case BTR_CUR_DELETE_REF:
1706  case BTR_CUR_DEL_MARK_IBUF:
1707  case BTR_CUR_DELETE_IBUF:
1709  ut_ad(0);/* We did not request buffering. */
1710  case BTR_CUR_HASH:
1711  case BTR_CUR_HASH_FAIL:
1712  case BTR_CUR_BINARY:
1713  break;
1714  }
1715 #endif /* UNIV_DEBUG */
1716 
1717  if (page_rec_is_infimum(btr_pcur_get_rec(&pcur))
1718  || btr_pcur_get_low_match(&pcur) < index->n_uniq) {
1719  mtr_commit(&mtr);
1720 insert:
1721  ut_ad(mtr.state == MTR_COMMITTED);
1722  /* The row was not found. Insert it. */
1723  error = row_log_table_apply_insert_low(
1724  thr, row, trx_id, offsets_heap, heap, dup);
1725  if (error != DB_SUCCESS) {
1726 err_exit:
1727  /* Report the erroneous row using the new
1728  version of the table. */
1729  innobase_row_to_mysql(dup->table, log->table, row);
1730  }
1731 
1732  return(error);
1733  }
1734 
1735  /* Update the record. */
1736  ulint* cur_offsets = rec_get_offsets(
1737  btr_pcur_get_rec(&pcur),
1738  index, NULL, ULINT_UNDEFINED, &offsets_heap);
1739 
 /* NOTE(review): original line 1740 (the declaration of `entry` via
 row_build_index_entry) was lost in extraction; only its trailing
 argument list survives below. */
1741  row, NULL, index, heap);
1742  const upd_t* update = row_upd_build_difference_binary(
1743  index, entry, btr_pcur_get_rec(&pcur), cur_offsets,
1744  false, NULL, heap);
1745 
1746  error = DB_SUCCESS;
1747 
1748  if (!update->n_fields) {
1749  /* Nothing to do. */
1750  goto func_exit;
1751  }
1752 
1753  if (rec_offs_any_extern(cur_offsets)) {
1754  /* If the record contains any externally stored
1755  columns, perform the update by delete and insert,
1756  because we will not write any undo log that would
1757  allow purge to free any orphaned externally stored
1758  columns. */
1759 delete_insert:
1760  error = row_log_table_apply_delete_low(
1761  &pcur, cur_offsets, NULL, heap, &mtr);
1762  ut_ad(mtr.state == MTR_COMMITTED);
1763 
1764  if (error != DB_SUCCESS) {
1765  goto err_exit;
1766  }
1767 
1768  goto insert;
1769  }
1770 
1771  if (upd_get_nth_field(update, 0)->field_no < new_trx_id_col) {
1772  if (dup->index->online_log->same_pk) {
1773  /* The ROW_T_UPDATE log record should only be
1774  written when the PRIMARY KEY fields of the
1775  record did not change in the old table. We
1776  can only get a change of PRIMARY KEY columns
1777  in the rebuilt table if the PRIMARY KEY was
1778  redefined (!same_pk). */
1779  ut_ad(0);
1780  error = DB_CORRUPTION;
1781  goto func_exit;
1782  }
1783 
1784  /* The PRIMARY KEY columns have changed.
1785  Delete the record with the old PRIMARY KEY value,
1786  provided that it carries the same
1787  DB_TRX_ID,DB_ROLL_PTR. Then, insert the new row. */
1788  ulint len;
1789  const byte* cur_trx_roll = rec_get_nth_field(
1790  mrec, offsets, trx_id_col, &len);
1791  ut_ad(len == DATA_TRX_ID_LEN);
1792  const dfield_t* new_trx_roll = dtuple_get_nth_field(
1793  old_pk, new_trx_id_col);
1794  /* We assume that DB_TRX_ID,DB_ROLL_PTR are stored
1795  in one contiguous block. */
1796  ut_ad(rec_get_nth_field(mrec, offsets, trx_id_col + 1, &len)
1797  == cur_trx_roll + DATA_TRX_ID_LEN);
1798  ut_ad(len == DATA_ROLL_PTR_LEN);
1799  ut_ad(new_trx_roll->len == DATA_TRX_ID_LEN);
1800  ut_ad(dtuple_get_nth_field(old_pk, new_trx_id_col + 1)
1801  -> len == DATA_ROLL_PTR_LEN);
1802  ut_ad(static_cast<const byte*>(
1803  dtuple_get_nth_field(old_pk, new_trx_id_col + 1)
1804  ->data)
1805  == static_cast<const byte*>(new_trx_roll->data)
1806  + DATA_TRX_ID_LEN);
1807 
1808  if (!memcmp(cur_trx_roll, new_trx_roll->data,
1809  DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) {
1810  /* The old row exists. Remove it. */
1811  goto delete_insert;
1812  }
1813 
1814  /* Unless we called row_log_table_apply_delete_low(),
1815  this will likely cause a duplicate key error. */
1816  mtr_commit(&mtr);
1817  goto insert;
1818  }
1819 
1820  dtuple_t* old_row;
1821  row_ext_t* old_ext;
1822 
1823  if (dict_table_get_next_index(index)) {
1824  /* Construct the row corresponding to the old value of
1825  the record. */
1826  old_row = row_build(
1827  ROW_COPY_DATA, index, btr_pcur_get_rec(&pcur),
1828  cur_offsets, NULL, NULL, NULL, &old_ext, heap);
1829  ut_ad(old_row);
1830 #ifdef ROW_LOG_APPLY_PRINT
1831  if (row_log_apply_print) {
1832  fprintf(stderr, "table apply update "
1833  IB_ID_FMT " " IB_ID_FMT "\n",
1834  index->table->id, index->id);
1835  dtuple_print(stderr, old_row);
1836  dtuple_print(stderr, row);
1837  }
1838 #endif /* ROW_LOG_APPLY_PRINT */
1839  } else {
1840  old_row = NULL;
1841  old_ext = NULL;
1842  }
1843 
1844  big_rec_t* big_rec;
1845 
 /* NOTE(review): original lines 1845-1849 (the pessimistic-update
 call assigned to `error`) were lost in extraction; only its trailing
 arguments survive below. */
1850  btr_pcur_get_btr_cur(&pcur),
1851  &cur_offsets, &offsets_heap, heap, &big_rec,
1852  update, 0, thr, 0, &mtr);
1853 
1854  if (big_rec) {
1855  if (error == DB_SUCCESS) {
 /* NOTE(review): original line 1856 (the call storing the
 big_rec externally) was lost in extraction. */
1857  index, btr_pcur_get_block(&pcur),
1858  btr_pcur_get_rec(&pcur), cur_offsets,
1859  big_rec, &mtr, BTR_STORE_UPDATE);
1860  }
1861 
1862  dtuple_big_rec_free(big_rec);
1863  }
1864 
 /* Propagate the update to each affected secondary index:
 delete the old entry, then insert the new one. */
1865  while ((index = dict_table_get_next_index(index)) != NULL) {
1866  if (error != DB_SUCCESS) {
1867  break;
1868  }
1869 
1870  if (index->type & DICT_FTS) {
1871  continue;
1872  }
1873 
1874  if (!row_upd_changes_ord_field_binary(
1875  index, update, thr, old_row, NULL)) {
1876  continue;
1877  }
1878 
1879  mtr_commit(&mtr);
1880 
1881  entry = row_build_index_entry(old_row, old_ext, index, heap);
1882  if (!entry) {
1883  ut_ad(0);
1884  return(DB_CORRUPTION);
1885  }
1886 
1887  mtr_start(&mtr);
1888 
 /* NOTE(review): original line 1889 (the search/positioning call
 whose failure is tested here) was lost in extraction. */
1890  index, entry, BTR_MODIFY_TREE, &pcur, &mtr)) {
1891  ut_ad(0);
1892  error = DB_CORRUPTION;
1893  break;
1894  }
1895 
 /* NOTE(review): original line 1896 (the delete call) was lost
 in extraction; only its arguments survive below. */
1897  &error, FALSE, btr_pcur_get_btr_cur(&pcur),
1898  BTR_CREATE_FLAG, RB_NONE, &mtr);
1899 
1900  if (error != DB_SUCCESS) {
1901  break;
1902  }
1903 
1904  mtr_commit(&mtr);
1905 
1906  entry = row_build_index_entry(row, NULL, index, heap);
 /* NOTE(review): original lines 1907-1909 (the secondary-index
 insert call assigned to `error`) were lost in extraction. */
1910  BTR_MODIFY_TREE, index, offsets_heap, heap,
1911  entry, trx_id, thr);
1912 
1913  mtr_start(&mtr);
1914  }
1915 
1916 func_exit:
1917  mtr_commit(&mtr);
1918  if (error != DB_SUCCESS) {
1919  goto err_exit;
1920  }
1921 
1922  return(error);
1923 }
1924 
1925 /******************************************************/
/* NOTE(review): Doxygen-HTML extraction; embedded numbers are original
source lines.  Gaps (1926-1928, parameter annotations, 2103, 2243, ...)
mark lines lost in extraction.  Code unchanged; comments only. */
/* Purpose: parses and applies ONE log record from the table-rebuild log
buffer [mrec, mrec_end).  Dispatches on the one-byte operation code
(ROW_T_INSERT / ROW_T_DELETE / ROW_T_UPDATE), decodes the variable-size
header and payload for that operation, advances log->head.total, and
calls the matching row_log_table_apply_* function.  Returns a pointer
past the consumed record, or NULL when the record is truncated (the
caller will retry with a reassembled buffer) or corrupted (*error is
then set to DB_CORRUPTION by the default case). */
1929 static __attribute__((nonnull, warn_unused_result))
1930 const mrec_t*
1931 row_log_table_apply_op(
1932 /*===================*/
1933  que_thr_t* thr,
1934  ulint trx_id_col,
1936  ulint new_trx_id_col,
1938  row_merge_dup_t* dup,
1940  dberr_t* error,
1942  mem_heap_t* offsets_heap,
1944  mem_heap_t* heap,
1945  const mrec_t* mrec,
1946  const mrec_t* mrec_end,
1947  ulint* offsets)
1949 {
1950  row_log_t* log = dup->index->online_log;
1951  dict_index_t* new_index = dict_table_get_first_index(log->table);
1952  ulint extra_size;
1953  const mrec_t* next_mrec;
1954  dtuple_t* old_pk;
1955  row_ext_t* ext;
1956  ulint ext_size;
1957 
1958  ut_ad(dict_index_is_clust(dup->index));
1959  ut_ad(dup->index->table != log->table);
1960  ut_ad(log->head.total <= log->tail.total);
1961 
1962  *error = DB_SUCCESS;
1963 
1964  /* 3 = 1 (op type) + 1 (ext_size) + at least 1 byte payload */
1965  if (mrec + 3 >= mrec_end) {
1966  return(NULL);
1967  }
1968 
1969  const mrec_t* const mrec_start = mrec;
1970 
1971  switch (*mrec++) {
1972  default:
1973  ut_ad(0);
1974  *error = DB_CORRUPTION;
1975  return(NULL);
1976  case ROW_T_INSERT:
 /* Format: extra_size (1-2 bytes) followed by a temp-format
 record of all fields of dup->index. */
1977  extra_size = *mrec++;
1978 
1979  if (extra_size >= 0x80) {
1980  /* Read another byte of extra_size. */
1981 
1982  extra_size = (extra_size & 0x7f) << 8;
1983  extra_size |= *mrec++;
1984  }
1985 
1986  mrec += extra_size;
1987 
1988  if (mrec > mrec_end) {
1989  return(NULL);
1990  }
1991 
1992  rec_offs_set_n_fields(offsets, dup->index->n_fields);
1993  rec_init_offsets_temp(mrec, dup->index, offsets);
1994 
1995  next_mrec = mrec + rec_offs_data_size(offsets);
1996 
1997  if (next_mrec > mrec_end) {
1998  return(NULL);
1999  } else {
2000  log->head.total += next_mrec - mrec_start;
2001 
2002  ulint len;
2003  const byte* db_trx_id
2004  = rec_get_nth_field(
2005  mrec, offsets, trx_id_col, &len);
2006  ut_ad(len == DATA_TRX_ID_LEN);
2007  *error = row_log_table_apply_insert(
2008  thr, mrec, offsets, offsets_heap,
2009  heap, dup, trx_read_trx_id(db_trx_id));
2010  }
2011  break;
2012 
2013  case ROW_T_DELETE:
 /* Format: extra_size (1 byte), ext_size (2 bytes), then the
 PRIMARY KEY prefix record, then ext_size bytes of serialized
 row_ext_t describing externally stored column prefixes. */
2014  /* 1 (extra_size) + 2 (ext_size) + at least 1 (payload) */
2015  if (mrec + 4 >= mrec_end) {
2016  return(NULL);
2017  }
2018 
2019  extra_size = *mrec++;
2020  ext_size = mach_read_from_2(mrec);
2021  mrec += 2;
2022  ut_ad(mrec < mrec_end);
2023 
2024  /* We assume extra_size < 0x100 for the PRIMARY KEY prefix.
2025  For fixed-length PRIMARY key columns, it is 0. */
2026  mrec += extra_size;
2027 
2028  rec_offs_set_n_fields(offsets, new_index->n_uniq + 1);
2029  rec_init_offsets_temp(mrec, new_index, offsets);
2030  next_mrec = mrec + rec_offs_data_size(offsets) + ext_size;
2031  if (next_mrec > mrec_end) {
2032  return(NULL);
2033  }
2034 
2035  log->head.total += next_mrec - mrec_start;
2036 
2037  /* If there are external fields, retrieve those logged
2038  prefix info and reconstruct the row_ext_t */
2039  if (ext_size) {
2040  /* We use memcpy to avoid unaligned
2041  access on some non-x86 platforms.*/
2042  ext = static_cast<row_ext_t*>(
2043  mem_heap_dup(heap,
2044  mrec + rec_offs_data_size(offsets),
2045  ext_size));
2046 
 /* Fix up the internal pointers of the copied
 row_ext_t, which point into the copy itself. */
2047  byte* ext_start = reinterpret_cast<byte*>(ext);
2048 
2049  ulint ext_len = sizeof(*ext)
2050  + (ext->n_ext - 1) * sizeof ext->len;
2051 
2052  ext->ext = reinterpret_cast<ulint*>(ext_start + ext_len);
2053  ext_len += ext->n_ext * sizeof(*ext->ext);
2054 
2055  ext->buf = static_cast<byte*>(ext_start + ext_len);
2056  } else {
2057  ext = NULL;
2058  }
2059 
2060  *error = row_log_table_apply_delete(
2061  thr, new_trx_id_col,
2062  mrec, offsets, offsets_heap, heap,
2063  log->table, ext);
2064  break;
2065 
2066  case ROW_T_UPDATE:
2067  /* Logically, the log entry consists of the
2068  (PRIMARY KEY,DB_TRX_ID) of the old value (converted
2069  to the new primary key definition) followed by
2070  the new value in the old table definition. If the
2071  definition of the columns belonging to PRIMARY KEY
2072  is not changed, the log will only contain
2073  DB_TRX_ID,new_row. */
2074 
2075  if (dup->index->online_log->same_pk) {
2076  ut_ad(new_index->n_uniq == dup->index->n_uniq);
2077 
2078  extra_size = *mrec++;
2079 
2080  if (extra_size >= 0x80) {
2081  /* Read another byte of extra_size. */
2082 
2083  extra_size = (extra_size & 0x7f) << 8;
2084  extra_size |= *mrec++;
2085  }
2086 
2087  mrec += extra_size;
2088 
2089  if (mrec > mrec_end) {
2090  return(NULL);
2091  }
2092 
2093  rec_offs_set_n_fields(offsets, dup->index->n_fields);
2094  rec_init_offsets_temp(mrec, dup->index, offsets);
2095 
2096  next_mrec = mrec + rec_offs_data_size(offsets);
2097 
2098  if (next_mrec > mrec_end) {
2099  return(NULL);
2100  }
2101 
2102  old_pk = dtuple_create(heap, new_index->n_uniq);
 /* NOTE(review): original line 2103 (presumably the
 dict_index_copy_types call) was lost in extraction. */
2104  old_pk, new_index, old_pk->n_fields);
2105 
2106  /* Copy the PRIMARY KEY fields from mrec to old_pk. */
2107  for (ulint i = 0; i < new_index->n_uniq; i++) {
2108  const void* field;
2109  ulint len;
2110  dfield_t* dfield;
2111 
2112  ut_ad(!rec_offs_nth_extern(offsets, i));
2113 
2114  field = rec_get_nth_field(
2115  mrec, offsets, i, &len);
2116  ut_ad(len != UNIV_SQL_NULL);
2117 
2118  dfield = dtuple_get_nth_field(old_pk, i);
2119  dfield_set_data(dfield, field, len);
2120  }
2121  } else {
2122  /* We assume extra_size < 0x100
2123  for the PRIMARY KEY prefix. */
2124  mrec += *mrec + 1;
2125 
2126  if (mrec > mrec_end) {
2127  return(NULL);
2128  }
2129 
2130  /* Get offsets for PRIMARY KEY,
2131  DB_TRX_ID, DB_ROLL_PTR. */
2132  rec_offs_set_n_fields(offsets, new_index->n_uniq + 2);
2133  rec_init_offsets_temp(mrec, new_index, offsets);
2134 
2135  next_mrec = mrec + rec_offs_data_size(offsets);
2136  if (next_mrec + 2 > mrec_end) {
2137  return(NULL);
2138  }
2139 
2140  /* Copy the PRIMARY KEY fields and
2141  DB_TRX_ID, DB_ROLL_PTR from mrec to old_pk. */
2142  old_pk = dtuple_create(heap, new_index->n_uniq + 2);
2143  dict_index_copy_types(old_pk, new_index,
2144  old_pk->n_fields);
2145 
2146  for (ulint i = 0;
2147  i < dict_index_get_n_unique(new_index) + 2;
2148  i++) {
2149  const void* field;
2150  ulint len;
2151  dfield_t* dfield;
2152 
2153  ut_ad(!rec_offs_nth_extern(offsets, i));
2154 
2155  field = rec_get_nth_field(
2156  mrec, offsets, i, &len);
2157  ut_ad(len != UNIV_SQL_NULL);
2158 
2159  dfield = dtuple_get_nth_field(old_pk, i);
2160  dfield_set_data(dfield, field, len);
2161  }
2162 
2163  mrec = next_mrec;
2164 
2165  /* Fetch the new value of the row as it was
2166  in the old table definition. */
2167  extra_size = *mrec++;
2168 
2169  if (extra_size >= 0x80) {
2170  /* Read another byte of extra_size. */
2171 
2172  extra_size = (extra_size & 0x7f) << 8;
2173  extra_size |= *mrec++;
2174  }
2175 
2176  mrec += extra_size;
2177 
2178  if (mrec > mrec_end) {
2179  return(NULL);
2180  }
2181 
2182  rec_offs_set_n_fields(offsets, dup->index->n_fields);
2183  rec_init_offsets_temp(mrec, dup->index, offsets);
2184 
2185  next_mrec = mrec + rec_offs_data_size(offsets);
2186 
2187  if (next_mrec > mrec_end) {
2188  return(NULL);
2189  }
2190  }
2191 
2192  ut_ad(next_mrec <= mrec_end);
2193  log->head.total += next_mrec - mrec_start;
2194  dtuple_set_n_fields_cmp(old_pk, new_index->n_uniq);
2195 
2196  {
2197  ulint len;
2198  const byte* db_trx_id
2199  = rec_get_nth_field(
2200  mrec, offsets, trx_id_col, &len);
2201  ut_ad(len == DATA_TRX_ID_LEN);
2202  *error = row_log_table_apply_update(
2203  thr, trx_id_col, new_trx_id_col,
2204  mrec, offsets, offsets_heap,
2205  heap, dup, trx_read_trx_id(db_trx_id), old_pk);
2206  }
2207 
2208  break;
2209  }
2210 
2211  ut_ad(log->head.total <= log->tail.total);
2212  mem_heap_empty(offsets_heap);
2213  mem_heap_empty(heap);
2214  return(next_mrec);
2215 }
2216 
2217 /******************************************************/
2220 static __attribute__((nonnull, warn_unused_result))
2221 dberr_t
2222 row_log_table_apply_ops(
2223 /*====================*/
2224  que_thr_t* thr,
2225  row_merge_dup_t*dup)
2227 {
2228  dberr_t error;
2229  const mrec_t* mrec = NULL;
2230  const mrec_t* next_mrec;
2231  const mrec_t* mrec_end = NULL; /* silence bogus warning */
2232  const mrec_t* next_mrec_end;
2233  mem_heap_t* heap;
2235  ulint* offsets;
2236  bool has_index_lock;
2237  dict_index_t* index = const_cast<dict_index_t*>(
2238  dup->index);
2239  dict_table_t* new_table = index->online_log->table;
2240  dict_index_t* new_index = dict_table_get_first_index(
2241  new_table);
2242  const ulint i = 1 + REC_OFFS_HEADER_SIZE
2244  dict_index_get_n_unique(new_index) + 2);
2245  const ulint trx_id_col = dict_col_get_clust_pos(
2246  dict_table_get_sys_col(index->table, DATA_TRX_ID), index);
2247  const ulint new_trx_id_col = dict_col_get_clust_pos(
2248  dict_table_get_sys_col(new_table, DATA_TRX_ID), new_index);
2249  trx_t* trx = thr_get_trx(thr);
2250 
2251  ut_ad(dict_index_is_clust(index));
2253  ut_ad(trx->mysql_thd);
2254 #ifdef UNIV_SYNC_DEBUG
2255  ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
2256 #endif /* UNIV_SYNC_DEBUG */
2257  ut_ad(!dict_index_is_online_ddl(new_index));
2258  ut_ad(trx_id_col > 0);
2259  ut_ad(trx_id_col != ULINT_UNDEFINED);
2260  ut_ad(new_trx_id_col > 0);
2261  ut_ad(new_trx_id_col != ULINT_UNDEFINED);
2262 
2263  UNIV_MEM_INVALID(&mrec_end, sizeof mrec_end);
2264 
2265  offsets = static_cast<ulint*>(ut_malloc(i * sizeof *offsets));
2266  offsets[0] = i;
2267  offsets[1] = dict_index_get_n_fields(index);
2268 
2269  heap = mem_heap_create(UNIV_PAGE_SIZE);
2270  offsets_heap = mem_heap_create(UNIV_PAGE_SIZE);
2271  has_index_lock = true;
2272 
2273 next_block:
2274  ut_ad(has_index_lock);
2275 #ifdef UNIV_SYNC_DEBUG
2276  ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
2277 #endif /* UNIV_SYNC_DEBUG */
2278  ut_ad(index->online_log->head.bytes == 0);
2279 
2280  if (trx_is_interrupted(trx)) {
2281  goto interrupted;
2282  }
2283 
2284  if (dict_index_is_corrupted(index)) {
2285  error = DB_INDEX_CORRUPT;
2286  goto func_exit;
2287  }
2288 
2290 
2291  error = index->online_log->error;
2292 
2293  if (error != DB_SUCCESS) {
2294  goto func_exit;
2295  }
2296 
2297  if (UNIV_UNLIKELY(index->online_log->head.blocks
2298  > index->online_log->tail.blocks)) {
2299 unexpected_eof:
2300  fprintf(stderr, "InnoDB: unexpected end of temporary file"
2301  " for table %s\n", index->table_name);
2302 corruption:
2303  error = DB_CORRUPTION;
2304  goto func_exit;
2305  }
2306 
2307  if (index->online_log->head.blocks
2308  == index->online_log->tail.blocks) {
2309  if (index->online_log->head.blocks) {
2310 #ifdef HAVE_FTRUNCATE
2311  /* Truncate the file in order to save space. */
2312  ftruncate(index->online_log->fd, 0);
2313 #endif /* HAVE_FTRUNCATE */
2314  index->online_log->head.blocks
2315  = index->online_log->tail.blocks = 0;
2316  }
2317 
2318  next_mrec = index->online_log->tail.block;
2319  next_mrec_end = next_mrec + index->online_log->tail.bytes;
2320 
2321  if (next_mrec_end == next_mrec) {
2322  /* End of log reached. */
2323 all_done:
2324  ut_ad(has_index_lock);
2325  ut_ad(index->online_log->head.blocks == 0);
2326  ut_ad(index->online_log->tail.blocks == 0);
2327  index->online_log->head.bytes = 0;
2328  index->online_log->tail.bytes = 0;
2329  error = DB_SUCCESS;
2330  goto func_exit;
2331  }
2332  } else {
2333  os_offset_t ofs;
2334  ibool success;
2335 
2336  ofs = (os_offset_t) index->online_log->head.blocks
2338 
2339  ut_ad(has_index_lock);
2340  has_index_lock = false;
2341  rw_lock_x_unlock(dict_index_get_lock(index));
2342 
2343  log_free_check();
2344 
2346 
2347  success = os_file_read_no_error_handling(
2348  OS_FILE_FROM_FD(index->online_log->fd),
2349  index->online_log->head.block, ofs,
2351 
2352  if (!success) {
2353  fprintf(stderr, "InnoDB: unable to read temporary file"
2354  " for table %s\n", index->table_name);
2355  goto corruption;
2356  }
2357 
2358 #ifdef POSIX_FADV_DONTNEED
2359  /* Each block is read exactly once. Free up the file cache. */
2360  posix_fadvise(index->online_log->fd,
2361  ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED);
2362 #endif /* POSIX_FADV_DONTNEED */
2363 #if 0 //def FALLOC_FL_PUNCH_HOLE
2364  /* Try to deallocate the space for the file on disk.
2365  This should work on ext4 on Linux 2.6.39 and later,
2366  and be ignored when the operation is unsupported. */
2367  fallocate(index->online_log->fd,
2368  FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
2369  ofs, srv_buf_size);
2370 #endif /* FALLOC_FL_PUNCH_HOLE */
2371 
2372  next_mrec = index->online_log->head.block;
2373  next_mrec_end = next_mrec + srv_sort_buf_size;
2374  }
2375 
2376  /* This read is not protected by index->online_log->mutex for
2377  performance reasons. We will eventually notice any error that
2378  was flagged by a DML thread. */
2379  error = index->online_log->error;
2380 
2381  if (error != DB_SUCCESS) {
2382  goto func_exit;
2383  }
2384 
2385  if (mrec) {
2386  /* A partial record was read from the previous block.
2387  Copy the temporary buffer full, as we do not know the
2388  length of the record. Parse subsequent records from
2389  the bigger buffer index->online_log->head.block
2390  or index->online_log->tail.block. */
2391 
2392  ut_ad(mrec == index->online_log->head.buf);
2393  ut_ad(mrec_end > mrec);
2394  ut_ad(mrec_end < (&index->online_log->head.buf)[1]);
2395 
2396  memcpy((mrec_t*) mrec_end, next_mrec,
2397  (&index->online_log->head.buf)[1] - mrec_end);
2398  mrec = row_log_table_apply_op(
2399  thr, trx_id_col, new_trx_id_col,
2400  dup, &error, offsets_heap, heap,
2401  index->online_log->head.buf,
2402  (&index->online_log->head.buf)[1], offsets);
2403  if (error != DB_SUCCESS) {
2404  goto func_exit;
2405  } else if (UNIV_UNLIKELY(mrec == NULL)) {
2406  /* The record was not reassembled properly. */
2407  goto corruption;
2408  }
2409  /* The record was previously found out to be
2410  truncated. Now that the parse buffer was extended,
2411  it should proceed beyond the old end of the buffer. */
2412  ut_a(mrec > mrec_end);
2413 
2414  index->online_log->head.bytes = mrec - mrec_end;
2415  next_mrec += index->online_log->head.bytes;
2416  }
2417 
2418  ut_ad(next_mrec <= next_mrec_end);
2419  /* The following loop must not be parsing the temporary
2420  buffer, but head.block or tail.block. */
2421 
2422  /* mrec!=NULL means that the next record starts from the
2423  middle of the block */
2424  ut_ad((mrec == NULL) == (index->online_log->head.bytes == 0));
2425 
2426 #ifdef UNIV_DEBUG
2427  if (next_mrec_end == index->online_log->head.block
2428  + srv_sort_buf_size) {
2429  /* If tail.bytes == 0, next_mrec_end can also be at
2430  the end of tail.block. */
2431  if (index->online_log->tail.bytes == 0) {
2432  ut_ad(next_mrec == next_mrec_end);
2433  ut_ad(index->online_log->tail.blocks == 0);
2434  ut_ad(index->online_log->head.blocks == 0);
2435  ut_ad(index->online_log->head.bytes == 0);
2436  } else {
2437  ut_ad(next_mrec == index->online_log->head.block
2438  + index->online_log->head.bytes);
2439  ut_ad(index->online_log->tail.blocks
2440  > index->online_log->head.blocks);
2441  }
2442  } else if (next_mrec_end == index->online_log->tail.block
2443  + index->online_log->tail.bytes) {
2444  ut_ad(next_mrec == index->online_log->tail.block
2445  + index->online_log->head.bytes);
2446  ut_ad(index->online_log->tail.blocks == 0);
2447  ut_ad(index->online_log->head.blocks == 0);
2448  ut_ad(index->online_log->head.bytes
2449  <= index->online_log->tail.bytes);
2450  } else {
2451  ut_error;
2452  }
2453 #endif /* UNIV_DEBUG */
2454 
2455  mrec_end = next_mrec_end;
2456 
2457  while (!trx_is_interrupted(trx)) {
2458  mrec = next_mrec;
2459  ut_ad(mrec < mrec_end);
2460 
2461  if (!has_index_lock) {
2462  /* We are applying operations from a different
2463  block than the one that is being written to.
2464  We do not hold index->lock in order to
2465  allow other threads to concurrently buffer
2466  modifications. */
2467  ut_ad(mrec >= index->online_log->head.block);
2468  ut_ad(mrec_end == index->online_log->head.block
2469  + srv_sort_buf_size);
2470  ut_ad(index->online_log->head.bytes
2471  < srv_sort_buf_size);
2472 
2473  /* Take the opportunity to do a redo log
2474  checkpoint if needed. */
2475  log_free_check();
2476  } else {
2477  /* We are applying operations from the last block.
2478  Do not allow other threads to buffer anything,
2479  so that we can finally catch up and synchronize. */
2480  ut_ad(index->online_log->head.blocks == 0);
2481  ut_ad(index->online_log->tail.blocks == 0);
2482  ut_ad(mrec_end == index->online_log->tail.block
2483  + index->online_log->tail.bytes);
2484  ut_ad(mrec >= index->online_log->tail.block);
2485  }
2486 
2487  /* This read is not protected by index->online_log->mutex
2488  for performance reasons. We will eventually notice any
2489  error that was flagged by a DML thread. */
2490  error = index->online_log->error;
2491 
2492  if (error != DB_SUCCESS) {
2493  goto func_exit;
2494  }
2495 
2496  next_mrec = row_log_table_apply_op(
2497  thr, trx_id_col, new_trx_id_col,
2498  dup, &error, offsets_heap, heap,
2499  mrec, mrec_end, offsets);
2500 
2501  if (error != DB_SUCCESS) {
2502  goto func_exit;
2503  } else if (next_mrec == next_mrec_end) {
2504  /* The record happened to end on a block boundary.
2505  Do we have more blocks left? */
2506  if (has_index_lock) {
2507  /* The index will be locked while
2508  applying the last block. */
2509  goto all_done;
2510  }
2511 
2512  mrec = NULL;
2513 process_next_block:
2514  rw_lock_x_lock(dict_index_get_lock(index));
2515  has_index_lock = true;
2516 
2517  index->online_log->head.bytes = 0;
2518  index->online_log->head.blocks++;
2519  goto next_block;
2520  } else if (next_mrec != NULL) {
2521  ut_ad(next_mrec < next_mrec_end);
2522  index->online_log->head.bytes += next_mrec - mrec;
2523  } else if (has_index_lock) {
2524  /* When mrec is within tail.block, it should
2525  be a complete record, because we are holding
2526  index->lock and thus excluding the writer. */
2527  ut_ad(index->online_log->tail.blocks == 0);
2528  ut_ad(mrec_end == index->online_log->tail.block
2529  + index->online_log->tail.bytes);
2530  ut_ad(0);
2531  goto unexpected_eof;
2532  } else {
2533  memcpy(index->online_log->head.buf, mrec,
2534  mrec_end - mrec);
2535  mrec_end += index->online_log->head.buf - mrec;
2536  mrec = index->online_log->head.buf;
2537  goto process_next_block;
2538  }
2539  }
2540 
2541 interrupted:
2542  error = DB_INTERRUPTED;
2543 func_exit:
2544  if (!has_index_lock) {
2545  rw_lock_x_lock(dict_index_get_lock(index));
2546  }
2547 
2548  mem_heap_free(offsets_heap);
2549  mem_heap_free(heap);
2550  ut_free(offsets);
2551  return(error);
2552 }
2553 
2554 /******************************************************/
/* Apply the table-rebuild modification log to the rebuilt table.
Takes the clustered index lock of the old table in exclusive mode,
delegates the actual replay to row_log_table_apply_ops(), and returns
its status.  DB_ERROR is returned (with a debug assertion) if no
online log is attached to the clustered index.
NOTE(review): the doc extractor dropped line 2559 -- the function-name
line, presumably "row_log_table_apply(" -- and the per-parameter
comment lines 2563/2565; confirm against the original row0log.cc. */
2557 UNIV_INTERN
2558 dberr_t
2560 /*================*/
2561  que_thr_t* thr,
2562  dict_table_t* old_table,
2564  struct TABLE* table)
2566 {
2567  dberr_t error;
2568  dict_index_t* clust_index;
2569 
 /* Reset the duplicate-key reporting slot for this transaction. */
2570  thr_get_trx(thr)->error_key_num = 0;
2571 
2572 #ifdef UNIV_SYNC_DEBUG
2573  ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED));
2574 #endif /* UNIV_SYNC_DEBUG */
2575  clust_index = dict_table_get_first_index(old_table);
2576 
2577  rw_lock_x_lock(dict_index_get_lock(clust_index));
2578 
2579  if (!clust_index->online_log) {
2580  ut_ad(dict_index_get_online_status(clust_index)
 /* NOTE(review): line 2581, the right-hand operand of this
 assertion (presumably "== ONLINE_INDEX_CREATION);"), was
 dropped by the doc extractor. */
2582  /* This function should not be called unless
2583  rebuilding a table online. Build in some fault
2584  tolerance. */
2585  ut_ad(0);
2586  error = DB_ERROR;
2587  } else {
2588  row_merge_dup_t dup = {
2589  clust_index, table,
2590  clust_index->online_log->col_map, 0
2591  };
2592 
2593  error = row_log_table_apply_ops(thr, &dup);
2594 
 /* On success the whole log must have been consumed:
 the head (read) and tail (write) byte totals agree. */
2595  ut_ad(error != DB_SUCCESS
2596  || clust_index->online_log->head.total
2597  == clust_index->online_log->tail.total);
2598  }
2599 
2600  rw_lock_x_unlock(dict_index_get_lock(clust_index));
2601  return(error);
2602 }
2603 
2604 /******************************************************/
/* Allocate the online-DDL modification log for an index.
Reserves one large memory area holding two sort-buffer-sized blocks
(head for reading, tail for buffering writes) plus the row_log_t
descriptor itself, creates the backing temporary merge file and the
log mutex, and attaches the log to index->online_log.
Returns true on success; false if the large allocation or the
temporary file creation fails (all partial resources are released).
NOTE(review): the doc extractor dropped line 2610 -- the function-name
line, presumably "row_log_allocate(" -- as well as statement lines
2628 and 2665 (the latter is likely the call that switches the index
online status to creation mode); confirm against row0log.cc. */
2608 UNIV_INTERN
2609 bool
2611 /*=============*/
2612  dict_index_t* index,
2613  dict_table_t* table,
2615  bool same_pk,
2617  const dtuple_t* add_cols,
2620  const ulint* col_map)
2622 {
2623  byte* buf;
2624  row_log_t* log;
2625  ulint size;
2626  DBUG_ENTER("row_log_allocate");
2627 
 /* table is non-NULL exactly when rebuilding via the clustered
 index; col_map is required whenever table or add_cols is given. */
2629  ut_ad(dict_index_is_clust(index) == !!table);
2630  ut_ad(!table || index->table != table);
2631  ut_ad(same_pk || table);
2632  ut_ad(!table || col_map);
2633  ut_ad(!add_cols || col_map);
2634 #ifdef UNIV_SYNC_DEBUG
2635  ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
2636 #endif /* UNIV_SYNC_DEBUG */
 /* Two I/O blocks followed by the descriptor, in one allocation. */
2637  size = 2 * srv_sort_buf_size + sizeof *log;
2638  buf = (byte*) os_mem_alloc_large(&size);
2639  if (!buf) {
2640  DBUG_RETURN(false);
2641  }
2642 
 /* The row_log_t lives at the end of the large allocation. */
2643  log = (row_log_t*) &buf[2 * srv_sort_buf_size];
2644  log->size = size;
2645  log->fd = row_merge_file_create_low();
2646  if (log->fd < 0) {
2647  os_mem_free_large(buf, size);
2648  DBUG_RETURN(false);
2649  }
2650  mutex_create(index_online_log_key, &log->mutex,
2651  SYNC_INDEX_ONLINE_LOG);
2652  log->blobs = NULL;
2653  log->table = table;
2654  log->same_pk = same_pk;
2655  log->add_cols = add_cols;
2656  log->col_map = col_map;
2657  log->error = DB_SUCCESS;
2658  log->max_trx = 0;
2659  log->head.block = buf;
2660  log->tail.block = buf + srv_sort_buf_size;
2661  log->tail.blocks = log->tail.bytes = 0;
2662  log->tail.total = 0;
2663  log->head.blocks = log->head.bytes = 0;
2664  log->head.total = 0;
2666  index->online_log = log;
2667 
2668  /* While we might be holding an exclusive data dictionary lock
2669  here, in row_log_abort_sec() we will not always be holding it. Use
2670  atomic operations in both cases. */
2671  MONITOR_ATOMIC_INC(MONITOR_ONLINE_CREATE_INDEX);
2672 
2673  DBUG_RETURN(true);
2674 }
2675 
2676 /******************************************************/
/* Free the online-DDL modification log for an index, releasing every
resource acquired in row_log_allocate(): the BLOB tracking map, the
temporary merge file, the log mutex, and the single large allocation
(head.block is its base) holding both blocks and the row_log_t.
The caller's pointer is reset to 0.
FIX(review): as extracted, the temporary file descriptor log->fd
(created via row_merge_file_create_low() in row_log_allocate()) was
never closed -- a file-descriptor leak; the destroy call below
(original line 2687, dropped by the doc extractor) restores it. */
2678 UNIV_INTERN
2679 void
2680 row_log_free(
2681 /*=========*/
2682  row_log_t*& log)
2683 {
2684  MONITOR_ATOMIC_DEC(MONITOR_ONLINE_CREATE_INDEX);
2685 
2686  delete log->blobs;
2687  row_merge_file_destroy_low(log->fd);
2688  mutex_free(&log->mutex);
2689  os_mem_free_large(log->head.block, log->size);
2690  log = 0;
2691 }
2692 
2693 /******************************************************/
/* Return the biggest transaction id that has modified the index while
online index creation was in progress (index->online_log->max_trx).
The debug assertion requires either (index lock shared + log mutex)
or the index lock in exclusive mode, so the read is race-free.
NOTE(review): the doc extractor dropped line 2699 -- the
function-name line, presumably "row_log_get_max_trx(" -- and
assertion line 2703; confirm against row0log.cc. */
2697 UNIV_INTERN
2698 trx_id_t
2700 /*================*/
2701  dict_index_t* index)
2702 {
2704 #ifdef UNIV_SYNC_DEBUG
2705  ut_ad((rw_lock_own(dict_index_get_lock(index), RW_LOCK_SHARED)
2706  && mutex_own(&index->online_log->mutex))
2707  || rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
2708 #endif /* UNIV_SYNC_DEBUG */
2709  return(index->online_log->max_trx);
2710 }
2711 
2712 /******************************************************/
/* Apply a single buffered row-log operation (ROW_OP_INSERT or
ROW_OP_DELETE) to a secondary index being created online.
Positions a B-tree cursor on the entry, then inserts or deletes as
appropriate, retrying optimistic operations pessimistically (after
re-latching the tree) when they fail.  Duplicate keys are reported
through dup; the result status is returned through *error.
When has_index_lock is true the caller holds index->lock exclusively
and the pessimistic (BTR_MODIFY_TREE) variant is used directly.
NOTE(review): the doc extractor dropped several hyperlinked lines in
this body: 2790 and 2871 (the re-search call, presumably
"btr_cur_search_to_nth_level("), 2800 (presumably
"ut_ad(!page_rec_is_infimum("), 2808 (presumably
"btr_cur_pessimistic_delete("), and 2855-2856 / 2884-2885 (the
BTR_NO_UNDO_LOG/BTR_NO_LOCKING flag lines of the insert calls).
Confirm each against the original row0log.cc before editing. */
2714 static __attribute__((nonnull))
2715 void
2716 row_log_apply_op_low(
2717 /*=================*/
2718  dict_index_t* index,
2719  row_merge_dup_t*dup,
2721  dberr_t* error,
2722  mem_heap_t* offsets_heap,
2724  bool has_index_lock,
2726  enum row_op op,
2727  trx_id_t trx_id,
2728  const dtuple_t* entry)
2729 {
2730  mtr_t mtr;
2731  btr_cur_t cursor;
2732  ulint* offsets = NULL;
2733 
2734  ut_ad(!dict_index_is_clust(index));
2735 #ifdef UNIV_SYNC_DEBUG
2736  ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)
2737  == has_index_lock);
2738 #endif /* UNIV_SYNC_DEBUG */
2739  ut_ad(!dict_index_is_corrupted(index));
 /* Only deletes may carry a zero transaction id. */
2740  ut_ad(trx_id != 0 || op == ROW_OP_DELETE);
2741 
2742  mtr_start(&mtr);
2743 
2744  /* We perform the pessimistic variant of the operations if we
2745  already hold index->lock exclusively. First, search the
2746  record. The operation may already have been performed,
2747  depending on when the row in the clustered index was
2748  scanned. */
2749  btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
2750  has_index_lock
2751  ? BTR_MODIFY_TREE
2752  : BTR_MODIFY_LEAF,
2753  &cursor, 0, __FILE__, __LINE__,
2754  &mtr);
2755 
2756  ut_ad(dict_index_get_n_unique(index) > 0);
2757  /* This test is somewhat similar to row_ins_must_modify_rec(),
2758  but not identical for unique secondary indexes. */
2759  if (cursor.low_match >= dict_index_get_n_unique(index)
2760  && !page_rec_is_infimum(btr_cur_get_rec(&cursor))) {
2761  /* We have a matching record. */
 /* exists == full-field match; a unique-prefix-only match
 means a duplicate candidate, not the same record. */
2762  bool exists = (cursor.low_match
2763  == dict_index_get_n_fields(index));
2764 #ifdef UNIV_DEBUG
2765  rec_t* rec = btr_cur_get_rec(&cursor);
 /* NOTE(review): debug-only lines 2766-2767 (offsets setup
 for rec) were dropped by the doc extractor. */
2768 #endif /* UNIV_DEBUG */
2769 
2770  ut_ad(exists || dict_index_is_unique(index));
2771 
2772  switch (op) {
2773  case ROW_OP_DELETE:
2774  if (!exists) {
2775  /* The record was already deleted. */
2776  goto func_exit;
2777  }
2778 
2779  if (btr_cur_optimistic_delete(
2780  &cursor, BTR_CREATE_FLAG, &mtr)) {
2781  *error = DB_SUCCESS;
2782  break;
2783  }
2784 
2785  if (!has_index_lock) {
2786  /* This needs a pessimistic operation.
2787  Lock the index tree exclusively. */
2788  mtr_commit(&mtr);
2789  mtr_start(&mtr);
 /* NOTE(review): dropped line 2790 -- the re-search
 call name, presumably btr_cur_search_to_nth_level(. */
2791  index, 0, entry, PAGE_CUR_LE,
2792  BTR_MODIFY_TREE, &cursor, 0,
2793  __FILE__, __LINE__, &mtr);
2794 
2795  /* No other thread than the current one
2796  is allowed to modify the index tree.
2797  Thus, the record should still exist. */
2798  ut_ad(cursor.low_match
2799  >= dict_index_get_n_fields(index));
 /* NOTE(review): dropped line 2800, presumably
 "ut_ad(!page_rec_is_infimum(". */
2801  btr_cur_get_rec(&cursor)));
2802  }
2803 
2804  /* As there are no externally stored fields in
2805  a secondary index record, the parameter
2806  rb_ctx = RB_NONE will be ignored. */
2807 
 /* NOTE(review): dropped line 2808, presumably
 "btr_cur_pessimistic_delete(". */
2809  error, FALSE, &cursor,
2810  BTR_CREATE_FLAG, RB_NONE, &mtr);
2811  break;
2812  case ROW_OP_INSERT:
2813  if (exists) {
2814  /* The record already exists. There
2815  is nothing to be inserted. */
2816  goto func_exit;
2817  }
2818 
2819  if (dtuple_contains_null(entry)) {
2820  /* The UNIQUE KEY columns match, but
2821  there is a NULL value in the key, and
2822  NULL!=NULL. */
2823  goto insert_the_rec;
2824  }
2825 
2826  /* Duplicate key error */
2827  ut_ad(dict_index_is_unique(index));
2828  row_merge_dup_report(dup, entry->fields);
2829  goto func_exit;
2830  }
2831  } else {
2832  switch (op) {
2833  rec_t* rec;
2834  big_rec_t* big_rec;
2835  case ROW_OP_DELETE:
2836  /* The record does not exist. */
2837  goto func_exit;
2838  case ROW_OP_INSERT:
2839  if (dict_index_is_unique(index)
2840  && (cursor.up_match
2841  >= dict_index_get_n_unique(index)
2842  || cursor.low_match
2843  >= dict_index_get_n_unique(index))
2844  && (!index->n_nullable
2845  || !dtuple_contains_null(entry))) {
2846  /* Duplicate key */
2847  row_merge_dup_report(dup, entry->fields);
2848  goto func_exit;
2849  }
2850 insert_the_rec:
2851  /* Insert the record. As we are inserting into
2852  a secondary index, there cannot be externally
2853  stored columns (!big_rec). */
2854  *error = btr_cur_optimistic_insert(
 /* NOTE(review): dropped lines 2855-2856 -- the
 leading flag operands of this call. */
2857  | BTR_CREATE_FLAG,
2858  &cursor, &offsets, &offsets_heap,
2859  const_cast<dtuple_t*>(entry),
2860  &rec, &big_rec, 0, NULL, &mtr);
2861  ut_ad(!big_rec);
2862  if (*error != DB_FAIL) {
2863  break;
2864  }
2865 
2866  if (!has_index_lock) {
2867  /* This needs a pessimistic operation.
2868  Lock the index tree exclusively. */
2869  mtr_commit(&mtr);
2870  mtr_start(&mtr);
 /* NOTE(review): dropped line 2871 -- the re-search
 call name, presumably btr_cur_search_to_nth_level(. */
2872  index, 0, entry, PAGE_CUR_LE,
2873  BTR_MODIFY_TREE, &cursor, 0,
2874  __FILE__, __LINE__, &mtr);
2875  }
2876 
2877  /* We already determined that the
2878  record did not exist. No other thread
2879  than the current one is allowed to
2880  modify the index tree. Thus, the
2881  record should still not exist. */
2882 
2883  *error = btr_cur_pessimistic_insert(
 /* NOTE(review): dropped lines 2884-2885 -- the
 leading flag operands of this call. */
2886  | BTR_CREATE_FLAG,
2887  &cursor, &offsets, &offsets_heap,
2888  const_cast<dtuple_t*>(entry),
2889  &rec, &big_rec,
2890  0, NULL, &mtr);
2891  ut_ad(!big_rec);
2892  break;
2893  }
2894  mem_heap_empty(offsets_heap);
2895  }
2896 
 /* Record the applying transaction on the page for MVCC checks. */
2897  if (*error == DB_SUCCESS && trx_id) {
2898  page_update_max_trx_id(btr_cur_get_block(&cursor),
2899  btr_cur_get_page_zip(&cursor),
2900  trx_id, &mtr);
2901  }
2902 
2903 func_exit:
2904  mtr_commit(&mtr);
2905 }
2906 
2907 /******************************************************/
/* Parse one secondary-index row-log record from [mrec, mrec_end) and
apply it via row_log_apply_op_low().
Record layout: 1 byte op (ROW_OP_INSERT carries a DATA_TRX_ID_LEN
transaction id; ROW_OP_DELETE does not), 1-2 bytes extra_size
(two-byte form when the first byte has the high bit set), then the
extra bytes and the record payload.
Returns the pointer just past the consumed record; NULL with
*error == DB_SUCCESS when the record is truncated (caller refills the
buffer and retries); NULL with *error set on corruption. */
2911 static __attribute__((nonnull, warn_unused_result))
2912 const mrec_t*
2913 row_log_apply_op(
2914 /*=============*/
2915  dict_index_t* index,
2916  row_merge_dup_t*dup,
2918  dberr_t* error,
2919  mem_heap_t* offsets_heap,
2921  mem_heap_t* heap,
2923  bool has_index_lock,
2925  const mrec_t* mrec,
2926  const mrec_t* mrec_end,
2927  ulint* offsets)
2930 {
2931  enum row_op op;
2932  ulint extra_size;
2933  ulint data_size;
2934  ulint n_ext;
2935  dtuple_t* entry;
2936  trx_id_t trx_id;
2937 
2938  /* Online index creation is only used for secondary indexes. */
2939  ut_ad(!dict_index_is_clust(index));
2940 #ifdef UNIV_SYNC_DEBUG
2941  ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)
2942  == has_index_lock);
2943 #endif /* UNIV_SYNC_DEBUG */
2944 
2945  if (dict_index_is_corrupted(index)) {
2946  *error = DB_INDEX_CORRUPT;
2947  return(NULL);
2948  }
2949 
2950  *error = DB_SUCCESS;
2951 
 /* Not even the fixed-size header fits: truncated record. */
2952  if (mrec + ROW_LOG_HEADER_SIZE >= mrec_end) {
2953  return(NULL);
2954  }
2955 
2956  switch (*mrec) {
2957  case ROW_OP_INSERT:
2958  if (ROW_LOG_HEADER_SIZE + DATA_TRX_ID_LEN + mrec >= mrec_end) {
2959  return(NULL);
2960  }
2961 
2962  op = static_cast<enum row_op>(*mrec++);
2963  trx_id = trx_read_trx_id(mrec);
2964  mrec += DATA_TRX_ID_LEN;
2965  break;
2966  case ROW_OP_DELETE:
2967  op = static_cast<enum row_op>(*mrec++);
2968  trx_id = 0;
2969  break;
2970  default:
2971 corrupted:
2972  ut_ad(0);
2973  *error = DB_CORRUPTION;
2974  return(NULL);
2975  }
2976 
2977  extra_size = *mrec++;
2978 
2979  ut_ad(mrec < mrec_end);
2980 
2981  if (extra_size >= 0x80) {
2982  /* Read another byte of extra_size. */
2983 
2984  extra_size = (extra_size & 0x7f) << 8;
2985  extra_size |= *mrec++;
2986  }
2987 
2988  mrec += extra_size;
2989 
2990  if (mrec > mrec_end) {
2991  return(NULL);
2992  }
2993 
2994  rec_init_offsets_temp(mrec, index, offsets);
2995 
2996  if (rec_offs_any_extern(offsets)) {
2997  /* There should never be any externally stored fields
2998  in a secondary index, which is what online index
2999  creation is used for. Therefore, the log file must be
3000  corrupted. */
3001  goto corrupted;
3002  }
3003 
3004  data_size = rec_offs_data_size(offsets);
3005 
3006  mrec += data_size;
3007 
3008  if (mrec > mrec_end) {
3009  return(NULL);
3010  }
3011 
 /* NOTE(review): dropped line 3012 -- the assignment target and
 call name of this statement, presumably
 "entry = row_rec_to_index_entry_low(". */
3013  mrec - data_size, index, offsets, &n_ext, heap);
3014  /* Online index creation is only implemented for secondary
3015  indexes, which never contain off-page columns. */
3016  ut_ad(n_ext == 0);
3017 #ifdef ROW_LOG_APPLY_PRINT
3018  if (row_log_apply_print) {
3019  fprintf(stderr, "apply " IB_ID_FMT " " TRX_ID_FMT " %u %u ",
3020  index->id, trx_id,
3021  unsigned (op), unsigned (has_index_lock));
3022  for (const byte* m = mrec - data_size; m < mrec; m++) {
3023  fprintf(stderr, "%02x", *m);
3024  }
3025  putc('\n', stderr);
3026  }
3027 #endif /* ROW_LOG_APPLY_PRINT */
3028  row_log_apply_op_low(index, dup, error, offsets_heap,
3029  has_index_lock, op, trx_id, entry);
3030  return(mrec);
3031 }
3032 
3033 /******************************************************/
/* Replay all buffered secondary-index row-log operations.
Reads full blocks of the log from the temporary file (releasing
index->lock while doing so, so DML can keep buffering), parses and
applies each record via row_log_apply_op(), stitches records that
straddle a block boundary through the small head.buf staging buffer,
and finally applies the in-memory tail block while holding
index->lock exclusively.  On exit index->lock is held in X mode.
NOTE(review): the doc extractor dropped several hyperlinked lines:
3051 (a declaration, evidently "mem_heap_t* offsets_heap;" -- it is
assigned at line 3070 below), 3128 (the multiplier of ofs, presumably
"* srv_sort_buf_size;"), 3139 (the read-length argument, presumably
"srv_sort_buf_size);"), and 3322 (the size-limit condition of the
DB_INDEX_CORRUPT branch).  Confirm against row0log.cc. */
3036 static __attribute__((nonnull))
3037 dberr_t
3038 row_log_apply_ops(
3039 /*==============*/
3040  trx_t* trx,
3042  dict_index_t* index,
3043  row_merge_dup_t*dup)
3045 {
3046  dberr_t error;
3047  const mrec_t* mrec = NULL;
3048  const mrec_t* next_mrec;
3049  const mrec_t* mrec_end= NULL; /* silence bogus warning */
3050  const mrec_t* next_mrec_end;
3052  mem_heap_t* heap;
3053  ulint* offsets;
3054  bool has_index_lock;
 /* Size of the rec_get_offsets() array for this index. */
3055  const ulint i = 1 + REC_OFFS_HEADER_SIZE
3056  + dict_index_get_n_fields(index);
3057 
3059  ut_ad(*index->name == TEMP_INDEX_PREFIX);
3060 #ifdef UNIV_SYNC_DEBUG
3061  ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
3062 #endif /* UNIV_SYNC_DEBUG */
3063  ut_ad(index->online_log);
3064  UNIV_MEM_INVALID(&mrec_end, sizeof mrec_end);
3065 
3066  offsets = static_cast<ulint*>(ut_malloc(i * sizeof *offsets));
3067  offsets[0] = i;
3068  offsets[1] = dict_index_get_n_fields(index);
3069 
3070  offsets_heap = mem_heap_create(UNIV_PAGE_SIZE);
3071  heap = mem_heap_create(UNIV_PAGE_SIZE);
3072  has_index_lock = true;
3073 
3074 next_block:
3075  ut_ad(has_index_lock);
3076 #ifdef UNIV_SYNC_DEBUG
3077  ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
3078 #endif /* UNIV_SYNC_DEBUG */
3079  ut_ad(index->online_log->head.bytes == 0);
3080 
3081  if (trx_is_interrupted(trx)) {
3082  goto interrupted;
3083  }
3084 
3085  if (dict_index_is_corrupted(index)) {
3086  error = DB_INDEX_CORRUPT;
3087  goto func_exit;
3088  }
3089 
 /* The reader can never be ahead of the writer. */
3090  if (UNIV_UNLIKELY(index->online_log->head.blocks
3091  > index->online_log->tail.blocks)) {
3092 unexpected_eof:
3093  fprintf(stderr, "InnoDB: unexpected end of temporary file"
3094  " for index %s\n", index->name + 1);
3095 corruption:
3096  error = DB_CORRUPTION;
3097  goto func_exit;
3098  }
3099 
3100  if (index->online_log->head.blocks
3101  == index->online_log->tail.blocks) {
 /* Caught up with the writer: only the in-memory tail
 block remains to be applied. */
3102  if (index->online_log->head.blocks) {
3103 #ifdef HAVE_FTRUNCATE
3104  /* Truncate the file in order to save space. */
3105  ftruncate(index->online_log->fd, 0);
3106 #endif /* HAVE_FTRUNCATE */
3107  index->online_log->head.blocks
3108  = index->online_log->tail.blocks = 0;
3109  }
3110 
3111  next_mrec = index->online_log->tail.block;
3112  next_mrec_end = next_mrec + index->online_log->tail.bytes;
3113 
3114  if (next_mrec_end == next_mrec) {
3115  /* End of log reached. */
3116 all_done:
3117  ut_ad(has_index_lock);
3118  ut_ad(index->online_log->head.blocks == 0);
3119  ut_ad(index->online_log->tail.blocks == 0);
3120  error = DB_SUCCESS;
3121  goto func_exit;
3122  }
3123  } else {
3124  os_offset_t ofs;
3125  ibool success;
3126 
3127  ofs = (os_offset_t) index->online_log->head.blocks
 /* NOTE(review): dropped line 3128 -- the multiplier,
 presumably "* srv_sort_buf_size;". */
3129 
 /* Release the index lock while reading from disk so that
 concurrent DML can continue to buffer log records. */
3130  ut_ad(has_index_lock);
3131  has_index_lock = false;
3132  rw_lock_x_unlock(dict_index_get_lock(index));
3133 
3134  log_free_check();
3135 
3136  success = os_file_read_no_error_handling(
3137  OS_FILE_FROM_FD(index->online_log->fd),
3138  index->online_log->head.block, ofs,
 /* NOTE(review): dropped line 3139 -- the read length,
 presumably "srv_sort_buf_size);". */
3140 
3141  if (!success) {
3142  fprintf(stderr, "InnoDB: unable to read temporary file"
3143  " for index %s\n", index->name + 1);
3144  goto corruption;
3145  }
3146 
3147 #ifdef POSIX_FADV_DONTNEED
3148  /* Each block is read exactly once. Free up the file cache. */
3149  posix_fadvise(index->online_log->fd,
3150  ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED);
3151 #endif /* POSIX_FADV_DONTNEED */
3152 #if 0 //def FALLOC_FL_PUNCH_HOLE
3153  /* Try to deallocate the space for the file on disk.
3154  This should work on ext4 on Linux 2.6.39 and later,
3155  and be ignored when the operation is unsupported. */
3156  fallocate(index->online_log->fd,
3157  FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
3158  ofs, srv_buf_size);
3159 #endif /* FALLOC_FL_PUNCH_HOLE */
3160 
3161  next_mrec = index->online_log->head.block;
3162  next_mrec_end = next_mrec + srv_sort_buf_size;
3163  }
3164 
3165  if (mrec) {
3166  /* A partial record was read from the previous block.
3167  Copy the temporary buffer full, as we do not know the
3168  length of the record. Parse subsequent records from
3169  the bigger buffer index->online_log->head.block
3170  or index->online_log->tail.block. */
3171 
3172  ut_ad(mrec == index->online_log->head.buf);
3173  ut_ad(mrec_end > mrec);
3174  ut_ad(mrec_end < (&index->online_log->head.buf)[1]);
3175 
3176  memcpy((mrec_t*) mrec_end, next_mrec,
3177  (&index->online_log->head.buf)[1] - mrec_end);
3178  mrec = row_log_apply_op(
3179  index, dup, &error, offsets_heap, heap,
3180  has_index_lock, index->online_log->head.buf,
3181  (&index->online_log->head.buf)[1], offsets);
3182  if (error != DB_SUCCESS) {
3183  goto func_exit;
3184  } else if (UNIV_UNLIKELY(mrec == NULL)) {
3185  /* The record was not reassembled properly. */
3186  goto corruption;
3187  }
3188  /* The record was previously found out to be
3189  truncated. Now that the parse buffer was extended,
3190  it should proceed beyond the old end of the buffer. */
3191  ut_a(mrec > mrec_end);
3192 
3193  index->online_log->head.bytes = mrec - mrec_end;
3194  next_mrec += index->online_log->head.bytes;
3195  }
3196 
3197  ut_ad(next_mrec <= next_mrec_end);
3198  /* The following loop must not be parsing the temporary
3199  buffer, but head.block or tail.block. */
3200 
3201  /* mrec!=NULL means that the next record starts from the
3202  middle of the block */
3203  ut_ad((mrec == NULL) == (index->online_log->head.bytes == 0));
3204 
3205 #ifdef UNIV_DEBUG
3206  if (next_mrec_end == index->online_log->head.block
3207  + srv_sort_buf_size) {
3208  /* If tail.bytes == 0, next_mrec_end can also be at
3209  the end of tail.block. */
3210  if (index->online_log->tail.bytes == 0) {
3211  ut_ad(next_mrec == next_mrec_end);
3212  ut_ad(index->online_log->tail.blocks == 0);
3213  ut_ad(index->online_log->head.blocks == 0);
3214  ut_ad(index->online_log->head.bytes == 0);
3215  } else {
3216  ut_ad(next_mrec == index->online_log->head.block
3217  + index->online_log->head.bytes);
3218  ut_ad(index->online_log->tail.blocks
3219  > index->online_log->head.blocks);
3220  }
3221  } else if (next_mrec_end == index->online_log->tail.block
3222  + index->online_log->tail.bytes) {
3223  ut_ad(next_mrec == index->online_log->tail.block
3224  + index->online_log->head.bytes);
3225  ut_ad(index->online_log->tail.blocks == 0);
3226  ut_ad(index->online_log->head.blocks == 0);
3227  ut_ad(index->online_log->head.bytes
3228  <= index->online_log->tail.bytes);
3229  } else {
3230  ut_error;
3231  }
3232 #endif /* UNIV_DEBUG */
3233 
3234  mrec_end = next_mrec_end;
3235 
3236  while (!trx_is_interrupted(trx)) {
3237  mrec = next_mrec;
3238  ut_ad(mrec < mrec_end);
3239 
3240  if (!has_index_lock) {
3241  /* We are applying operations from a different
3242  block than the one that is being written to.
3243  We do not hold index->lock in order to
3244  allow other threads to concurrently buffer
3245  modifications. */
3246  ut_ad(mrec >= index->online_log->head.block);
3247  ut_ad(mrec_end == index->online_log->head.block
3248  + srv_sort_buf_size);
3249  ut_ad(index->online_log->head.bytes
3250  < srv_sort_buf_size);
3251 
3252  /* Take the opportunity to do a redo log
3253  checkpoint if needed. */
3254  log_free_check();
3255  } else {
3256  /* We are applying operations from the last block.
3257  Do not allow other threads to buffer anything,
3258  so that we can finally catch up and synchronize. */
3259  ut_ad(index->online_log->head.blocks == 0);
3260  ut_ad(index->online_log->tail.blocks == 0);
3261  ut_ad(mrec_end == index->online_log->tail.block
3262  + index->online_log->tail.bytes);
3263  ut_ad(mrec >= index->online_log->tail.block);
3264  }
3265 
3266  next_mrec = row_log_apply_op(
3267  index, dup, &error, offsets_heap, heap,
3268  has_index_lock, mrec, mrec_end, offsets);
3269 
3270  if (error != DB_SUCCESS) {
3271  goto func_exit;
3272  } else if (next_mrec == next_mrec_end) {
3273  /* The record happened to end on a block boundary.
3274  Do we have more blocks left? */
3275  if (has_index_lock) {
3276  /* The index will be locked while
3277  applying the last block. */
3278  goto all_done;
3279  }
3280 
3281  mrec = NULL;
3282 process_next_block:
3283  rw_lock_x_lock(dict_index_get_lock(index));
3284  has_index_lock = true;
3285 
3286  index->online_log->head.bytes = 0;
3287  index->online_log->head.blocks++;
3288  goto next_block;
3289  } else if (next_mrec != NULL) {
3290  ut_ad(next_mrec < next_mrec_end);
3291  index->online_log->head.bytes += next_mrec - mrec;
3292  } else if (has_index_lock) {
3293  /* When mrec is within tail.block, it should
3294  be a complete record, because we are holding
3295  index->lock and thus excluding the writer. */
3296  ut_ad(index->online_log->tail.blocks == 0);
3297  ut_ad(mrec_end == index->online_log->tail.block
3298  + index->online_log->tail.bytes);
3299  ut_ad(0);
3300  goto unexpected_eof;
3301  } else {
 /* Truncated record at end of a disk block: stash the
 fragment in head.buf; the next iteration of next_block
 completes it from the following block. */
3302  memcpy(index->online_log->head.buf, mrec,
3303  mrec_end - mrec);
3304  mrec_end += index->online_log->head.buf - mrec;
3305  mrec = index->online_log->head.buf;
3306  goto process_next_block;
3307  }
3308  }
3309 
3310 interrupted:
3311  error = DB_INTERRUPTED;
3312 func_exit:
3313  if (!has_index_lock) {
3314  rw_lock_x_lock(dict_index_get_lock(index));
3315  }
3316 
3317  switch (error) {
3318  case DB_SUCCESS:
3319  break;
3320  case DB_INDEX_CORRUPT:
3321  if (((os_offset_t) index->online_log->tail.blocks + 1)
 /* NOTE(review): dropped line 3322 -- the rest of this
 condition (a comparison against the online log size
 limit). */
3323  /* The log file grew too big. */
3324  error = DB_ONLINE_LOG_TOO_BIG;
3325  }
3326  /* fall through */
3327  default:
3328  /* We set the flag directly instead of invoking
3329  dict_set_corrupted_index_cache_only(index) here,
3330  because the index is not "public" yet. */
3331  index->type |= DICT_CORRUPT;
3332  }
3333 
3334  mem_heap_free(heap);
3335  mem_heap_free(offsets_heap);
3336  ut_free(offsets);
3337  return(error);
3338 }
3339 
3340 /******************************************************/
/* Apply the row log to a secondary index after its bulk build.
Takes index->lock in X mode, replays the log via row_log_apply_ops()
(skipped if the table is already corrupted), marks the index corrupted
and flags the table for drop on any error or duplicate key, then
detaches and frees the log.  Returns the final status (DB_DUPLICATE_KEY
if the replay itself succeeded but duplicates were reported).
NOTE(review): the doc extractor dropped line 3345 -- the
function-name line, presumably "row_log_apply(" -- plus lines 3372,
3383 and 3385; the last two are likely the calls that set the online
index status (aborted vs. complete) in the two branches below.
Confirm against row0log.cc. */
3343 UNIV_INTERN
3344 dberr_t
3346 /*==========*/
3347  trx_t* trx,
3349  dict_index_t* index,
3350  struct TABLE* table)
3352 {
3353  dberr_t error;
3354  row_log_t* log;
3355  row_merge_dup_t dup = { index, table, NULL, 0 };
3356  DBUG_ENTER("row_log_apply");
3357 
3359  ut_ad(!dict_index_is_clust(index));
3360 
3361  log_free_check();
3362 
3363  rw_lock_x_lock(dict_index_get_lock(index));
3364 
3365  if (!dict_table_is_corrupted(index->table)) {
3366  error = row_log_apply_ops(trx, index, &dup);
3367  } else {
3368  error = DB_SUCCESS;
3369  }
3370 
3371  if (error != DB_SUCCESS || dup.n_dup) {
3373  /* We set the flag directly instead of invoking
3374  dict_set_corrupted_index_cache_only(index) here,
3375  because the index is not "public" yet. */
3376  index->type |= DICT_CORRUPT;
3377  index->table->drop_aborted = TRUE;
3378 
3379  if (error == DB_SUCCESS) {
3380  error = DB_DUPLICATE_KEY;
3381  }
3382 
 /* NOTE(review): dropped line 3383 -- presumably the call
 setting the online status to aborted. */
3384  } else {
 /* NOTE(review): dropped line 3385 -- presumably the call
 setting the online status to complete. */
3386  }
3387 
 /* Detach the log before releasing the lock so no further
 records can be buffered, then free it outside the latch. */
3388  log = index->online_log;
3389  index->online_log = NULL;
3390  /* We could remove the TEMP_INDEX_PREFIX and update the data
3391  dictionary to say that this index is complete, if we had
3392  access to the .frm file here. If the server crashes before
3393  all requested indexes have been created, this completed index
3394  will be dropped. */
3395  rw_lock_x_unlock(dict_index_get_lock(index));
3396 
3397  row_log_free(log);
3398 
3399  DBUG_RETURN(error);
3400 }