MySQL 5.6.14 Source Code Document
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
row0merge.cc
Go to the documentation of this file.
1 /*****************************************************************************
2 
3 Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify it under
6 the terms of the GNU General Public License as published by the Free Software
7 Foundation; version 2 of the License.
8 
9 This program is distributed in the hope that it will be useful, but WITHOUT
10 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12 
13 You should have received a copy of the GNU General Public License along with
14 this program; if not, write to the Free Software Foundation, Inc.,
15 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
16 
17 *****************************************************************************/
18 
19 /**************************************************/
27 #include "row0merge.h"
28 #include "row0ext.h"
29 #include "row0log.h"
30 #include "row0ins.h"
31 #include "row0sel.h"
32 #include "dict0crea.h"
33 #include "trx0purge.h"
34 #include "lock0lock.h"
35 #include "pars0pars.h"
36 #include "ut0sort.h"
37 #include "row0ftsort.h"
38 #include "row0import.h"
39 #include "handler0alter.h"
40 #include "ha_prototypes.h"
41 
42 /* Ignore posix_fadvise() on those platforms where it does not exist */
/* posix_fadvise() is purely advisory (it only hints to the OS page
cache), so defining it away on Windows cannot affect correctness. */
43 #if defined __WIN__
44 # define posix_fadvise(fd, offset, len, advice) /* nothing */
45 #endif /* __WIN__ */
46 
47 #ifdef UNIV_DEBUG
48 
/* Debug-only switches that enable tracing of the merge sort to stderr.
The Doxygen doc comments were lost in extraction; the flags below gate
the fprintf() calls seen later in this file. */
49 /* @{ */
/* Log each record read from a merge file (see row_merge_read_rec). */
51 static ibool row_merge_print_read;
/* Log each record written to a merge file (see row_merge_buf_write
and row_merge_write_rec_low). */
53 static ibool row_merge_print_write;
/* NOTE(review): not referenced in the visible part of this file;
presumably logs merge block activity — confirm against full source. */
56 static ibool row_merge_print_block;
/* Log each block read (see row_merge_read). */
58 static ibool row_merge_print_block_read;
/* Log each block written (see row_merge_write). */
60 static ibool row_merge_print_block_write;
61 /* @} */
62 #endif /* UNIV_DEBUG */
63 
64 /* Whether to disable file system cache */
/* When set, merge-sort temporary files bypass the OS file cache
(configured via the innodb_disable_sort_file_cache system variable —
presumably; confirm against srv0srv.cc). */
65 UNIV_INTERN char srv_disable_sort_file_cache;
66 
67 #ifdef UNIV_DEBUG
68 /******************************************************/
70 static __attribute__((nonnull))
71 void
72 row_merge_tuple_print(
73 /*==================*/
74  FILE* f,
75  const mtuple_t* entry,
76  ulint n_fields)
77 {
78  ulint j;
79 
80  for (j = 0; j < n_fields; j++) {
81  const dfield_t* field = &entry->fields[j];
82 
83  if (dfield_is_null(field)) {
84  fputs("\n NULL;", f);
85  } else {
86  ulint field_len = dfield_get_len(field);
87  ulint len = ut_min(field_len, 20);
88  if (dfield_is_ext(field)) {
89  fputs("\nE", f);
90  } else {
91  fputs("\n ", f);
92  }
93  ut_print_buf(f, dfield_get_data(field), len);
94  if (len != field_len) {
95  fprintf(f, " (total %lu bytes)", field_len);
96  }
97  }
98  }
99  putc('\n', f);
100 }
101 #endif /* UNIV_DEBUG */
102 
103 /******************************************************/
/* Encode one merge tuple into the temporary-file record format:
a 1- or 2-byte encoded (extra_size + 1), then the record's "extra"
header bytes, then the data bytes.  Advances *b past the record. */
105 static __attribute__((nonnull))
106 void
107 row_merge_buf_encode(
108 /*=================*/
109  byte** b,
/* NOTE(review): the dict_index_t* index parameter (original lines
110-111) was lost in extraction; "index" is referenced below. */
112  const mtuple_t* entry,
114  ulint n_fields)
116 {
117  ulint size;
118  ulint extra_size;
119 
/* NOTE(review): the opening of this statement (original line 120,
presumably "size = rec_get_converted_size_temp(") was lost in
extraction. */
121  index, entry->fields, n_fields, &extra_size);
122  ut_ad(size >= extra_size);
123 
124  /* Encode extra_size + 1 */
125  if (extra_size + 1 < 0x80) {
126  *(*b)++ = (byte) (extra_size + 1);
127  } else {
/* Two-byte form: high bit set in the first byte, remaining 15 bits
hold the value.  A zero first byte is reserved as the end marker. */
128  ut_ad((extra_size + 1) < 0x8000);
129  *(*b)++ = (byte) (0x80 | ((extra_size + 1) >> 8));
130  *(*b)++ = (byte) (extra_size + 1);
131  }
132 
133  rec_convert_dtuple_to_temp(*b + extra_size, index,
134  entry->fields, n_fields);
135 
136  *b += size;
137 }
138 
139 /******************************************************/
/* Allocate and initialize a sort buffer of buf_size bytes on heap,
with room for max_tuples tuples plus an equally sized scratch array
(tmp_tuples) used by the merge sort. */
142 static __attribute__((malloc, nonnull))
/* NOTE(review): the return-type line (original 143, presumably
"row_merge_buf_t*") and the dict_index_t* index parameter (line 147)
were lost in extraction. */
144 row_merge_buf_create_low(
145 /*=====================*/
146  mem_heap_t* heap,
148  ulint max_tuples,
150  ulint buf_size)
152 {
/* NOTE(review): the local declaration (original line 153, presumably
"row_merge_buf_t* buf;") was lost in extraction. */
154 
155  ut_ad(max_tuples > 0);
156 
157  ut_ad(max_tuples <= srv_sort_buf_size);
158 
159  buf = static_cast<row_merge_buf_t*>(mem_heap_zalloc(heap, buf_size));
160  buf->heap = heap;
161  buf->index = index;
162  buf->max_tuples = max_tuples;
/* One ut_malloc'd allocation holds both tuples and tmp_tuples;
it is released in row_merge_buf_free(). */
163  buf->tuples = static_cast<mtuple_t*>(
164  ut_malloc(2 * max_tuples * sizeof *buf->tuples));
165  buf->tmp_tuples = buf->tuples + max_tuples;
166 
167  return(buf);
168 }
169 
170 /******************************************************/
/* Allocate a sort buffer for index, sized so that max_tuples minimal
records fit into srv_sort_buf_size bytes. */
173 UNIV_INTERN
/* NOTE(review): the return-type and name lines (original 174-175,
presumably "row_merge_buf_t*" and "row_merge_buf_create(") were lost
in extraction. */
176 /*=================*/
177  dict_index_t* index)
178 {
/* NOTE(review): the local declaration (original line 179, presumably
"row_merge_buf_t* buf;") was lost in extraction. */
180  ulint max_tuples;
181  ulint buf_size;
182  mem_heap_t* heap;
183 
184  max_tuples = srv_sort_buf_size
185  / ut_max(1, dict_index_get_min_size(index));
186 
187  buf_size = (sizeof *buf);
188 
189  heap = mem_heap_create(buf_size);
190 
191  buf = row_merge_buf_create_low(heap, index, max_tuples, buf_size);
192 
193  return(buf);
194 }
195 
196 /******************************************************/
/* Empty a sort buffer for reuse: reset the heap (discarding the field
data of the tuples) and rebuild the header, keeping the ut_malloc'd
tuple arrays. */
199 UNIV_INTERN
/* NOTE(review): the return-type and name lines (original 200-201,
presumably "row_merge_buf_t*" and "row_merge_buf_empty(") were lost
in extraction. */
202 /*================*/
203  row_merge_buf_t* buf)
204 {
/* Save the fields that survive mem_heap_empty() below. */
205  ulint buf_size = sizeof *buf;
206  ulint max_tuples = buf->max_tuples;
207  mem_heap_t* heap = buf->heap;
208  dict_index_t* index = buf->index;
209  mtuple_t* tuples = buf->tuples;
210 
211  mem_heap_empty(heap);
212 
/* The buffer header itself lives in the heap, so reallocate it
(zero-filled: n_tuples and total_size restart at 0). */
213  buf = static_cast<row_merge_buf_t*>(mem_heap_zalloc(heap, buf_size));
214  buf->heap = heap;
215  buf->index = index;
216  buf->max_tuples = max_tuples;
217  buf->tuples = tuples;
218  buf->tmp_tuples = buf->tuples + max_tuples;
219 
220  return(buf);
221 }
222 
223 /******************************************************/
/* Deallocate a sort buffer: free the ut_malloc'd tuple arrays, then
the heap (which also owns the buffer header itself). */
225 UNIV_INTERN
226 void
/* NOTE(review): the function name line (original 227, presumably
"row_merge_buf_free(") was lost in extraction. */
228 /*===============*/
229  row_merge_buf_t* buf)
230 {
231  ut_free(buf->tuples);
232  mem_heap_free(buf->heap);
233 }
234 
235 /******************************************************/
/* Append one row to a sort buffer, copying and converting the needed
columns.  For FTS indexes the row is instead tokenized by handing it
to a parallel-sort thread.  Returns the number of rows added (0 or 1);
0 also means "buffer full, flush and retry". */
238 static
239 ulint
240 row_merge_buf_add(
241 /*==============*/
242  row_merge_buf_t* buf,
243  dict_index_t* fts_index,
244  const dict_table_t* old_table,
245  fts_psort_t* psort_info,
246  const dtuple_t* row,
247  const row_ext_t* ext,
249  doc_id_t* doc_id)
251 {
252  ulint i;
253  const dict_index_t* index;
254  mtuple_t* entry;
255  dfield_t* field;
256  const dict_field_t* ifield;
257  ulint n_fields;
258  ulint data_size;
259  ulint extra_size;
260  ulint bucket = 0;
261  doc_id_t write_doc_id;
262  ulint n_row_added = 0;
263  DBUG_ENTER("row_merge_buf_add");
264 
265  if (buf->n_tuples >= buf->max_tuples) {
266  DBUG_RETURN(0);
267  }
268 
269  DBUG_EXECUTE_IF(
270  "ib_row_merge_buf_add_two",
271  if (buf->n_tuples >= 2) DBUG_RETURN(0););
272 
273  UNIV_PREFETCH_R(row->fields);
274 
275  /* If we are building FTS index, buf->index points to
276  the 'fts_sort_idx', and real FTS index is stored in
277  fts_index */
278  index = (buf->index->type & DICT_FTS) ? fts_index : buf->index;
279 
280  n_fields = dict_index_get_n_fields(index);
281 
282  entry = &buf->tuples[buf->n_tuples];
283  field = entry->fields = static_cast<dfield_t*>(
284  mem_heap_alloc(buf->heap, n_fields * sizeof *entry->fields));
285 
/* extra_size starts with the null-flags bitmap of the record. */
286  data_size = 0;
287  extra_size = UT_BITS_IN_BYTES(index->n_nullable);
288 
289  ifield = dict_index_get_nth_field(index, 0);
290 
291  for (i = 0; i < n_fields; i++, field++, ifield++) {
292  ulint len;
293  const dict_col_t* col;
294  ulint col_no;
295  ulint fixed_len;
296  const dfield_t* row_field;
297 
298  col = ifield->col;
299  col_no = dict_col_get_no(col);
300 
301  /* Process the Doc ID column */
302  if (*doc_id > 0
303  && col_no == index->table->fts->doc_col) {
304  fts_write_doc_id((byte*) &write_doc_id, *doc_id);
305 
306  /* Note: field->data now points to a value on the
307  stack: &write_doc_id after dfield_set_data(). Because
308  there is only one doc_id per row, it shouldn't matter.
309  We allocate a new buffer before we leave the function
310  later below. */
311 
/* NOTE(review): the call opening this statement (original line 312,
presumably "dfield_set_data(") was lost in extraction. */
313  field, &write_doc_id, sizeof(write_doc_id));
314 
315  field->type.mtype = ifield->col->mtype;
316  field->type.prtype = ifield->col->prtype;
317  field->type.mbminmaxlen = DATA_MBMINMAXLEN(0, 0);
318  field->type.len = ifield->col->len;
319  } else {
320  row_field = dtuple_get_nth_field(row, col_no);
321 
322  dfield_copy(field, row_field);
323 
324  /* Tokenize and process data for FTS */
325  if (index->type & DICT_FTS) {
326  fts_doc_item_t* doc_item;
327  byte* value;
328 
329  /* fetch Doc ID if it already exists
330  in the row, and not supplied by the
331  caller. Even if the value column is
332  NULL, we still need to get the Doc
333  ID so to maintain the correct max
334  Doc ID */
335  if (*doc_id == 0) {
336  const dfield_t* doc_field;
337  doc_field = dtuple_get_nth_field(
338  row,
339  index->table->fts->doc_col);
340  *doc_id = (doc_id_t) mach_read_from_8(
341  static_cast<byte*>(
342  dfield_get_data(doc_field)));
343 
344  if (*doc_id == 0) {
345  ib_logf(IB_LOG_LEVEL_WARN,
346  "FTS Doc ID is zero. "
347  "Record Skipped");
348  DBUG_RETURN(0);
349  }
350  }
351 
352  if (dfield_is_null(field)) {
353  n_row_added = 1;
354  continue;
355  }
356 
357  doc_item = static_cast<fts_doc_item_t*>(
/* NOTE(review): the call opening this allocation (original line 358,
presumably "mem_heap_alloc(") was lost in extraction. */
359  buf->heap,
360  sizeof(*doc_item)));
361 
/* Deep-copy the column value: the doc_item outlives this row. */
362  value = static_cast<byte*>(
363  ut_malloc(field->len));
364  memcpy(value, field->data, field->len);
365  field->data = value;
366 
367  doc_item->field = field;
368  doc_item->doc_id = *doc_id;
369 
/* Hand the document to one of the parallel FTS sort threads,
chosen by hashing the Doc ID. */
370  bucket = *doc_id % fts_sort_pll_degree;
371 
/* NOTE(review): the call opening this statement (original line 372,
presumably "UT_LIST_ADD_LAST(") was lost in extraction. */
373  doc_list,
374  psort_info[bucket].fts_doc_list,
375  doc_item);
376  n_row_added = 1;
377  continue;
378  }
379  }
380 
381  len = dfield_get_len(field);
382 
383  if (dfield_is_null(field)) {
384  ut_ad(!(col->prtype & DATA_NOT_NULL));
385  continue;
386  } else if (!ext) {
387  } else if (dict_index_is_clust(index)) {
388  /* Flag externally stored fields. */
389  const byte* buf = row_ext_lookup(ext, col_no,
390  &len);
391  if (UNIV_LIKELY_NULL(buf)) {
392  ut_a(buf != field_ref_zero);
393  if (i < dict_index_get_n_unique(index)) {
394  dfield_set_data(field, buf, len);
395  } else {
396  dfield_set_ext(field);
397  len = dfield_get_len(field);
398  }
399  }
400  } else {
401  const byte* buf = row_ext_lookup(ext, col_no,
402  &len);
403  if (UNIV_LIKELY_NULL(buf)) {
404  ut_a(buf != field_ref_zero);
405  dfield_set_data(field, buf, len);
406  }
407  }
408 
409  /* If a column prefix index, take only the prefix */
410 
411  if (ifield->prefix_len) {
/* NOTE(review): the call opening this statement (original line 412,
presumably "len = dtype_get_at_most_n_mbchars(") was lost in
extraction. */
413  col->prtype,
414  col->mbminmaxlen,
415  ifield->prefix_len,
416  len,
417  static_cast<char*>(dfield_get_data(field)));
418  dfield_set_len(field, len);
419  }
420 
421  ut_ad(len <= col->len || col->mtype == DATA_BLOB);
422 
423  fixed_len = ifield->fixed_len;
424  if (fixed_len && !dict_table_is_comp(index->table)
425  && DATA_MBMINLEN(col->mbminmaxlen)
426  != DATA_MBMAXLEN(col->mbminmaxlen)) {
427  /* CHAR in ROW_FORMAT=REDUNDANT is always
428  fixed-length, but in the temporary file it is
429  variable-length for variable-length character
430  sets. */
431  fixed_len = 0;
432  }
433 
434  if (fixed_len) {
435 #ifdef UNIV_DEBUG
436  ulint mbminlen = DATA_MBMINLEN(col->mbminmaxlen);
437  ulint mbmaxlen = DATA_MBMAXLEN(col->mbminmaxlen);
438 
439  /* len should be between the sizes calculated based on
440  mbmaxlen and mbminlen */
441  ut_ad(len <= fixed_len);
442  ut_ad(!mbmaxlen || len >= mbminlen
443  * (fixed_len / mbmaxlen));
444 
445  ut_ad(!dfield_is_ext(field));
446 #endif /* UNIV_DEBUG */
447  } else if (dfield_is_ext(field)) {
448  extra_size += 2;
449  } else if (len < 128
450  || (col->len < 256 && col->mtype != DATA_BLOB)) {
451  extra_size++;
452  } else {
453  /* For variable-length columns, we look up the
454  maximum length from the column itself. If this
455  is a prefix index column shorter than 256 bytes,
456  this will waste one byte. */
457  extra_size += 2;
458  }
459  data_size += len;
460  }
461 
462  /* If this is FTS index, we already populated the sort buffer, return
463  here */
464  if (index->type & DICT_FTS) {
465  DBUG_RETURN(n_row_added);
466  }
467 
468 #ifdef UNIV_DEBUG
469  {
470  ulint size;
471  ulint extra;
472 
/* NOTE(review): the call opening this statement (original line 473,
presumably "size = rec_get_converted_size_temp(") was lost in
extraction. */
474  index, entry->fields, n_fields, &extra);
475 
476  ut_ad(data_size + extra_size == size);
477  ut_ad(extra_size == extra);
478  }
479 #endif /* UNIV_DEBUG */
480 
481  /* Add to the total size of the record in row_merge_block_t
482  the encoded length of extra_size and the extra bytes (extra_size).
483  See row_merge_buf_write() for the variable-length encoding
484  of extra_size. */
485  data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80);
486 
487  ut_ad(data_size < srv_sort_buf_size);
488 
489  /* Reserve one byte for the end marker of row_merge_block_t. */
490  if (buf->total_size + data_size >= srv_sort_buf_size - 1) {
491  DBUG_RETURN(0);
492  }
493 
494  buf->total_size += data_size;
495  buf->n_tuples++;
496  n_row_added++;
497 
498  field = entry->fields;
499 
500  /* Copy the data fields. */
501 
502  do {
503  dfield_dup(field++, buf->heap);
504  } while (--n_fields);
505 
506  DBUG_RETURN(n_row_added);
507 }
508 
509 /*************************************************************/
/* Report a duplicate key during an index build: count every
duplicate, but convert and expose only the first one to MySQL. */
511 UNIV_INTERN
512 void
/* NOTE(review): the function name line (original 513, presumably
"row_merge_dup_report(") was lost in extraction. */
514 /*=================*/
515  row_merge_dup_t* dup,
516  const dfield_t* entry)
517 {
518  if (!dup->n_dup++) {
519  /* Only report the first duplicate record,
520  but count all duplicate records. */
521  innobase_fields_to_mysql(dup->table, dup->index, entry);
522  }
523 }
524 
525 /*************************************************************/
528 static __attribute__((warn_unused_result))
529 int
530 row_merge_tuple_cmp(
531 /*================*/
532  ulint n_uniq,
533  ulint n_field,
534  const mtuple_t& a,
535  const mtuple_t& b,
536  row_merge_dup_t* dup)
538 {
539  int cmp;
540  const dfield_t* af = a.fields;
541  const dfield_t* bf = b.fields;
542  ulint n = n_uniq;
543 
544  ut_ad(n_uniq > 0);
545  ut_ad(n_uniq <= n_field);
546 
547  /* Compare the fields of the tuples until a difference is
548  found or we run out of fields to compare. If !cmp at the
549  end, the tuples are equal. */
550  do {
551  cmp = cmp_dfield_dfield(af++, bf++);
552  } while (!cmp && --n);
553 
554  if (cmp) {
555  return(cmp);
556  }
557 
558  if (dup) {
559  /* Report a duplicate value error if the tuples are
560  logically equal. NULL columns are logically inequal,
561  although they are equal in the sorting order. Find
562  out if any of the fields are NULL. */
563  for (const dfield_t* df = a.fields; df != af; df++) {
564  if (dfield_is_null(df)) {
565  goto no_report;
566  }
567  }
568 
569  row_merge_dup_report(dup, a.fields);
570  }
571 
572 no_report:
573  /* The n_uniq fields were equal, but we compare all fields so
574  that we will get the same (internal) order as in the B-tree. */
575  for (n = n_field - n_uniq + 1; --n; ) {
576  cmp = cmp_dfield_dfield(af++, bf++);
577  if (cmp) {
578  return(cmp);
579  }
580  }
581 
582  /* This should never be reached, except in a secondary index
583  when creating a secondary index and a PRIMARY KEY, and there
584  is a duplicate in the PRIMARY KEY that has not been detected
585  yet. Internally, an index must never contain duplicates. */
586  return(cmp);
587 }
588 
/* Context wrappers: UT_SORT_FUNCTION_BODY expects callbacks taking
only the positional arguments, so these macros capture the enclosing
function's n_uniq, n_field and dup by name. */
595 #define row_merge_tuple_sort_ctx(tuples, aux, low, high) \
596  row_merge_tuple_sort(n_uniq, n_field, dup, tuples, aux, low, high)
597 
602 #define row_merge_tuple_cmp_ctx(a,b) \
603  row_merge_tuple_cmp(n_uniq, n_field, a, b, dup)
604 
605 /**********************************************************************/
/* Merge-sort tuples[low..high) using aux[] as scratch space.  The
body is generated by the UT_SORT_FUNCTION_BODY macro, recursing via
the _ctx wrapper macros defined above. */
607 static __attribute__((nonnull(4,5)))
608 void
609 row_merge_tuple_sort(
610 /*=================*/
611  ulint n_uniq,
612  ulint n_field,
613  row_merge_dup_t* dup,
615  mtuple_t* tuples,
616  mtuple_t* aux,
617  ulint low,
619  ulint high)
621 {
622  ut_ad(n_field > 0);
623  ut_ad(n_uniq <= n_field);
624 
625  UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx,
626  tuples, aux, low, high, row_merge_tuple_cmp_ctx);
627 }
628 
629 /******************************************************/
/* Sort a sort buffer in place, reporting duplicates via dup if it is
not NULL. */
631 UNIV_INTERN
632 void
/* NOTE(review): the function name line (original 633, presumably
"row_merge_buf_sort(") was lost in extraction. */
634 /*===============*/
635  row_merge_buf_t* buf,
636  row_merge_dup_t* dup)
638 {
639  row_merge_tuple_sort(dict_index_get_n_unique(buf->index),
/* NOTE(review): one argument line (original 640, presumably
"dict_index_get_n_fields(buf->index)," — the n_field parameter) was
lost in extraction. */
641  dup,
642  buf->tuples, buf->tmp_tuples, 0, buf->n_tuples);
643 }
644 
645 /******************************************************/
/* Serialize all tuples of a sort buffer into a merge block and append
the end-of-chunk marker byte (0).  The "of" parameter is used only for
debug tracing. */
647 UNIV_INTERN
648 void
/* NOTE(review): the function name line (original 649, presumably
"row_merge_buf_write(") was lost in extraction. */
650 /*================*/
651  const row_merge_buf_t* buf,
652  const merge_file_t* of UNIV_UNUSED,
/* NOTE(review): the output-block parameter (original lines 653-654,
presumably "row_merge_block_t* block)") was lost in extraction;
"block" is referenced below. */
655 {
656  const dict_index_t* index = buf->index;
657  ulint n_fields= dict_index_get_n_fields(index);
658  byte* b = &block[0];
659 
660  for (ulint i = 0; i < buf->n_tuples; i++) {
661  const mtuple_t* entry = &buf->tuples[i];
662 
663  row_merge_buf_encode(&b, index, entry, n_fields);
664  ut_ad(b < &block[srv_sort_buf_size]);
665 #ifdef UNIV_DEBUG
666  if (row_merge_print_write) {
667  fprintf(stderr, "row_merge_buf_write %p,%d,%lu %lu",
668  (void*) b, of->fd, (ulong) of->offset,
669  (ulong) i);
670  row_merge_tuple_print(stderr, entry, n_fields);
671  }
672 #endif /* UNIV_DEBUG */
673  }
674 
675  /* Write an "end-of-chunk" marker. */
676  ut_a(b < &block[srv_sort_buf_size]);
677  ut_a(b == &block[0] + buf->total_size);
678  *b++ = 0;
679 #ifdef UNIV_DEBUG_VALGRIND
680  /* The rest of the block is uninitialized. Initialize it
681  to avoid bogus warnings. */
682  memset(b, 0xff, &block[srv_sort_buf_size] - b);
683 #endif /* UNIV_DEBUG_VALGRIND */
684 #ifdef UNIV_DEBUG
685  if (row_merge_print_write) {
686  fprintf(stderr, "row_merge_buf_write %p,%d,%lu EOF\n",
687  (void*) b, of->fd, (ulong) of->offset);
688  }
689 #endif /* UNIV_DEBUG */
690 }
691 
692 /******************************************************/
696 static
697 mem_heap_t*
698 row_merge_heap_create(
699 /*==================*/
700  const dict_index_t* index,
701  mrec_buf_t** buf,
702  ulint** offsets1,
703  ulint** offsets2)
704 {
705  ulint i = 1 + REC_OFFS_HEADER_SIZE
706  + dict_index_get_n_fields(index);
707  mem_heap_t* heap = mem_heap_create(2 * i * sizeof **offsets1
708  + 3 * sizeof **buf);
709 
710  *buf = static_cast<mrec_buf_t*>(
711  mem_heap_alloc(heap, 3 * sizeof **buf));
712  *offsets1 = static_cast<ulint*>(
713  mem_heap_alloc(heap, i * sizeof **offsets1));
714  *offsets2 = static_cast<ulint*>(
715  mem_heap_alloc(heap, i * sizeof **offsets2));
716 
717  (*offsets1)[0] = (*offsets2)[0] = i;
718  (*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index);
719 
720  return(heap);
721 }
722 
723 /********************************************************************/
726 UNIV_INTERN
727 ibool
729 /*===========*/
730  int fd,
731  ulint offset,
734  row_merge_block_t* buf)
735 {
736  os_offset_t ofs = ((os_offset_t) offset) * srv_sort_buf_size;
737  ibool success;
738 
739  DBUG_EXECUTE_IF("row_merge_read_failure", return(FALSE););
740 
741 #ifdef UNIV_DEBUG
742  if (row_merge_print_block_read) {
743  fprintf(stderr, "row_merge_read fd=%d ofs=%lu\n",
744  fd, (ulong) offset);
745  }
746 #endif /* UNIV_DEBUG */
747 
748 #ifdef UNIV_DEBUG
749  if (row_merge_print_block_read) {
750  fprintf(stderr, "row_merge_read fd=%d ofs=%lu\n",
751  fd, (ulong) offset);
752  }
753 #endif /* UNIV_DEBUG */
754 
755  success = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf,
756  ofs, srv_sort_buf_size);
757 #ifdef POSIX_FADV_DONTNEED
758  /* Each block is read exactly once. Free up the file cache. */
759  posix_fadvise(fd, ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED);
760 #endif /* POSIX_FADV_DONTNEED */
761 
762  if (UNIV_UNLIKELY(!success)) {
763  ut_print_timestamp(stderr);
764  fprintf(stderr,
765  " InnoDB: failed to read merge block at "UINT64PF"\n",
766  ofs);
767  }
768 
769  return(UNIV_LIKELY(success));
770 }
771 
772 /********************************************************************/
775 UNIV_INTERN
776 ibool
778 /*============*/
779  int fd,
780  ulint offset,
782  const void* buf)
783 {
784  size_t buf_len = srv_sort_buf_size;
785  os_offset_t ofs = buf_len * (os_offset_t) offset;
786  ibool ret;
787 
788  DBUG_EXECUTE_IF("row_merge_write_failure", return(FALSE););
789 
790  ret = os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf, ofs, buf_len);
791 
792 #ifdef UNIV_DEBUG
793  if (row_merge_print_block_write) {
794  fprintf(stderr, "row_merge_write fd=%d ofs=%lu\n",
795  fd, (ulong) offset);
796  }
797 #endif /* UNIV_DEBUG */
798 
799 #ifdef POSIX_FADV_DONTNEED
800  /* The block will be needed on the next merge pass,
801  but it can be evicted from the file cache meanwhile. */
802  posix_fadvise(fd, ofs, buf_len, POSIX_FADV_DONTNEED);
803 #endif /* POSIX_FADV_DONTNEED */
804 
805  return(UNIV_LIKELY(ret));
806 }
807 
808 /********************************************************************/
/* Decode the next record from a merge block, refilling the block from
the file when the record spans a block boundary (in which case the
record is assembled in *buf).  Returns the position after the record,
or NULL at end-of-list or on I/O error (*mrec distinguishes the two:
NULL means end-of-list, non-NULL means I/O error). */
811 UNIV_INTERN
812 const byte*
/* NOTE(review): the function name line (original 813, presumably
"row_merge_read_rec(" judging by the decorative comment width) was
lost in extraction. */
814 /*===============*/
815  row_merge_block_t* block,
816  mrec_buf_t* buf,
817  const byte* b,
818  const dict_index_t* index,
819  int fd,
820  ulint* foffs,
821  const mrec_t** mrec,
824  ulint* offsets)
825 {
826  ulint extra_size;
827  ulint data_size;
828  ulint avail_size;
829 
830  ut_ad(block);
831  ut_ad(buf);
832  ut_ad(b >= &block[0]);
833  ut_ad(b < &block[srv_sort_buf_size]);
834  ut_ad(index);
835  ut_ad(foffs);
836  ut_ad(mrec);
837  ut_ad(offsets);
838 
839  ut_ad(*offsets == 1 + REC_OFFS_HEADER_SIZE
840  + dict_index_get_n_fields(index));
841 
/* First byte: encoded extra_size + 1 (see row_merge_buf_encode). */
842  extra_size = *b++;
843 
844  if (UNIV_UNLIKELY(!extra_size)) {
845  /* End of list */
846  *mrec = NULL;
847 #ifdef UNIV_DEBUG
848  if (row_merge_print_read) {
849  fprintf(stderr, "row_merge_read %p,%p,%d,%lu EOF\n",
850  (const void*) b, (const void*) block,
851  fd, (ulong) *foffs);
852  }
853 #endif /* UNIV_DEBUG */
854  return(NULL);
855  }
856 
857  if (extra_size >= 0x80) {
858  /* Read another byte of extra_size. */
859 
860  if (UNIV_UNLIKELY(b >= &block[srv_sort_buf_size])) {
861  if (!row_merge_read(fd, ++(*foffs), block)) {
862 err_exit:
863  /* Signal I/O error. */
864  *mrec = b;
865  return(NULL);
866  }
867 
868  /* Wrap around to the beginning of the buffer. */
869  b = &block[0];
870  }
871 
872  extra_size = (extra_size & 0x7f) << 8;
873  extra_size |= *b++;
874  }
875 
876  /* Normalize extra_size. Above, value 0 signals "end of list". */
877  extra_size--;
878 
879  /* Read the extra bytes. */
880 
881  if (UNIV_UNLIKELY(b + extra_size >= &block[srv_sort_buf_size])) {
882  /* The record spans two blocks. Copy the entire record
883  to the auxiliary buffer and handle this as a special
884  case. */
885 
886  avail_size = &block[srv_sort_buf_size] - b;
887  ut_ad(avail_size < sizeof *buf);
888  memcpy(*buf, b, avail_size);
889 
890  if (!row_merge_read(fd, ++(*foffs), block)) {
891 
892  goto err_exit;
893  }
894 
895  /* Wrap around to the beginning of the buffer. */
896  b = &block[0];
897 
898  /* Copy the record. */
899  memcpy(*buf + avail_size, b, extra_size - avail_size);
900  b += extra_size - avail_size;
901 
902  *mrec = *buf + extra_size;
903 
904  rec_init_offsets_temp(*mrec, index, offsets);
905 
906  data_size = rec_offs_data_size(offsets);
907 
908  /* These overflows should be impossible given that
909  records are much smaller than either buffer, and
910  the record starts near the beginning of each buffer. */
911  ut_a(extra_size + data_size < sizeof *buf);
912  ut_a(b + data_size < &block[srv_sort_buf_size]);
913 
914  /* Copy the data bytes. */
915  memcpy(*buf + extra_size, b, data_size);
916  b += data_size;
917 
918  goto func_exit;
919  }
920 
921  *mrec = b + extra_size;
922 
923  rec_init_offsets_temp(*mrec, index, offsets);
924 
925  data_size = rec_offs_data_size(offsets);
926  ut_ad(extra_size + data_size < sizeof *buf);
927 
928  b += extra_size + data_size;
929 
930  if (UNIV_LIKELY(b < &block[srv_sort_buf_size])) {
931  /* The record fits entirely in the block.
932  This is the normal case. */
933  goto func_exit;
934  }
935 
936  /* The record spans two blocks. Copy it to buf. */
937 
938  b -= extra_size + data_size;
939  avail_size = &block[srv_sort_buf_size] - b;
940  memcpy(*buf, b, avail_size);
941  *mrec = *buf + extra_size;
942 #ifdef UNIV_DEBUG
943  /* We cannot invoke rec_offs_make_valid() here, because there
944  are no REC_N_NEW_EXTRA_BYTES between extra_size and data_size.
945  Similarly, rec_offs_validate() would fail, because it invokes
946  rec_get_status(). */
947  offsets[2] = (ulint) *mrec;
948  offsets[3] = (ulint) index;
949 #endif /* UNIV_DEBUG */
950 
951  if (!row_merge_read(fd, ++(*foffs), block)) {
952 
953  goto err_exit;
954  }
955 
956  /* Wrap around to the beginning of the buffer. */
957  b = &block[0];
958 
959  /* Copy the rest of the record. */
960  memcpy(*buf + avail_size, b, extra_size + data_size - avail_size);
961  b += extra_size + data_size - avail_size;
962 
963 func_exit:
964 #ifdef UNIV_DEBUG
965  if (row_merge_print_read) {
966  fprintf(stderr, "row_merge_read %p,%p,%d,%lu ",
967  (const void*) b, (const void*) block,
968  fd, (ulong) *foffs);
969  rec_print_comp(stderr, *mrec, offsets);
970  putc('\n', stderr);
971  }
972 #endif /* UNIV_DEBUG */
973 
974  return(b);
975 }
976 
977 /********************************************************************/
/* Write one record into a contiguous buffer: the 1- or 2-byte encoded
extra_size e (= rec_offs_extra_size() + 1), then the record's extra
and data bytes.  The size/fd/foffs parameters exist only for the debug
trace; in release builds the macro below redirects callers to the
4-argument form, so all call sites can pass 7 arguments. */
979 static
980 void
981 row_merge_write_rec_low(
982 /*====================*/
983  byte* b,
984  ulint e,
985 #ifdef UNIV_DEBUG
986  ulint size,
987  int fd,
988  ulint foffs,
989 #endif /* UNIV_DEBUG */
990  const mrec_t* mrec,
991  const ulint* offsets)
992 #ifndef UNIV_DEBUG
993 # define row_merge_write_rec_low(b, e, size, fd, foffs, mrec, offsets) \
994  row_merge_write_rec_low(b, e, mrec, offsets)
995 #endif /* !UNIV_DEBUG */
996 {
997 #ifdef UNIV_DEBUG
998  const byte* const end = b + size;
999  ut_ad(e == rec_offs_extra_size(offsets) + 1);
1000 
1001  if (row_merge_print_write) {
1002  fprintf(stderr, "row_merge_write %p,%d,%lu ",
1003  (void*) b, fd, (ulong) foffs);
1004  rec_print_comp(stderr, mrec, offsets);
1005  putc('\n', stderr);
1006  }
1007 #endif /* UNIV_DEBUG */
1008 
/* Same variable-length encoding as row_merge_buf_encode(). */
1009  if (e < 0x80) {
1010  *b++ = (byte) e;
1011  } else {
1012  *b++ = (byte) (0x80 | (e >> 8));
1013  *b++ = (byte) e;
1014  }
1015 
/* mrec points at the data part; the extra bytes precede it. */
1016  memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets));
1017  ut_ad(b + rec_offs_size(offsets) == end);
1018 }
1019 
1020 /********************************************************************/
/* Append a record to a merge block, flushing the block to the file
and wrapping to its start when the record straddles the block
boundary.  Returns the new write position, or NULL on write error. */
1023 static
1024 byte*
1025 row_merge_write_rec(
1026 /*================*/
1027  row_merge_block_t* block,
1028  mrec_buf_t* buf,
1029  byte* b,
1030  int fd,
1031  ulint* foffs,
1032  const mrec_t* mrec,
1033  const ulint* offsets)
1034 {
1035  ulint extra_size;
1036  ulint size;
1037  ulint avail_size;
1038 
1039  ut_ad(block);
1040  ut_ad(buf);
1041  ut_ad(b >= &block[0]);
1042  ut_ad(b < &block[srv_sort_buf_size]);
1043  ut_ad(mrec);
1044  ut_ad(foffs);
/* mrec must not alias the output block or the scratch buffer. */
1045  ut_ad(mrec < &block[0] || mrec > &block[srv_sort_buf_size]);
1046  ut_ad(mrec < buf[0] || mrec > buf[1]);
1047 
1048  /* Normalize extra_size. Value 0 signals "end of list". */
1049  extra_size = rec_offs_extra_size(offsets) + 1;
1050 
/* Total on-disk size: encoded length prefix (1 or 2 bytes) plus
the record's extra and data bytes. */
1051  size = extra_size + (extra_size >= 0x80)
1052  + rec_offs_data_size(offsets);
1053 
1054  if (UNIV_UNLIKELY(b + size >= &block[srv_sort_buf_size])) {
1055  /* The record spans two blocks.
1056  Copy it to the temporary buffer first. */
1057  avail_size = &block[srv_sort_buf_size] - b;
1058 
1059  row_merge_write_rec_low(buf[0],
1060  extra_size, size, fd, *foffs,
1061  mrec, offsets);
1062 
1063  /* Copy the head of the temporary buffer, write
1064  the completed block, and copy the tail of the
1065  record to the head of the new block. */
1066  memcpy(b, buf[0], avail_size);
1067 
1068  if (!row_merge_write(fd, (*foffs)++, block)) {
1069  return(NULL);
1070  }
1071 
1072  UNIV_MEM_INVALID(&block[0], srv_sort_buf_size);
1073 
1074  /* Copy the rest. */
1075  b = &block[0];
1076  memcpy(b, buf[0] + avail_size, size - avail_size);
1077  b += size - avail_size;
1078  } else {
1079  row_merge_write_rec_low(b, extra_size, size, fd, *foffs,
1080  mrec, offsets);
1081  b += size;
1082  }
1083 
1084  return(b);
1085 }
1086 
1087 /********************************************************************/
1090 static
1091 byte*
1092 row_merge_write_eof(
1093 /*================*/
1094  row_merge_block_t* block,
1095  byte* b,
1096  int fd,
1097  ulint* foffs)
1098 {
1099  ut_ad(block);
1100  ut_ad(b >= &block[0]);
1101  ut_ad(b < &block[srv_sort_buf_size]);
1102  ut_ad(foffs);
1103 #ifdef UNIV_DEBUG
1104  if (row_merge_print_write) {
1105  fprintf(stderr, "row_merge_write %p,%p,%d,%lu EOF\n",
1106  (void*) b, (void*) block, fd, (ulong) *foffs);
1107  }
1108 #endif /* UNIV_DEBUG */
1109 
1110  *b++ = 0;
1111  UNIV_MEM_ASSERT_RW(&block[0], b - &block[0]);
1112  UNIV_MEM_ASSERT_W(&block[0], srv_sort_buf_size);
1113 #ifdef UNIV_DEBUG_VALGRIND
1114  /* The rest of the block is uninitialized. Initialize it
1115  to avoid bogus warnings. */
1116  memset(b, 0xff, &block[srv_sort_buf_size] - b);
1117 #endif /* UNIV_DEBUG_VALGRIND */
1118 
1119  if (!row_merge_write(fd, (*foffs)++, block)) {
1120  return(NULL);
1121  }
1122 
1123  UNIV_MEM_INVALID(&block[0], srv_sort_buf_size);
1124  return(&block[0]);
1125 }
1126 
1127 /********************************************************************/
1131 static __attribute__((nonnull(1,2,3,4,6,9,10,16), warn_unused_result))
1132 dberr_t
1133 row_merge_read_clustered_index(
1134 /*===========================*/
1135  trx_t* trx,
1136  struct TABLE* table,
1138  const dict_table_t* old_table,
1140  const dict_table_t* new_table,
1143  bool online,
1145  dict_index_t** index,
1146  dict_index_t* fts_sort_idx,
1149  fts_psort_t* psort_info,
1152  merge_file_t* files,
1153  const ulint* key_numbers,
1155  ulint n_index,
1156  const dtuple_t* add_cols,
1159  const ulint* col_map,
1162  ulint add_autoinc,
1166  ib_sequence_t& sequence,
1167  row_merge_block_t* block)
1168 {
1169  dict_index_t* clust_index; /* Clustered index */
1170  mem_heap_t* row_heap; /* Heap memory to create
1171  clustered index tuples */
1172  row_merge_buf_t** merge_buf; /* Temporary list for records*/
1173  btr_pcur_t pcur; /* Cursor on the clustered
1174  index */
1175  mtr_t mtr; /* Mini transaction */
1176  dberr_t err = DB_SUCCESS;/* Return code */
1177  ulint n_nonnull = 0; /* number of columns
1178  changed to NOT NULL */
1179  ulint* nonnull = NULL; /* NOT NULL columns */
1180  dict_index_t* fts_index = NULL;/* FTS index */
1181  doc_id_t doc_id = 0;
1182  doc_id_t max_doc_id = 0;
1183  ibool add_doc_id = FALSE;
1184  os_event_t fts_parallel_sort_event = NULL;
1185  ibool fts_pll_sort = FALSE;
1186  ib_int64_t sig_count = 0;
1187  DBUG_ENTER("row_merge_read_clustered_index");
1188 
1189  ut_ad((old_table == new_table) == !col_map);
1190  ut_ad(!add_cols || col_map);
1191 
1192  trx->op_info = "reading clustered index";
1193 
1194 #ifdef FTS_INTERNAL_DIAG_PRINT
1195  DEBUG_FTS_SORT_PRINT("FTS_SORT: Start Create Index\n");
1196 #endif
1197 
1198  /* Create and initialize memory for record buffers */
1199 
1200  merge_buf = static_cast<row_merge_buf_t**>(
1201  mem_alloc(n_index * sizeof *merge_buf));
1202 
1203  for (ulint i = 0; i < n_index; i++) {
1204  if (index[i]->type & DICT_FTS) {
1205 
1206  /* We are building a FT index, make sure
1207  we have the temporary 'fts_sort_idx' */
1208  ut_a(fts_sort_idx);
1209 
1210  fts_index = index[i];
1211 
1212  merge_buf[i] = row_merge_buf_create(fts_sort_idx);
1213 
1214  add_doc_id = DICT_TF2_FLAG_IS_SET(
1215  new_table, DICT_TF2_FTS_ADD_DOC_ID);
1216 
1217  /* If Doc ID does not exist in the table itself,
1218  fetch the first FTS Doc ID */
1219  if (add_doc_id) {
1221  (dict_table_t*) new_table,
1222  &doc_id);
1223  ut_ad(doc_id > 0);
1224  }
1225 
1226  fts_pll_sort = TRUE;
1227  row_fts_start_psort(psort_info);
1228  fts_parallel_sort_event =
1229  psort_info[0].psort_common->sort_event;
1230  } else {
1231  merge_buf[i] = row_merge_buf_create(index[i]);
1232  }
1233  }
1234 
1235  mtr_start(&mtr);
1236 
1237  /* Find the clustered index and create a persistent cursor
1238  based on that. */
1239 
1240  clust_index = dict_table_get_first_index(old_table);
1241 
1243  true, clust_index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
1244 
1245  if (old_table != new_table) {
1246  /* The table is being rebuilt. Identify the columns
1247  that were flagged NOT NULL in the new table, so that
1248  we can quickly check that the records in the old table
1249  do not violate the added NOT NULL constraints. */
1250 
1251  nonnull = static_cast<ulint*>(
1252  mem_alloc(dict_table_get_n_cols(new_table)
1253  * sizeof *nonnull));
1254 
1255  for (ulint i = 0; i < dict_table_get_n_cols(old_table); i++) {
1256  if (dict_table_get_nth_col(old_table, i)->prtype
1257  & DATA_NOT_NULL) {
1258  continue;
1259  }
1260 
1261  const ulint j = col_map[i];
1262 
1263  if (j == ULINT_UNDEFINED) {
1264  /* The column was dropped. */
1265  continue;
1266  }
1267 
1268  if (dict_table_get_nth_col(new_table, j)->prtype
1269  & DATA_NOT_NULL) {
1270  nonnull[n_nonnull++] = j;
1271  }
1272  }
1273 
1274  if (!n_nonnull) {
1275  mem_free(nonnull);
1276  nonnull = NULL;
1277  }
1278  }
1279 
1280  row_heap = mem_heap_create(sizeof(mrec_buf_t));
1281 
1282  /* Scan the clustered index. */
1283  for (;;) {
1284  const rec_t* rec;
1285  ulint* offsets;
1286  const dtuple_t* row;
1287  row_ext_t* ext;
1288  page_cur_t* cur = btr_pcur_get_page_cur(&pcur);
1289 
1290  page_cur_move_to_next(cur);
1291 
1292  if (page_cur_is_after_last(cur)) {
1293  if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
1294  err = DB_INTERRUPTED;
1295  trx->error_key_num = 0;
1296  goto func_exit;
1297  }
1298 
1299  if (online && old_table != new_table) {
1300  err = row_log_table_get_error(clust_index);
1301  if (err != DB_SUCCESS) {
1302  trx->error_key_num = 0;
1303  goto func_exit;
1304  }
1305  }
1306 #ifdef DBUG_OFF
1307 # define dbug_run_purge false
1308 #else /* DBUG_OFF */
1309  bool dbug_run_purge = false;
1310 #endif /* DBUG_OFF */
1311  DBUG_EXECUTE_IF(
1312  "ib_purge_on_create_index_page_switch",
1313  dbug_run_purge = true;);
1314 
1315  if (dbug_run_purge
1317  dict_index_get_lock(clust_index))) {
1318  /* There are waiters on the clustered
1319  index tree lock, likely the purge
1320  thread. Store and restore the cursor
1321  position, and yield so that scanning a
1322  large table will not starve other
1323  threads. */
1324 
1325  /* Store the cursor position on the last user
1326  record on the page. */
1328  /* Leaf pages must never be empty, unless
1329  this is the only page in the index tree. */
1332  btr_pcur_get_block(&pcur))
1333  == clust_index->page);
1334 
1335  btr_pcur_store_position(&pcur, &mtr);
1336  mtr_commit(&mtr);
1337 
1338  if (dbug_run_purge) {
1339  /* This is for testing
1340  purposes only (see
1341  DBUG_EXECUTE_IF above). We
1342  signal the purge thread and
1343  hope that the purge batch will
1344  complete before we execute
1345  btr_pcur_restore_position(). */
1346  trx_purge_run();
1347  os_thread_sleep(1000000);
1348  }
1349 
1350  /* Give the waiters a chance to proceed. */
1351  os_thread_yield();
1352 
1353  mtr_start(&mtr);
1354  /* Restore position on the record, or its
1355  predecessor if the record was purged
1356  meanwhile. */
1357  btr_pcur_restore_position(
1358  BTR_SEARCH_LEAF, &pcur, &mtr);
1359  /* Move to the successor of the
1360  original record. */
1362  &pcur, &mtr)) {
1363 end_of_index:
1364  row = NULL;
1365  mtr_commit(&mtr);
1366  mem_heap_free(row_heap);
1367  if (nonnull) {
1368  mem_free(nonnull);
1369  }
1370  goto write_buffers;
1371  }
1372  } else {
1373  ulint next_page_no;
1374  buf_block_t* block;
1375 
1376  next_page_no = btr_page_get_next(
1377  page_cur_get_page(cur), &mtr);
1378 
1379  if (next_page_no == FIL_NULL) {
1380  goto end_of_index;
1381  }
1382 
1383  block = page_cur_get_block(cur);
1384  block = btr_block_get(
1385  buf_block_get_space(block),
1386  buf_block_get_zip_size(block),
1387  next_page_no, BTR_SEARCH_LEAF,
1388  clust_index, &mtr);
1389 
1390  btr_leaf_page_release(page_cur_get_block(cur),
1391  BTR_SEARCH_LEAF, &mtr);
1392  page_cur_set_before_first(block, cur);
1393  page_cur_move_to_next(cur);
1394 
1396  }
1397  }
1398 
1399  rec = page_cur_get_rec(cur);
1400 
1401  offsets = rec_get_offsets(rec, clust_index, NULL,
1402  ULINT_UNDEFINED, &row_heap);
1403 
1404  if (online) {
1405  /* Perform a REPEATABLE READ.
1406 
1407  When rebuilding the table online,
1408  row_log_table_apply() must not see a newer
1409  state of the table when applying the log.
1410  This is mainly to prevent false duplicate key
1411  errors, because the log will identify records
1412  by the PRIMARY KEY, and also to prevent unsafe
1413  BLOB access.
1414 
1415  When creating a secondary index online, this
1416  table scan must not see records that have only
1417  been inserted to the clustered index, but have
1418  not been written to the online_log of
1419  index[]. If we performed READ UNCOMMITTED, it
1420  could happen that the ADD INDEX reaches
1421  ONLINE_INDEX_COMPLETE state between the time
1422  the DML thread has updated the clustered index
1423  but has not yet accessed secondary index. */
1424  ut_ad(trx->read_view);
1425 
1426  if (!read_view_sees_trx_id(
1427  trx->read_view,
1429  rec, clust_index, offsets))) {
1430  rec_t* old_vers;
1431 
1433  rec, &mtr, clust_index, &offsets,
1434  trx->read_view, &row_heap,
1435  row_heap, &old_vers);
1436 
1437  rec = old_vers;
1438 
1439  if (!rec) {
1440  continue;
1441  }
1442  }
1443 
1445  rec,
1446  dict_table_is_comp(old_table))) {
1447  /* This record was deleted in the latest
1448  committed version, or it was deleted and
1449  then reinserted-by-update before purge
1450  kicked in. Skip it. */
1451  continue;
1452  }
1453 
1454  ut_ad(!rec_offs_any_null_extern(rec, offsets));
1455  } else if (rec_get_deleted_flag(
1456  rec, dict_table_is_comp(old_table))) {
1457  /* Skip delete-marked records.
1458 
1459  Skipping delete-marked records will make the
1460  created indexes unuseable for transactions
1461  whose read views were created before the index
1462  creation completed, but preserving the history
1463  would make it tricky to detect duplicate
1464  keys. */
1465  continue;
1466  }
1467 
1468  /* When !online, we are holding a lock on old_table, preventing
1469  any inserts that could have written a record 'stub' before
1470  writing out off-page columns. */
1471  ut_ad(!rec_offs_any_null_extern(rec, offsets));
1472 
1473  /* Build a row based on the clustered index. */
1474 
1475  row = row_build(ROW_COPY_POINTERS, clust_index,
1476  rec, offsets, new_table,
1477  add_cols, col_map, &ext, row_heap);
1478  ut_ad(row);
1479 
1480  for (ulint i = 0; i < n_nonnull; i++) {
1481  const dfield_t* field = &row->fields[nonnull[i]];
1482 
1483  ut_ad(dfield_get_type(field)->prtype & DATA_NOT_NULL);
1484 
1485  if (dfield_is_null(field)) {
1486  err = DB_INVALID_NULL;
1487  trx->error_key_num = 0;
1488  goto func_exit;
1489  }
1490  }
1491 
1492  /* Get the next Doc ID */
1493  if (add_doc_id) {
1494  doc_id++;
1495  } else {
1496  doc_id = 0;
1497  }
1498 
1499  if (add_autoinc != ULINT_UNDEFINED) {
1500 
1501  ut_ad(add_autoinc
1502  < dict_table_get_n_user_cols(new_table));
1503 
1504  const dfield_t* dfield;
1505 
1506  dfield = dtuple_get_nth_field(row, add_autoinc);
1507  if (dfield_is_null(dfield)) {
1508  goto write_buffers;
1509  }
1510 
1511  const dtype_t* dtype = dfield_get_type(dfield);
1512  byte* b = static_cast<byte*>(dfield_get_data(dfield));
1513 
1514  if (sequence.eof()) {
1515  err = DB_ERROR;
1516  trx->error_key_num = 0;
1517 
1518  ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
1519  ER_AUTOINC_READ_FAILED, "[NULL]");
1520 
1521  goto func_exit;
1522  }
1523 
1524  ulonglong value = sequence++;
1525 
1526  switch (dtype_get_mtype(dtype)) {
1527  case DATA_INT: {
1528  ibool usign;
1529  ulint len = dfield_get_len(dfield);
1530 
1531  usign = dtype_get_prtype(dtype) & DATA_UNSIGNED;
1532  mach_write_ulonglong(b, value, len, usign);
1533 
1534  break;
1535  }
1536 
1537  case DATA_FLOAT:
1539  b, static_cast<float>(value));
1540  break;
1541 
1542  case DATA_DOUBLE:
1544  b, static_cast<double>(value));
1545  break;
1546 
1547  default:
1548  ut_ad(0);
1549  }
1550  }
1551 
1552 write_buffers:
1553  /* Build all entries for all the indexes to be created
1554  in a single scan of the clustered index. */
1555 
1556  for (ulint i = 0; i < n_index; i++) {
1557  row_merge_buf_t* buf = merge_buf[i];
1558  merge_file_t* file = &files[i];
1559  ulint rows_added = 0;
1560 
1561  if (UNIV_LIKELY
1562  (row && (rows_added = row_merge_buf_add(
1563  buf, fts_index, old_table,
1564  psort_info, row, ext, &doc_id)))) {
1565 
1566  /* If we are creating FTS index,
1567  a single row can generate more
1568  records for tokenized word */
1569  file->n_rec += rows_added;
1570  if (doc_id > max_doc_id) {
1571  max_doc_id = doc_id;
1572  }
1573 
1574  continue;
1575  }
1576 
1577  if ((buf->index->type & DICT_FTS)
1578  && (!row || !doc_id)) {
1579  continue;
1580  }
1581 
1582  /* The buffer must be sufficiently large
1583  to hold at least one record. It may only
1584  be empty when we reach the end of the
1585  clustered index. row_merge_buf_add()
1586  must not have been called in this loop. */
1587  ut_ad(buf->n_tuples || row == NULL);
1588 
1589  /* We have enough data tuples to form a block.
1590  Sort them and write to disk. */
1591 
1592  if (buf->n_tuples) {
1593  if (dict_index_is_unique(buf->index)) {
1594  row_merge_dup_t dup = {
1595  buf->index, table, col_map, 0};
1596 
1597  row_merge_buf_sort(buf, &dup);
1598 
1599  if (dup.n_dup) {
1600  err = DB_DUPLICATE_KEY;
1601  trx->error_key_num
1602  = key_numbers[i];
1603  break;
1604  }
1605  } else {
1606  row_merge_buf_sort(buf, NULL);
1607  }
1608  } else if (online && new_table == old_table) {
1609  /* Note the newest transaction that
1610  modified this index when the scan was
1611  completed. We prevent older readers
1612  from accessing this index, to ensure
1613  read consistency. */
1614 
1615  trx_id_t max_trx_id;
1616 
1617  ut_a(row == NULL);
1618  rw_lock_x_lock(
1619  dict_index_get_lock(buf->index));
1622 
1623  max_trx_id = row_log_get_max_trx(buf->index);
1624 
1625  if (max_trx_id > buf->index->trx_id) {
1626  buf->index->trx_id = max_trx_id;
1627  }
1628 
1629  rw_lock_x_unlock(
1630  dict_index_get_lock(buf->index));
1631  }
1632 
1633  row_merge_buf_write(buf, file, block);
1634 
1635  if (!row_merge_write(file->fd, file->offset++,
1636  block)) {
1637  err = DB_OUT_OF_FILE_SPACE;
1638  trx->error_key_num = i;
1639  break;
1640  }
1641 
1642  UNIV_MEM_INVALID(&block[0], srv_sort_buf_size);
1643  merge_buf[i] = row_merge_buf_empty(buf);
1644 
1645  if (UNIV_LIKELY(row != NULL)) {
1646  /* Try writing the record again, now
1647  that the buffer has been written out
1648  and emptied. */
1649 
1650  if (UNIV_UNLIKELY
1651  (!(rows_added = row_merge_buf_add(
1652  buf, fts_index, old_table,
1653  psort_info, row, ext,
1654  &doc_id)))) {
1655  /* An empty buffer should have enough
1656  room for at least one record. */
1657  ut_error;
1658  }
1659 
1660  file->n_rec += rows_added;
1661  }
1662  }
1663 
1664  if (row == NULL) {
1665  goto all_done;
1666  }
1667 
1668  if (err != DB_SUCCESS) {
1669  goto func_exit;
1670  }
1671 
1672  mem_heap_empty(row_heap);
1673  }
1674 
1675 func_exit:
1676  mtr_commit(&mtr);
1677  mem_heap_free(row_heap);
1678 
1679  if (nonnull) {
1680  mem_free(nonnull);
1681  }
1682 
1683 all_done:
1684 #ifdef FTS_INTERNAL_DIAG_PRINT
1685  DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Scan Table\n");
1686 #endif
1687  if (fts_pll_sort) {
1688  bool all_exit = false;
1689  ulint trial_count = 0;
1690  const ulint max_trial_count = 10000;
1691 
1692  /* Tell all children that parent has done scanning */
1693  for (ulint i = 0; i < fts_sort_pll_degree; i++) {
1694  psort_info[i].state = FTS_PARENT_COMPLETE;
1695  }
1696 wait_again:
1697  /* Now wait all children to report back to be completed */
1698  os_event_wait_time_low(fts_parallel_sort_event,
1699  1000000, sig_count);
1700 
1701  for (ulint i = 0; i < fts_sort_pll_degree; i++) {
1702  if (psort_info[i].child_status != FTS_CHILD_COMPLETE
1703  && psort_info[i].child_status != FTS_CHILD_EXITING) {
1704  sig_count = os_event_reset(
1705  fts_parallel_sort_event);
1706  goto wait_again;
1707  }
1708  }
1709 
1710  /* Now all children should complete, wait a bit until
1711  they all finish setting the event, before we free everything.
1712  This has a 10 second timeout */
1713  do {
1714  all_exit = true;
1715 
1716  for (ulint j = 0; j < fts_sort_pll_degree; j++) {
1717  if (psort_info[j].child_status
1718  != FTS_CHILD_EXITING) {
1719  all_exit = false;
1720  os_thread_sleep(1000);
1721  break;
1722  }
1723  }
1724  trial_count++;
1725  } while (!all_exit && trial_count < max_trial_count);
1726 
1727  if (!all_exit) {
1728  ut_ad(0);
1729  ib_logf(IB_LOG_LEVEL_FATAL,
1730  "Not all child sort threads exited"
1731  " when creating FTS index '%s'",
1732  fts_sort_idx->name);
1733  }
1734  }
1735 
1736 #ifdef FTS_INTERNAL_DIAG_PRINT
1737  DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Tokenization\n");
1738 #endif
1739  for (ulint i = 0; i < n_index; i++) {
1740  row_merge_buf_free(merge_buf[i]);
1741  }
1742 
1743  row_fts_free_pll_merge_buf(psort_info);
1744 
1745  mem_free(merge_buf);
1746 
1747  btr_pcur_close(&pcur);
1748 
1749  /* Update the next Doc ID we used. Table should be locked, so
1750  no concurrent DML */
1751  if (max_doc_id) {
1753  0, new_table, old_table->name, max_doc_id);
1754  }
1755 
1756  trx->op_info = "";
1757 
1758  DBUG_RETURN(err);
1759 }
1760 
/** Write mrec##N to the output file and advance input stream N.
Writes the current merge record of stream N (0 or 1) to the output file
"of", then reads the next record of that stream into mrec##N/offsets##N.
Expands to references b0/b1/b2, buf[], block[], file, of, foffs##N from
the invoking function, and jumps to a "corrupt" label (which must exist
there) when the write fails, when more records than file->n_rec are
produced, or when a partial record is found at the end of input.
AT_END is executed when stream N is exhausted cleanly. */
#define ROW_MERGE_WRITE_GET_NEXT(N, INDEX, AT_END)			\
	do {								\
		b2 = row_merge_write_rec(&block[2 * srv_sort_buf_size],	\
					 &buf[2], b2,			\
					 of->fd, &of->offset,		\
					 mrec##N, offsets##N);		\
		if (UNIV_UNLIKELY(!b2 || ++of->n_rec > file->n_rec)) {	\
			goto corrupt;					\
		}							\
		b##N = row_merge_read_rec(&block[N * srv_sort_buf_size],\
					  &buf[N], b##N, INDEX,		\
					  file->fd, foffs##N,		\
					  &mrec##N, offsets##N);	\
		if (UNIV_UNLIKELY(!b##N)) {				\
			if (mrec##N) {					\
				goto corrupt;				\
			}						\
			AT_END;						\
		}							\
	} while (0)
1785 
/*************************************************************//**
Merge two sorted runs of index entries from "file" into one longer
sorted run in the output file "of".  Returns DB_DUPLICATE_KEY as soon
as two equal keys are seen (the index being built is unique when this
function is used via row_merge()).
@return	DB_SUCCESS or error code */
static __attribute__((nonnull, warn_unused_result))
dberr_t
row_merge_blocks(
/*=============*/
	const row_merge_dup_t*	dup,	/*!< in: descriptor of the
					index being created */
	const merge_file_t*	file,	/*!< in: file containing
					index entries */
	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
	ulint*			foffs0,	/*!< in/out: offset of first
					source list in the file */
	ulint*			foffs1,	/*!< in/out: offset of second
					source list in the file */
	merge_file_t*		of)	/*!< in/out: output file */
{
	mem_heap_t*	heap;	/*!< memory heap for offsets0, offsets1 */

	mrec_buf_t*	buf;	/*!< buffer for handling
				split mrec in block[] */
	const byte*	b0;	/*!< pointer to block[0] */
	const byte*	b1;	/*!< pointer to block[srv_sort_buf_size] */
	byte*		b2;	/*!< pointer to block[2 * srv_sort_buf_size] */
	const mrec_t*	mrec0;	/*!< merge rec, points to block[0] or buf[0] */
	const mrec_t*	mrec1;	/*!< merge rec, points to
				block[srv_sort_buf_size] or buf[1] */
	ulint*		offsets0;/* offsets of mrec0 */
	ulint*		offsets1;/* offsets of mrec1 */

#ifdef UNIV_DEBUG
	if (row_merge_print_block) {
		fprintf(stderr,
			"row_merge_blocks fd=%d ofs=%lu + fd=%d ofs=%lu"
			" = fd=%d ofs=%lu\n",
			file->fd, (ulong) *foffs0,
			file->fd, (ulong) *foffs1,
			of->fd, (ulong) of->offset);
	}
#endif /* UNIV_DEBUG */

	heap = row_merge_heap_create(dup->index, &buf, &offsets0, &offsets1);

	/* Write a record and read the next record. Split the output
	file in two halves, which can be merged on the following pass. */

	if (!row_merge_read(file->fd, *foffs0, &block[0])
	    || !row_merge_read(file->fd, *foffs1, &block[srv_sort_buf_size])) {
corrupt:
		mem_heap_free(heap);
		return(DB_CORRUPTION);
	}

	b0 = &block[0];
	b1 = &block[srv_sort_buf_size];
	b2 = &block[2 * srv_sort_buf_size];

	/* Prime both input streams with their first record. */
	b0 = row_merge_read_rec(
		&block[0], &buf[0], b0, dup->index,
		file->fd, foffs0, &mrec0, offsets0);
	/* NOTE(review): "&buf[srv_sort_buf_size]" looks suspicious here:
	buf[] holds 3 mrec_buf_t elements (cf. &buf[0], &buf[2] and
	&buf[N] in ROW_MERGE_WRITE_GET_NEXT), so one would expect
	&buf[1] for stream 1 — verify against the upstream source. */
	b1 = row_merge_read_rec(
		&block[srv_sort_buf_size],
		&buf[srv_sort_buf_size], b1, dup->index,
		file->fd, foffs1, &mrec1, offsets1);
	if (UNIV_UNLIKELY(!b0 && mrec0)
	    || UNIV_UNLIKELY(!b1 && mrec1)) {

		goto corrupt;
	}

	/* Standard two-way merge: emit the smaller head record and
	advance that stream, until one stream runs dry. */
	while (mrec0 && mrec1) {
		switch (cmp_rec_rec_simple(
				mrec0, mrec1, offsets0, offsets1,
				dup->index, dup->table)) {
		case 0:
			/* Equal keys: duplicate in a unique index. */
			mem_heap_free(heap);
			return(DB_DUPLICATE_KEY);
		case -1:
			ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto merged);
			break;
		case 1:
			ROW_MERGE_WRITE_GET_NEXT(1, dup->index, goto merged);
			break;
		default:
			ut_error;
		}
	}

merged:
	if (mrec0) {
		/* append all mrec0 to output */
		for (;;) {
			ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto done0);
		}
	}
done0:
	if (mrec1) {
		/* append all mrec1 to output */
		for (;;) {
			ROW_MERGE_WRITE_GET_NEXT(1, dup->index, goto done1);
		}
	}
done1:

	mem_heap_free(heap);
	/* Terminate the output run with an end-of-run marker. */
	b2 = row_merge_write_eof(&block[2 * srv_sort_buf_size],
				 b2, of->fd, &of->offset);
	return(b2 ? DB_SUCCESS : DB_CORRUPTION);
}
1896 
1897 /*************************************************************/
1900 static __attribute__((nonnull, warn_unused_result))
1901 ibool
1902 row_merge_blocks_copy(
1903 /*==================*/
1904  const dict_index_t* index,
1905  const merge_file_t* file,
1906  row_merge_block_t* block,
1907  ulint* foffs0,
1908  merge_file_t* of)
1909 {
1910  mem_heap_t* heap;
1912  mrec_buf_t* buf;
1914  const byte* b0;
1915  byte* b2;
1916  const mrec_t* mrec0;
1917  ulint* offsets0;/* offsets of mrec0 */
1918  ulint* offsets1;/* dummy offsets */
1919 
1920 #ifdef UNIV_DEBUG
1921  if (row_merge_print_block) {
1922  fprintf(stderr,
1923  "row_merge_blocks_copy fd=%d ofs=%lu"
1924  " = fd=%d ofs=%lu\n",
1925  file->fd, (ulong) foffs0,
1926  of->fd, (ulong) of->offset);
1927  }
1928 #endif /* UNIV_DEBUG */
1929 
1930  heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
1931 
1932  /* Write a record and read the next record. Split the output
1933  file in two halves, which can be merged on the following pass. */
1934 
1935  if (!row_merge_read(file->fd, *foffs0, &block[0])) {
1936 corrupt:
1937  mem_heap_free(heap);
1938  return(FALSE);
1939  }
1940 
1941  b0 = &block[0];
1942 
1943  b2 = &block[2 * srv_sort_buf_size];
1944 
1945  b0 = row_merge_read_rec(&block[0], &buf[0], b0, index,
1946  file->fd, foffs0, &mrec0, offsets0);
1947  if (UNIV_UNLIKELY(!b0 && mrec0)) {
1948 
1949  goto corrupt;
1950  }
1951 
1952  if (mrec0) {
1953  /* append all mrec0 to output */
1954  for (;;) {
1955  ROW_MERGE_WRITE_GET_NEXT(0, index, goto done0);
1956  }
1957  }
1958 done0:
1959 
1960  /* The file offset points to the beginning of the last page
1961  that has been read. Update it to point to the next block. */
1962  (*foffs0)++;
1963 
1964  mem_heap_free(heap);
1965  return(row_merge_write_eof(&block[2 * srv_sort_buf_size],
1966  b2, of->fd, &of->offset)
1967  != NULL);
1968 }
1969 
/*************************************************************//**
Merge disk files.  Performs one merge pass: pairs up the runs recorded
in run_offset[] (first half of the input file against the second half),
merges each pair into the output file, and copies any unpaired runs.
On success, swaps the output file into *file and records the new runs'
starting offsets back into run_offset[].
@return	DB_SUCCESS or error code */
static __attribute__((nonnull))
dberr_t
row_merge(
/*======*/
	trx_t*			trx,	/*!< in: transaction */
	const row_merge_dup_t*	dup,	/*!< in: descriptor of the
					index being created */
	merge_file_t*		file,	/*!< in/out: file containing
					index entries */
	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
	int*			tmpfd,	/*!< in/out: temporary file handle */
	ulint*			num_run,/*!< in/out: number of runs that
					remain to be merged */
	ulint*			run_offset) /*!< in/out: array of first
					offset numbers for each run */
{
	ulint		foffs0;	/*!< first input offset */
	ulint		foffs1;	/*!< second input offset */
	dberr_t		error;	/*!< error code */
	merge_file_t	of;	/*!< output file */
	const ulint	ihalf	= run_offset[*num_run / 2];
				/*!< half the input file */
	ulint		n_run	= 0;
				/*!< number of runs generated by this pass */

	UNIV_MEM_ASSERT_W(&block[0], 3 * srv_sort_buf_size);

	ut_ad(ihalf < file->offset);

	of.fd = *tmpfd;
	of.offset = 0;
	of.n_rec = 0;

#ifdef POSIX_FADV_SEQUENTIAL
	/* The input file will be read sequentially, starting from the
	beginning and the middle. In Linux, the POSIX_FADV_SEQUENTIAL
	affects the entire file. Each block will be read exactly once. */
	posix_fadvise(file->fd, 0, 0,
		      POSIX_FADV_SEQUENTIAL | POSIX_FADV_NOREUSE);
#endif /* POSIX_FADV_SEQUENTIAL */

	/* Merge blocks to the output file. */
	foffs0 = 0;
	foffs1 = ihalf;

	/* run_offset[] is rewritten below with this pass's offsets. */
	UNIV_MEM_INVALID(run_offset, *num_run * sizeof *run_offset);

	for (; foffs0 < ihalf && foffs1 < file->offset; foffs0++, foffs1++) {

		if (trx_is_interrupted(trx)) {
			return(DB_INTERRUPTED);
		}

		/* Remember the offset number for this run */
		run_offset[n_run++] = of.offset;

		error = row_merge_blocks(dup, file, block,
					 &foffs0, &foffs1, &of);

		if (error != DB_SUCCESS) {
			return(error);
		}

	}

	/* Copy the last blocks, if there are any. */

	while (foffs0 < ihalf) {
		if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
			return(DB_INTERRUPTED);
		}

		/* Remember the offset number for this run */
		run_offset[n_run++] = of.offset;

		if (!row_merge_blocks_copy(dup->index, file, block,
					   &foffs0, &of)) {
			return(DB_CORRUPTION);
		}
	}

	ut_ad(foffs0 == ihalf);

	while (foffs1 < file->offset) {
		if (trx_is_interrupted(trx)) {
			return(DB_INTERRUPTED);
		}

		/* Remember the offset number for this run */
		run_offset[n_run++] = of.offset;

		if (!row_merge_blocks_copy(dup->index, file, block,
					   &foffs1, &of)) {
			return(DB_CORRUPTION);
		}
	}

	ut_ad(foffs1 == file->offset);

	/* Every input record must have been written exactly once. */
	if (UNIV_UNLIKELY(of.n_rec != file->n_rec)) {
		return(DB_CORRUPTION);
	}

	ut_ad(n_run <= *num_run);

	*num_run = n_run;

	/* Each run can contain one or more offsets. As merge goes on,
	the number of runs (to merge) will reduce until we have one
	single run. So the number of runs will always be smaller than
	the number of offsets in file */
	ut_ad((*num_run) <= file->offset);

	/* The number of offsets in output file is always equal or
	smaller than input file */
	ut_ad(of.offset <= file->offset);

	/* Swap file descriptors for the next pass. */
	*tmpfd = file->fd;
	*file = of;

	UNIV_MEM_INVALID(&block[0], 3 * srv_sort_buf_size);

	return(DB_SUCCESS);
}
2099 
2100 /*************************************************************/
2103 UNIV_INTERN
2104 dberr_t
2106 /*===========*/
2107  trx_t* trx,
2108  const row_merge_dup_t* dup,
2110  merge_file_t* file,
2112  row_merge_block_t* block,
2113  int* tmpfd)
2114 {
2115  const ulint half = file->offset / 2;
2116  ulint num_runs;
2117  ulint* run_offset;
2118  dberr_t error = DB_SUCCESS;
2119  DBUG_ENTER("row_merge_sort");
2120 
2121  /* Record the number of merge runs we need to perform */
2122  num_runs = file->offset;
2123 
2124  /* If num_runs are less than 1, nothing to merge */
2125  if (num_runs <= 1) {
2126  DBUG_RETURN(error);
2127  }
2128 
2129  /* "run_offset" records each run's first offset number */
2130  run_offset = (ulint*) mem_alloc(file->offset * sizeof(ulint));
2131 
2132  /* This tells row_merge() where to start for the first round
2133  of merge. */
2134  run_offset[half] = half;
2135 
2136  /* The file should always contain at least one byte (the end
2137  of file marker). Thus, it must be at least one block. */
2138  ut_ad(file->offset > 0);
2139 
2140  /* Merge the runs until we have one big run */
2141  do {
2142  error = row_merge(trx, dup, file, block, tmpfd,
2143  &num_runs, run_offset);
2144 
2145  if (error != DB_SUCCESS) {
2146  break;
2147  }
2148 
2149  UNIV_MEM_ASSERT_RW(run_offset, num_runs * sizeof *run_offset);
2150  } while (num_runs > 1);
2151 
2152  mem_free(run_offset);
2153 
2154  DBUG_RETURN(error);
2155 }
2156 
2157 /*************************************************************/
2159 static __attribute__((nonnull))
2160 void
2161 row_merge_copy_blobs(
2162 /*=================*/
2163  const mrec_t* mrec,
2164  const ulint* offsets,
2165  ulint zip_size,
2166  dtuple_t* tuple,
2167  mem_heap_t* heap)
2168 {
2169  ut_ad(rec_offs_any_extern(offsets));
2170 
2171  for (ulint i = 0; i < dtuple_get_n_fields(tuple); i++) {
2172  ulint len;
2173  const void* data;
2174  dfield_t* field = dtuple_get_nth_field(tuple, i);
2175 
2176  if (!dfield_is_ext(field)) {
2177  continue;
2178  }
2179 
2180  ut_ad(!dfield_is_null(field));
2181 
2182  /* During the creation of a PRIMARY KEY, the table is
2183  X-locked, and we skip copying records that have been
2184  marked for deletion. Therefore, externally stored
2185  columns cannot possibly be freed between the time the
2186  BLOB pointers are read (row_merge_read_clustered_index())
2187  and dereferenced (below). */
2189  mrec, offsets, zip_size, i, &len, heap);
2190  /* Because we have locked the table, any records
2191  written by incomplete transactions must have been
2192  rolled back already. There must not be any incomplete
2193  BLOB columns. */
2194  ut_a(data);
2195 
2196  dfield_set_data(field, data, len);
2197  }
2198 }
2199 
2200 /********************************************************************/
2204 static __attribute__((nonnull, warn_unused_result))
2205 dberr_t
2206 row_merge_insert_index_tuples(
2207 /*==========================*/
2208  trx_id_t trx_id,
2209  dict_index_t* index,
2210  const dict_table_t* old_table,
2211  int fd,
2212  row_merge_block_t* block)
2213 {
2214  const byte* b;
2215  mem_heap_t* heap;
2216  mem_heap_t* tuple_heap;
2217  mem_heap_t* ins_heap;
2218  dberr_t error = DB_SUCCESS;
2219  ulint foffs = 0;
2220  ulint* offsets;
2221  mrec_buf_t* buf;
2222  DBUG_ENTER("row_merge_insert_index_tuples");
2223 
2225  ut_ad(!(index->type & DICT_FTS));
2226  ut_ad(trx_id);
2227 
2228  tuple_heap = mem_heap_create(1000);
2229 
2230  {
2231  ulint i = 1 + REC_OFFS_HEADER_SIZE
2232  + dict_index_get_n_fields(index);
2233  heap = mem_heap_create(sizeof *buf + i * sizeof *offsets);
2234  ins_heap = mem_heap_create(sizeof *buf + i * sizeof *offsets);
2235  offsets = static_cast<ulint*>(
2236  mem_heap_alloc(heap, i * sizeof *offsets));
2237  offsets[0] = i;
2238  offsets[1] = dict_index_get_n_fields(index);
2239  }
2240 
2241  b = block;
2242 
2243  if (!row_merge_read(fd, foffs, block)) {
2244  error = DB_CORRUPTION;
2245  } else {
2246  buf = static_cast<mrec_buf_t*>(
2247  mem_heap_alloc(heap, sizeof *buf));
2248 
2249  for (;;) {
2250  const mrec_t* mrec;
2251  dtuple_t* dtuple;
2252  ulint n_ext;
2253  big_rec_t* big_rec;
2254  rec_t* rec;
2255  btr_cur_t cursor;
2256  mtr_t mtr;
2257 
2258  b = row_merge_read_rec(block, buf, b, index,
2259  fd, &foffs, &mrec, offsets);
2260  if (UNIV_UNLIKELY(!b)) {
2261  /* End of list, or I/O error */
2262  if (mrec) {
2263  error = DB_CORRUPTION;
2264  }
2265  break;
2266  }
2267 
2268  dict_index_t* old_index
2269  = dict_table_get_first_index(old_table);
2270 
2271  if (dict_index_is_clust(index)
2272  && dict_index_is_online_ddl(old_index)) {
2273  error = row_log_table_get_error(old_index);
2274  if (error != DB_SUCCESS) {
2275  break;
2276  }
2277  }
2278 
2279  dtuple = row_rec_to_index_entry_low(
2280  mrec, index, offsets, &n_ext, tuple_heap);
2281 
2282  if (!n_ext) {
2283  /* There are no externally stored columns. */
2284  } else {
2285  ut_ad(dict_index_is_clust(index));
2286  /* Off-page columns can be fetched safely
2287  when concurrent modifications to the table
2288  are disabled. (Purge can process delete-marked
2289  records, but row_merge_read_clustered_index()
2290  would have skipped them.)
2291 
2292  When concurrent modifications are enabled,
2293  row_merge_read_clustered_index() will
2294  only see rows from transactions that were
2295  committed before the ALTER TABLE started
2296  (REPEATABLE READ).
2297 
2298  Any modifications after the
2299  row_merge_read_clustered_index() scan
2300  will go through row_log_table_apply().
2301  Any modifications to off-page columns
2302  will be tracked by
2303  row_log_table_blob_alloc() and
2304  row_log_table_blob_free(). */
2305  row_merge_copy_blobs(
2306  mrec, offsets,
2307  dict_table_zip_size(old_table),
2308  dtuple, tuple_heap);
2309  }
2310 
2311  ut_ad(dtuple_validate(dtuple));
2312  log_free_check();
2313 
2314  mtr_start(&mtr);
2315  /* Insert after the last user record. */
2316  btr_cur_open_at_index_side(
2317  false, index, BTR_MODIFY_LEAF,
2318  &cursor, 0, &mtr);
2320  page_rec_get_prev(btr_cur_get_rec(&cursor)),
2321  btr_cur_get_block(&cursor),
2322  btr_cur_get_page_cur(&cursor));
2323  cursor.flag = BTR_CUR_BINARY;
2324 #ifdef UNIV_DEBUG
2325  /* Check that the records are inserted in order. */
2326  rec = btr_cur_get_rec(&cursor);
2327 
2328  if (!page_rec_is_infimum(rec)) {
2329  ulint* rec_offsets = rec_get_offsets(
2330  rec, index, offsets,
2331  ULINT_UNDEFINED, &tuple_heap);
2332  ut_ad(cmp_dtuple_rec(dtuple, rec, rec_offsets)
2333  > 0);
2334  }
2335 #endif /* UNIV_DEBUG */
2336  ulint* ins_offsets = NULL;
2337 
2338  error = btr_cur_optimistic_insert(
2341  &cursor, &ins_offsets, &ins_heap,
2342  dtuple, &rec, &big_rec, 0, NULL, &mtr);
2343 
2344  if (error == DB_FAIL) {
2345  ut_ad(!big_rec);
2346  mtr_commit(&mtr);
2347  mtr_start(&mtr);
2348  btr_cur_open_at_index_side(
2349  false, index, BTR_MODIFY_TREE,
2350  &cursor, 0, &mtr);
2352  page_rec_get_prev(btr_cur_get_rec(
2353  &cursor)),
2354  btr_cur_get_block(&cursor),
2355  btr_cur_get_page_cur(&cursor));
2356 
2361  &cursor, &ins_offsets, &ins_heap,
2362  dtuple, &rec, &big_rec, 0, NULL, &mtr);
2363  }
2364 
2365  if (!dict_index_is_clust(index)) {
2367  btr_cur_get_block(&cursor),
2368  btr_cur_get_page_zip(&cursor),
2369  trx_id, &mtr);
2370  }
2371 
2372  mtr_commit(&mtr);
2373 
2374  if (UNIV_LIKELY_NULL(big_rec)) {
2375  /* If the system crashes at this
2376  point, the clustered index record will
2377  contain a null BLOB pointer. This
2378  should not matter, because the copied
2379  table will be dropped on crash
2380  recovery anyway. */
2381 
2382  ut_ad(dict_index_is_clust(index));
2383  ut_ad(error == DB_SUCCESS);
2384  error = row_ins_index_entry_big_rec(
2385  dtuple, big_rec,
2386  ins_offsets, &ins_heap,
2387  index, NULL, __FILE__, __LINE__);
2389  index, dtuple, big_rec);
2390  }
2391 
2392  if (error != DB_SUCCESS) {
2393  goto err_exit;
2394  }
2395 
2396  mem_heap_empty(tuple_heap);
2397  mem_heap_empty(ins_heap);
2398  }
2399  }
2400 
2401 err_exit:
2402  mem_heap_free(tuple_heap);
2403  mem_heap_free(ins_heap);
2404  mem_heap_free(heap);
2405 
2406  DBUG_RETURN(error);
2407 }
2408 
2409 /*********************************************************************/
/* Acquire a table lock (LOCK_S or LOCK_X) for a transaction while an
index is being created or dropped.  A dummy SELECT query graph is built
so that the lock module can be invoked; lock waits are handled by
retrying via the run_again label.  Returns the final lock_table() error
status.
NOTE(review): the function-name line (orig. line 2414) was dropped by
the doc extraction; per MySQL row0merge.cc this is
row_merge_lock_table() — confirm against the original source.  Several
other statement lines (orig. 2440, 2443, 2454, 2456) were also dropped
as they were hyperlinks in the doc page. */
2412 UNIV_INTERN
2413 dberr_t
2415 /*=================*/
2416  trx_t* trx,
2417  dict_table_t* table,
2418  enum lock_mode mode)
2419 {
2420  mem_heap_t* heap;
2421  que_thr_t* thr;
2422  dberr_t err;
2423  sel_node_t* node;
2424 
 /* Only table-level shared or exclusive locks are expected here. */
2426  ut_ad(mode == LOCK_X || mode == LOCK_S);
2427 
2428  heap = mem_heap_create(512);
2429 
2430  trx->op_info = "setting table lock for creating or dropping index";
2431 
2432  node = sel_node_create(heap);
2433  thr = pars_complete_graph_for_exec(node, trx, heap);
2434  thr->graph->state = QUE_FORK_ACTIVE;
2435 
2436  /* We use the select query graph as the dummy graph needed
2437  in the lock module call */
2438 
 /* NOTE(review): the call wrapped here (orig. line 2440, presumably
 que_fork_start_command) was dropped by the extraction. */
2439  thr = static_cast<que_thr_t*>(
2441  static_cast<que_fork_t*>(que_node_get_parent(thr))));
2442 
2444 
2445 run_again:
2446  thr->run_node = thr;
2447  thr->prev_node = thr->common.parent;
2448 
2449  err = lock_table(0, table, mode, thr);
2450 
2451  trx->error_state = err;
2452 
2453  if (UNIV_LIKELY(err == DB_SUCCESS)) {
2455  } else {
2457 
 /* A suspended query thread means the lock wait was noted but the
 thread was not runnable; restart the fork command and retry. */
2458  if (err != DB_QUE_THR_SUSPENDED) {
2459  bool was_lock_wait;
2460 
2461  was_lock_wait = row_mysql_handle_errors(
2462  &err, trx, thr, NULL);
2463 
2464  if (was_lock_wait) {
2465  goto run_again;
2466  }
2467  } else {
2468  que_thr_t* run_thr;
2469  que_node_t* parent;
2470 
2471  parent = que_node_get_parent(thr);
2472 
2473  run_thr = que_fork_start_command(
2474  static_cast<que_fork_t*>(parent));
2475 
2476  ut_a(run_thr == thr);
2477 
2478  /* There was a lock wait but the thread was not
2479  in a ready to run or running state. */
2480  trx->error_state = DB_LOCK_WAIT;
2481 
2482  goto run_again;
2483  }
2484  }
2485 
2486  que_graph_free(thr->graph);
2487  trx->op_info = "";
2488 
2489  return(err);
2490 }
2491 
2492 /*********************************************************************/
/* Remove a single index (its SYS_FIELDS rows and its SYS_INDEXES row)
from the InnoDB data dictionary tables, using the internal SQL parser.
Caller must hold dict_sys->mutex and the dict operation X-latch.
Errors (e.g. DB_TOO_MANY_CONCURRENT_TRXS) are reported to stderr and
the trx error state is reset rather than propagated. */
2496 static
2497 void
2498 row_merge_drop_index_dict(
2499 /*======================*/
2500  trx_t* trx,
2501  index_id_t index_id)
2502 {
2503  static const char sql[] =
2504  "PROCEDURE DROP_INDEX_PROC () IS\n"
2505  "BEGIN\n"
2506  "DELETE FROM SYS_FIELDS WHERE INDEX_ID=:indexid;\n"
2507  "DELETE FROM SYS_INDEXES WHERE ID=:indexid;\n"
2508  "END;\n";
2509  dberr_t error;
2510  pars_info_t* info;
2511 
 /* NOTE(review): an assertion line (orig. 2512) was dropped by the
 extraction here. */
2513  ut_ad(mutex_own(&dict_sys->mutex));
2514  ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
2516 #ifdef UNIV_SYNC_DEBUG
2517  ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
2518 #endif /* UNIV_SYNC_DEBUG */
2519 
2520  info = pars_info_create();
2521  pars_info_add_ull_literal(info, "indexid", index_id);
2522  trx->op_info = "dropping index from dictionary";
2523  error = que_eval_sql(info, sql, FALSE, trx);
2524 
2525  if (error != DB_SUCCESS) {
2526  /* Even though we ensure that DDL transactions are WAIT
2527  and DEADLOCK free, we could encounter other errors e.g.,
2528  DB_TOO_MANY_CONCURRENT_TRXS. */
2529  trx->error_state = DB_SUCCESS;
2530 
2531  ut_print_timestamp(stderr);
2532  fprintf(stderr, " InnoDB: Error: row_merge_drop_index_dict "
2533  "failed with error code: %u.\n", (unsigned) error);
2534  }
2535 
2536  trx->op_info = "";
2537 }
2538 
2539 /*********************************************************************/
/* Remove from the data dictionary all incomplete indexes of one table,
i.e. those whose name begins with TEMP_INDEX_PREFIX.  A cursor over
SYS_INDEXES selects the matching rows FOR UPDATE and deletes each
index's SYS_FIELDS rows and its SYS_INDEXES row.  Caller must hold
dict_sys->mutex and the dict operation X-latch.
NOTE(review): the function-name line (orig. 2545) was dropped by the
extraction; the error message below identifies this as
row_merge_drop_indexes_dict(). */
2543 UNIV_INTERN
2544 void
2546 /*========================*/
2547  trx_t* trx,
2548  table_id_t table_id)
2549 {
2550  static const char sql[] =
2551  "PROCEDURE DROP_INDEXES_PROC () IS\n"
2552  "ixid CHAR;\n"
2553  "found INT;\n"
2554 
2555  "DECLARE CURSOR index_cur IS\n"
2556  " SELECT ID FROM SYS_INDEXES\n"
2557  " WHERE TABLE_ID=:tableid AND\n"
2558  " SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n"
2559  "FOR UPDATE;\n"
2560 
2561  "BEGIN\n"
2562  "found := 1;\n"
2563  "OPEN index_cur;\n"
2564  "WHILE found = 1 LOOP\n"
2565  " FETCH index_cur INTO ixid;\n"
2566  " IF (SQL % NOTFOUND) THEN\n"
2567  " found := 0;\n"
2568  " ELSE\n"
2569  " DELETE FROM SYS_FIELDS WHERE INDEX_ID=ixid;\n"
2570  " DELETE FROM SYS_INDEXES WHERE CURRENT OF index_cur;\n"
2571  " END IF;\n"
2572  "END LOOP;\n"
2573  "CLOSE index_cur;\n"
2574 
2575  "END;\n";
2576  dberr_t error;
2577  pars_info_t* info;
2578 
 /* NOTE(review): an assertion line (orig. 2579) was dropped by the
 extraction here. */
2580  ut_ad(mutex_own(&dict_sys->mutex));
2581  ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
2583 #ifdef UNIV_SYNC_DEBUG
2584  ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
2585 #endif /* UNIV_SYNC_DEBUG */
2586 
2587  /* It is possible that table->n_ref_count > 1 when
2588  locked=TRUE. In this case, all code that should have an open
2589  handle to the table be waiting for the next statement to execute,
2590  or waiting for a meta-data lock.
2591 
2592  A concurrent purge will be prevented by dict_operation_lock. */
2593 
2594  info = pars_info_create();
2595  pars_info_add_ull_literal(info, "tableid", table_id);
2596  trx->op_info = "dropping indexes";
2597  error = que_eval_sql(info, sql, FALSE, trx);
2598 
2599  if (error != DB_SUCCESS) {
2600  /* Even though we ensure that DDL transactions are WAIT
2601  and DEADLOCK free, we could encounter other errors e.g.,
2602  DB_TOO_MANY_CONCURRENT_TRXS. */
2603  trx->error_state = DB_SUCCESS;
2604 
2605  ut_print_timestamp(stderr);
2606  fprintf(stderr, " InnoDB: Error: row_merge_drop_indexes_dict "
2607  "failed with error code: %u.\n", (unsigned) error);
2608  }
2609 
2610  trx->op_info = "";
2611 }
2612 
2613 /*********************************************************************/
/* Drop the not-yet-published (TEMP_INDEX_PREFIX) secondary indexes of a
table, both from the data dictionary and from the dictionary cache.
If other handles to the table are open (!locked && n_ref_count > 1) the
indexes cannot be removed immediately: they are instead marked aborted
and corrupted so other threads stop using them, and actual removal is
deferred (table->drop_aborted).  Caller must hold dict_sys->mutex and
the dict operation X-latch.
NOTE(review): the function-name line (orig. 2619) was dropped by the
extraction; per MySQL row0merge.cc this is row_merge_drop_indexes().
Several other lines (e.g. the declaration of 'index', a case label
around orig. 2664, the 'prev' computation at orig. 2679, and calls at
orig. 2705, 2711, 2736-2737) were dropped as well — confirm against
the original source before editing. */
2617 UNIV_INTERN
2618 void
2620 /*===================*/
2621  trx_t* trx,
2622  dict_table_t* table,
2623  ibool locked)
2625 {
2627  dict_index_t* next_index;
2628 
 /* NOTE(review): an assertion line (orig. 2629) was dropped here. */
2630  ut_ad(mutex_own(&dict_sys->mutex));
2631  ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
2633 #ifdef UNIV_SYNC_DEBUG
2634  ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
2635 #endif /* UNIV_SYNC_DEBUG */
2636 
2637  index = dict_table_get_first_index(table);
2638  ut_ad(dict_index_is_clust(index));
2640 
2641  /* the caller should have an open handle to the table */
2642  ut_ad(table->n_ref_count >= 1);
2643 
2644  /* It is possible that table->n_ref_count > 1 when
2645  locked=TRUE. In this case, all code that should have an open
2646  handle to the table be waiting for the next statement to execute,
2647  or waiting for a meta-data lock.
2648 
2649  A concurrent purge will be prevented by dict_operation_lock. */
2650 
2651  if (!locked && table->n_ref_count > 1) {
2652  /* We will have to drop the indexes later, when the
2653  table is guaranteed to be no longer in use. Mark the
2654  indexes as incomplete and corrupted, so that other
2655  threads will stop using them. Let dict_table_close()
2656  or crash recovery or the next invocation of
2657  prepare_inplace_alter_table() take care of dropping
2658  the indexes. */
2659 
2660  while ((index = dict_table_get_next_index(index)) != NULL) {
2661  ut_ad(!dict_index_is_clust(index));
2662 
 /* Dispatch on the online-creation state of each secondary index. */
2663  switch (dict_index_get_online_status(index)) {
2665  continue;
2666  case ONLINE_INDEX_COMPLETE:
2667  if (*index->name != TEMP_INDEX_PREFIX) {
2668  /* Do nothing to already
2669  published indexes. */
2670  } else if (index->type & DICT_FTS) {
2671  /* Drop a completed FULLTEXT
2672  index, due to a timeout during
2673  MDL upgrade for
2674  commit_inplace_alter_table().
2675  Because only concurrent reads
2676  are allowed (and they are not
2677  seeing this index yet) we
2678  are safe to drop the index. */
2680  indexes, index);
2681  /* At least there should be
2682  the clustered index before
2683  this one. */
2684  ut_ad(prev);
2685  ut_a(table->fts);
2686  fts_drop_index(table, index, trx);
2687  /* Since
2688  INNOBASE_SHARE::idx_trans_tbl
2689  is shared between all open
2690  ha_innobase handles to this
2691  table, no thread should be
2692  accessing this dict_index_t
2693  object. Also, we should be
2694  holding LOCK=SHARED MDL on the
2695  table even after the MDL
2696  upgrade timeout. */
2697 
2698  /* We can remove a DICT_FTS
2699  index from the cache, because
2700  we do not allow ADD FULLTEXT INDEX
2701  with LOCK=NONE. If we allowed that,
2702  we should exclude FTS entries from
2703  prebuilt->ins_node->entry_list
2704  in ins_node_create_entry_list(). */
2706  table, index);
2707  index = prev;
2708  } else {
 /* A completed but unpublished non-FTS index:
 mark it aborted and corrupted under the index
 X-latch so readers back off, and defer the drop. */
2709  rw_lock_x_lock(
2710  dict_index_get_lock(index));
2712  index, ONLINE_INDEX_ABORTED);
2713  index->type |= DICT_CORRUPT;
2714  table->drop_aborted = TRUE;
2715  goto drop_aborted;
2716  }
2717  continue;
2718  case ONLINE_INDEX_CREATION:
2719  rw_lock_x_lock(dict_index_get_lock(index));
2720  ut_ad(*index->name == TEMP_INDEX_PREFIX);
2721  row_log_abort_sec(index);
2722  drop_aborted:
2723  rw_lock_x_unlock(dict_index_get_lock(index));
2724 
2725  DEBUG_SYNC_C("merge_drop_index_after_abort");
2726  /* covered by dict_sys->mutex */
2727  MONITOR_INC(MONITOR_BACKGROUND_DROP_INDEX);
2728  /* fall through */
2729  case ONLINE_INDEX_ABORTED:
2730  /* Drop the index tree from the
2731  data dictionary and free it from
2732  the tablespace, but keep the object
2733  in the data dictionary cache. */
2734  row_merge_drop_index_dict(trx, index->id);
2735  rw_lock_x_lock(dict_index_get_lock(index));
2738  rw_lock_x_unlock(dict_index_get_lock(index));
2739  table->drop_aborted = TRUE;
2740  continue;
2741  }
2742  ut_error;
2743  }
2744 
2745  return;
2746  }
2747 
 /* No other handles are open: remove all temp indexes of the table
 from the dictionary in one pass, then purge them from the cache. */
2748  row_merge_drop_indexes_dict(trx, table->id);
2749 
2750  /* Invalidate all row_prebuilt_t::ins_graph that are referring
2751  to this table. That is, force row_get_prebuilt_insert_row() to
2752  rebuild prebuilt->ins_node->entry_list). */
2753  ut_ad(table->def_trx_id <= trx->id);
2754  table->def_trx_id = trx->id;
2755 
2756  next_index = dict_table_get_next_index(index);
2757 
2758  while ((index = next_index) != NULL) {
2759  /* read the next pointer before freeing the index */
2760  next_index = dict_table_get_next_index(index);
2761 
2762  ut_ad(!dict_index_is_clust(index));
2763 
2764  if (*index->name == TEMP_INDEX_PREFIX) {
2765  /* If it is FTS index, drop from table->fts
2766  and also drop its auxiliary tables */
2767  if (index->type & DICT_FTS) {
2768  ut_a(table->fts);
2769  fts_drop_index(table, index, trx);
2770  }
2771 
2772  switch (dict_index_get_online_status(index)) {
2773  case ONLINE_INDEX_CREATION:
2774  /* This state should only be possible
2775  when prepare_inplace_alter_table() fails
2776  after invoking row_merge_create_index().
2777  In inplace_alter_table(),
2778  row_merge_build_indexes()
2779  should never leave the index in this state.
2780  It would invoke row_log_abort_sec() on
2781  failure. */
2782  case ONLINE_INDEX_COMPLETE:
2783  /* In these cases, we are able to drop
2784  the index straight. The DROP INDEX was
2785  never deferred. */
2786  break;
2787  case ONLINE_INDEX_ABORTED:
2789  /* covered by dict_sys->mutex */
2790  MONITOR_DEC(MONITOR_BACKGROUND_DROP_INDEX);
2791  }
2792 
2793  dict_index_remove_from_cache(table, index);
2794  }
2795  }
2796 
2797  table->drop_aborted = FALSE;
2798  ut_d(dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE));
2799 }
2800 
2801 /*********************************************************************/
/* Drop all partially-created (TEMP_INDEX_PREFIX) indexes left over in
SYS_INDEXES, typically during crash recovery / server startup.  Runs a
cursor-based dictionary procedure in its own transaction while holding
the data dictionary latch, then commits.
NOTE(review): the transaction allocation line (orig. 2838, presumably
trx_allocate_for_background or similar), the dict-operation marking at
orig. 2844, and the cleanup lines orig. 2861-2862 were dropped by the
extraction — confirm against the original source.  The error message
below identifies this as row_merge_drop_temp_indexes(). */
2803 UNIV_INTERN
2804 void
2806 /*=============================*/
2807 {
2808  static const char sql[] =
2809  "PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n"
2810  "ixid CHAR;\n"
2811  "found INT;\n"
2812 
2813  "DECLARE CURSOR index_cur IS\n"
2814  " SELECT ID FROM SYS_INDEXES\n"
2815  " WHERE SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n"
2816  "FOR UPDATE;\n"
2817 
2818  "BEGIN\n"
2819  "found := 1;\n"
2820  "OPEN index_cur;\n"
2821  "WHILE found = 1 LOOP\n"
2822  " FETCH index_cur INTO ixid;\n"
2823  " IF (SQL % NOTFOUND) THEN\n"
2824  " found := 0;\n"
2825  " ELSE\n"
2826  " DELETE FROM SYS_FIELDS WHERE INDEX_ID=ixid;\n"
2827  " DELETE FROM SYS_INDEXES WHERE CURRENT OF index_cur;\n"
2828  " END IF;\n"
2829  "END LOOP;\n"
2830  "CLOSE index_cur;\n"
2831  "END;\n";
2832  trx_t* trx;
2833  dberr_t error;
2834 
2835  /* Load the table definitions that contain partially defined
2836  indexes, so that the data dictionary information can be checked
2837  when accessing the tablename.ibd files. */
2839  trx->op_info = "dropping partially created indexes";
2840  row_mysql_lock_data_dictionary(trx);
2841  /* Ensure that this transaction will be rolled back and locks
2842  will be released, if the server gets killed before the commit
2843  gets written to the redo log. */
2845 
2846  trx->op_info = "dropping indexes";
2847  error = que_eval_sql(NULL, sql, FALSE, trx);
2848 
2849  if (error != DB_SUCCESS) {
2850  /* Even though we ensure that DDL transactions are WAIT
2851  and DEADLOCK free, we could encounter other errors e.g.,
2852  DB_TOO_MANY_CONCURRENT_TRXS. */
2853  trx->error_state = DB_SUCCESS;
2854 
2855  ut_print_timestamp(stderr);
2856  fprintf(stderr, " InnoDB: Error: row_merge_drop_temp_indexes "
2857  "failed with error code: %u.\n", (unsigned) error);
2858  }
2859 
2860  trx_commit_for_mysql(trx);
2863 }
2864 
2865 /*********************************************************************/
/* Create an anonymous temporary file for merge sorting via
innobase_mysql_tmpfile(), registering the open with Performance Schema
when UNIV_PFS_IO is enabled.  Returns the file descriptor, or -1 on
failure (after logging an error).
NOTE(review): the function-name line (orig. 2871) was dropped by the
extraction; the caller at row_merge_file_create() names this
row_merge_file_create_low(). */
2869 UNIV_INTERN
2870 int
2872 /*===========================*/
2873 {
2874  int fd;
2875 #ifdef UNIV_PFS_IO
2876  /* This temp file open does not go through normal
2877  file APIs, add instrumentation to register with
2878  performance schema */
2879  struct PSI_file_locker* locker = NULL;
2880  PSI_file_locker_state state;
2881  register_pfs_file_open_begin(&state, locker, innodb_file_temp_key,
2882  PSI_FILE_OPEN,
2883  "Innodb Merge Temp File",
2884  __FILE__, __LINE__);
2885 #endif
2886  fd = innobase_mysql_tmpfile();
2887 #ifdef UNIV_PFS_IO
2888  register_pfs_file_open_end(locker, fd);
2889 #endif
2890 
2891  if (fd < 0) {
2892  ib_logf(IB_LOG_LEVEL_ERROR,
2893  "Cannot create temporary merge file");
2894  return (-1);
2895  }
2896  return(fd);
2897 }
2898 
2899 /*********************************************************************/
/* Initialize a merge_file_t: create its temporary file and reset the
offset and record count.  If srv_disable_sort_file_cache is set, the OS
file cache is bypassed for the sort file.  Returns the file descriptor
(negative on failure, mirroring row_merge_file_create_low()).
NOTE(review): the function-name line (orig. 2904) was dropped by the
extraction; per MySQL row0merge.cc this is row_merge_file_create(). */
2902 UNIV_INTERN
2903 int
2905 /*==================*/
2906  merge_file_t* merge_file)
2907 {
2908  merge_file->fd = row_merge_file_create_low();
2909  merge_file->offset = 0;
2910  merge_file->n_rec = 0;
2911 
2912  if (merge_file->fd >= 0) {
2913  if (srv_disable_sort_file_cache) {
2914  os_file_set_nocache(merge_file->fd,
2915  "row0merge.cc", "sort");
2916  }
2917  }
2918  return(merge_file->fd);
2919 }
2920 
2921 /*********************************************************************/
/* Close a merge temporary file descriptor, instrumenting the close with
Performance Schema when UNIV_PFS_IO is enabled.  Negative descriptors
are silently ignored.
NOTE(review): the function-name line (orig. 2926) was dropped by the
extraction; the caller at row_merge_file_destroy() names this
row_merge_file_destroy_low(). */
2924 UNIV_INTERN
2925 void
2927 /*=======================*/
2928  int fd)
2929 {
2930 #ifdef UNIV_PFS_IO
2931  struct PSI_file_locker* locker = NULL;
2932  PSI_file_locker_state state;
2933  register_pfs_file_io_begin(&state, locker,
2934  fd, 0, PSI_FILE_CLOSE,
2935  __FILE__, __LINE__);
2936 #endif
2937  if (fd >= 0) {
2938  close(fd);
2939  }
2940 #ifdef UNIV_PFS_IO
2941  register_pfs_file_io_end(locker, 0);
2942 #endif
2943 }
2944 /*********************************************************************/
/* Destroy a merge_file_t: close its temporary file (if open) and mark
the descriptor as -1 so repeated destruction is a no-op.
NOTE(review): the function-name line (orig. 2948) was dropped by the
extraction; per MySQL row0merge.cc this is row_merge_file_destroy(). */
2946 UNIV_INTERN
2947 void
2949 /*===================*/
2950  merge_file_t* merge_file)
2951 {
2953 
2954  if (merge_file->fd != -1) {
2955  row_merge_file_destroy_low(merge_file->fd);
2956  merge_file->fd = -1;
2957  }
2958 }
2959 
2960 /*********************************************************************/
/* Publish a newly built index by stripping the one-character
TEMP_INDEX_PREFIX from its name in SYS_INDEXES (the UPDATE keeps
characters 1..LENGTH-1, i.e. drops the leading prefix byte).  Caller
must hold the dict operation X-latch.  Errors are reported to stderr
and the trx error state is reset; the error code is still returned.
NOTE(review): the function-name line (orig. 2967) was dropped by the
extraction; the error message below identifies this as
row_merge_rename_index_to_add(). */
2965 UNIV_INTERN
2966 dberr_t
2968 /*==========================*/
2969  trx_t* trx,
2970  table_id_t table_id,
2971  index_id_t index_id)
2972 {
2973  dberr_t err = DB_SUCCESS;
2974  pars_info_t* info = pars_info_create();
2975 
2976  /* We use the private SQL parser of Innobase to generate the
2977  query graphs needed in renaming indexes. */
2978 
2979  static const char rename_index[] =
2980  "PROCEDURE RENAME_INDEX_PROC () IS\n"
2981  "BEGIN\n"
2982  "UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n"
2983  "WHERE TABLE_ID = :tableid AND ID = :indexid;\n"
2984  "END;\n";
2985 
2986  ut_ad(trx);
2987  ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
2989 
2990  trx->op_info = "renaming index to add";
2991 
2992  pars_info_add_ull_literal(info, "tableid", table_id);
2993  pars_info_add_ull_literal(info, "indexid", index_id);
2994 
2995  err = que_eval_sql(info, rename_index, FALSE, trx);
2996 
2997  if (err != DB_SUCCESS) {
2998  /* Even though we ensure that DDL transactions are WAIT
2999  and DEADLOCK free, we could encounter other errors e.g.,
3000  DB_TOO_MANY_CONCURRENT_TRXS. */
3001  trx->error_state = DB_SUCCESS;
3002 
3003  ut_print_timestamp(stderr);
3004  fprintf(stderr,
3005  " InnoDB: Error: row_merge_rename_index_to_add "
3006  "failed with error code: %u.\n", (unsigned) err);
3007  }
3008 
3009  trx->op_info = "";
3010 
3011  return(err);
3012 }
3013 
3014 /*********************************************************************/
/* Mark an index for deferred drop by prepending TEMP_INDEX_PREFIX to
its name in SYS_INDEXES; background/recovery code later removes such
indexes.  Caller must hold the dict operation X-latch.  Errors are
reported to stderr and the trx error state reset; the error code is
still returned.
NOTE(review): the function-name line (orig. 3021) was dropped by the
extraction; the error message below identifies this as
row_merge_rename_index_to_drop(). */
3019 UNIV_INTERN
3020 dberr_t
3022 /*===========================*/
3023  trx_t* trx,
3024  table_id_t table_id,
3025  index_id_t index_id)
3026 {
3027  dberr_t err;
3028  pars_info_t* info = pars_info_create();
3029 
3031 
3032  /* We use the private SQL parser of Innobase to generate the
3033  query graphs needed in renaming indexes. */
3034 
3035  static const char rename_index[] =
3036  "PROCEDURE RENAME_INDEX_PROC () IS\n"
3037  "BEGIN\n"
3038  "UPDATE SYS_INDEXES SET NAME=CONCAT('"
3039  TEMP_INDEX_PREFIX_STR "',NAME)\n"
3040  "WHERE TABLE_ID = :tableid AND ID = :indexid;\n"
3041  "END;\n";
3042 
3043  ut_ad(trx);
3044  ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
3046 
3047  trx->op_info = "renaming index to drop";
3048 
3049  pars_info_add_ull_literal(info, "tableid", table_id);
3050  pars_info_add_ull_literal(info, "indexid", index_id);
3051 
3052  err = que_eval_sql(info, rename_index, FALSE, trx);
3053 
3054  if (err != DB_SUCCESS) {
3055  /* Even though we ensure that DDL transactions are WAIT
3056  and DEADLOCK free, we could encounter other errors e.g.,
3057  DB_TOO_MANY_CONCURRENT_TRXS. */
3058  trx->error_state = DB_SUCCESS;
3059 
3060  ut_print_timestamp(stderr);
3061  fprintf(stderr,
3062  " InnoDB: Error: row_merge_rename_index_to_drop "
3063  "failed with error code: %u.\n", (unsigned) err);
3064  }
3065 
3066  trx->op_info = "";
3067 
3068  return(err);
3069 }
3070 
3071 /*********************************************************************/
/* Build the .ibd pathname a table's data file would have under a new
table name: take the first path of the table's tablespace and replace
the name component.  Returns a newly allocated string; the caller owns
it (the temporary old path is freed here).  Only valid for tables
outside the system tablespace (asserted).
NOTE(review): the function-name line (orig. 3078) was dropped by the
extraction; the caller in row_merge_rename_tables() names this
row_make_new_pathname(). */
3076 UNIV_INTERN
3077 char*
3079 /*==================*/
3080  dict_table_t* table,
3081  const char* new_name)
3082 {
3083  char* new_path;
3084  char* old_path;
3085 
3086  ut_ad(table->space != TRX_SYS_SPACE);
3087 
3088  old_path = fil_space_get_first_path(table->space);
3089  ut_a(old_path);
3090 
3091  new_path = os_file_make_new_pathname(old_path, new_name);
3092 
3093  mem_free(old_path);
3094 
3095  return(new_path);
3096 }
3097 
3098 /*********************************************************************/
/* Swap the names of the old and new (rebuilt) table in the data
dictionary: old_table is renamed to tmp_name and new_table takes the
old table's name.  For file-per-table tablespaces, SYS_TABLESPACES and
SYS_DATAFILES are updated to match.  Finally, if the new table's
tablespace is discarded, its root pages are handled via the call at
orig. line 3202 (dropped by the extraction; presumably
row_import_update_discarded_flag or similar — confirm).  Caller must
hold dict_sys->mutex and the dict operation X-latch.
NOTE(review): the function-name line (orig. 3105) was dropped; per
MySQL row0merge.cc this is row_merge_rename_tables(). */
3103 UNIV_INTERN
3104 dberr_t
3106 /*=========================*/
3107  dict_table_t* old_table,
3109  dict_table_t* new_table,
3111  const char* tmp_name,
3112  trx_t* trx)
3113 {
3114  dberr_t err = DB_ERROR;
3115  pars_info_t* info;
3116 
3118  ut_ad(old_table != new_table);
3119  ut_ad(mutex_own(&dict_sys->mutex));
3120  ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
3123 
3124  trx->op_info = "renaming tables";
3125 
3126  /* We use the private SQL parser of Innobase to generate the query
3127  graphs needed in updating the dictionary data in system tables. */
3128 
3129  info = pars_info_create();
3130 
3131  pars_info_add_str_literal(info, "new_name", new_table->name);
3132  pars_info_add_str_literal(info, "old_name", old_table->name);
3133  pars_info_add_str_literal(info, "tmp_name", tmp_name);
3134 
 /* Two-step swap: old -> tmp, then new -> old. */
3135  err = que_eval_sql(info,
3136  "PROCEDURE RENAME_TABLES () IS\n"
3137  "BEGIN\n"
3138  "UPDATE SYS_TABLES SET NAME = :tmp_name\n"
3139  " WHERE NAME = :old_name;\n"
3140  "UPDATE SYS_TABLES SET NAME = :old_name\n"
3141  " WHERE NAME = :new_name;\n"
3142  "END;\n", FALSE, trx);
3143 
3144  /* Update SYS_TABLESPACES and SYS_DATAFILES if the old
3145  table is in a non-system tablespace where space > 0. */
3146  if (err == DB_SUCCESS
3147  && old_table->space != TRX_SYS_SPACE
3148  && !old_table->ibd_file_missing) {
3149  /* Make pathname to update SYS_DATAFILES. */
3150  char* tmp_path = row_make_new_pathname(old_table, tmp_name);
3151 
3152  info = pars_info_create();
3153 
3154  pars_info_add_str_literal(info, "tmp_name", tmp_name);
3155  pars_info_add_str_literal(info, "tmp_path", tmp_path);
3156  pars_info_add_int4_literal(info, "old_space",
3157  (lint) old_table->space);
3158 
3159  err = que_eval_sql(info,
3160  "PROCEDURE RENAME_OLD_SPACE () IS\n"
3161  "BEGIN\n"
3162  "UPDATE SYS_TABLESPACES"
3163  " SET NAME = :tmp_name\n"
3164  " WHERE SPACE = :old_space;\n"
3165  "UPDATE SYS_DATAFILES"
3166  " SET PATH = :tmp_path\n"
3167  " WHERE SPACE = :old_space;\n"
3168  "END;\n", FALSE, trx);
3169 
3170  mem_free(tmp_path);
3171  }
3172 
3173  /* Update SYS_TABLESPACES and SYS_DATAFILES if the new
3174  table is in a non-system tablespace where space > 0. */
3175  if (err == DB_SUCCESS && new_table->space != TRX_SYS_SPACE) {
3176  /* Make pathname to update SYS_DATAFILES. */
3177  char* old_path = row_make_new_pathname(
3178  new_table, old_table->name);
3179 
3180  info = pars_info_create();
3181 
3182  pars_info_add_str_literal(info, "old_name", old_table->name);
3183  pars_info_add_str_literal(info, "old_path", old_path);
3184  pars_info_add_int4_literal(info, "new_space",
3185  (lint) new_table->space);
3186 
3187  err = que_eval_sql(info,
3188  "PROCEDURE RENAME_NEW_SPACE () IS\n"
3189  "BEGIN\n"
3190  "UPDATE SYS_TABLESPACES"
3191  " SET NAME = :old_name\n"
3192  " WHERE SPACE = :new_space;\n"
3193  "UPDATE SYS_DATAFILES"
3194  " SET PATH = :old_path\n"
3195  " WHERE SPACE = :new_space;\n"
3196  "END;\n", FALSE, trx);
3197 
3198  mem_free(old_path);
3199  }
3200 
3201  if (err == DB_SUCCESS && dict_table_is_discarded(new_table)) {
3203  trx, new_table->id, true, true);
3204  }
3205 
3206  trx->op_info = "";
3207 
3208  return(err);
3209 }
3210 
3211 /*********************************************************************/
/* Add an index definition to SYS_INDEXES by building and running an
index-creation query graph (ind_create_graph_create) on the caller's
transaction.  Returns trx->error_state after the graph has run.
NOTE(review): the que_fork_start_command wrapper line (orig. 3237) and
the que_graph_free call (orig. 3244) were dropped by the extraction —
confirm against the original source. */
3214 static __attribute__((nonnull, warn_unused_result))
3215 dberr_t
3216 row_merge_create_index_graph(
3217 /*=========================*/
3218  trx_t* trx,
3219  dict_table_t* table,
3220  dict_index_t* index)
3221 {
3222  ind_node_t* node;
3223  mem_heap_t* heap;
3224  que_thr_t* thr;
3225  dberr_t err;
3226 
3227  ut_ad(trx);
3228  ut_ad(table);
3229  ut_ad(index);
3230 
3231  heap = mem_heap_create(512);
3232 
3233  index->table = table;
3234  node = ind_create_graph_create(index, heap, false);
3235  thr = pars_complete_graph_for_exec(node, trx, heap);
3236 
3238  static_cast<que_fork_t*>(que_node_get_parent(thr))));
3239 
3240  que_run_threads(thr);
3241 
3242  err = trx->error_state;
3243 
3245 
3246  return(err);
3247 }
3248 
3249 /*********************************************************************/
/* Create an index from an index_def_t: build an in-memory prototype
with dict_mem_index_create(), add its fields, and persist it to
SYS_INDEXES via row_merge_create_index_graph().  On success, returns
the dictionary-cache copy of the index (looked up by name); on failure
returns NULL.
NOTE(review): the function-name line (orig. 3254) and the
dict_mem_index_add_field call line (orig. 3280) were dropped by the
extraction; per MySQL row0merge.cc this is row_merge_create_index() —
confirm against the original source. */
3252 UNIV_INTERN
3253 dict_index_t*
3255 /*===================*/
3256  trx_t* trx,
3257  dict_table_t* table,
3258  const index_def_t* index_def)
3260 {
3262  dberr_t err;
3263  ulint n_fields = index_def->n_fields;
3264  ulint i;
3265 
3267 
3268  /* Create the index prototype, using the passed in def, this is not
3269  a persistent operation. We pass 0 as the space id, and determine at
3270  a lower level the space id where to store the table. */
3271 
3272  index = dict_mem_index_create(table->name, index_def->name,
3273  0, index_def->ind_type, n_fields);
3274 
3275  ut_a(index);
3276 
3277  for (i = 0; i < n_fields; i++) {
3278  index_field_t* ifield = &index_def->fields[i];
3279 
3281  index, dict_table_get_col_name(table, ifield->col_no),
3282  ifield->prefix_len);
3283  }
3284 
3285  /* Add the index to SYS_INDEXES, using the index prototype. */
3286  err = row_merge_create_index_graph(trx, table, index);
3287 
3288  if (err == DB_SUCCESS) {
3289 
3290  index = dict_table_get_index_on_name(table, index_def->name);
3291 
3292  ut_a(index);
3293 
3294  /* Note the id of the transaction that created this
3295  index, we use it to restrict readers from accessing
3296  this index, to ensure read consistency. */
3297  ut_ad(index->trx_id == trx->id);
3298  } else {
3299  index = NULL;
3300  }
3301 
3302  return(index);
3303 }
3304 
3305 /*********************************************************************/
3307 UNIV_INTERN
3308 ibool
3310 /*======================*/
3311  const trx_t* trx,
3312  const dict_index_t* index)
3313 {
3314  if (!dict_index_is_clust(index)
3315  && dict_index_is_online_ddl(index)) {
3316  /* Indexes that are being created are not useable. */
3317  return(FALSE);
3318  }
3319 
3320  return(!dict_index_is_corrupted(index)
3321  && (dict_table_is_temporary(index->table)
3322  || !trx->read_view
3323  || read_view_sees_trx_id(trx->read_view, index->trx_id)));
3324 }
3325 
3326 /*********************************************************************/
/* Drop a table that is known to have no open handles (asserted via
n_ref_count == 0) by delegating to row_drop_table_for_mysql().
NOTE(review): the function-name line (orig. 3334) and an assertion
line (orig. 3339) were dropped by the extraction; per MySQL
row0merge.cc this is row_merge_drop_table(). */
3333 UNIV_INTERN
3334 dberr_t
3335 /*=================*/
3336  trx_t* trx,
3337  dict_table_t* table)
3338 {
3340 
3341  /* There must be no open transactions on the table. */
3342  ut_a(table->n_ref_count == 0);
3343 
3344  return(row_drop_table_for_mysql(table->name, trx, false, false));
3345 }
3346 
3347 /*********************************************************************/
/* Top-level driver for building indexes (identified by the
DBUG_ENTER tag as row_merge_build_indexes; the name line, orig. 3354,
was dropped by the extraction).  Overall flow visible in this body:
1. allocate a large 3-buffer sort block and per-index merge files;
2. for FTS indexes, build an auxiliary "fts sort index" and start
   parallel sort threads (psort/merge info);
3. scan the clustered index once, producing sorted-run files for every
   new index (row_merge_read_clustered_index);
4. per index: FTS indexes are merged by parallel merge threads (or
   inserted directly), others are merge-sorted and bulk-inserted
   (row_merge_sort + row_merge_insert_index_tuples), then any online
   row log is applied (row_log_apply);
5. on any error during online in-place creation, the not-yet-published
   indexes are flagged aborted/corrupted for later cleanup by
   row_merge_drop_indexes().
NOTE(review): several lines were dropped by the extraction (e.g. the
'block' declaration near orig. 3382, the fts_parallel_sort start call
at orig. 3445, the os_event_wait_time_low call at orig. 3499, and
cleanup calls around orig. 3603-3652) — confirm against the original
source before modifying this function. */
3352 UNIV_INTERN
3353 dberr_t
3355 /*====================*/
3356  trx_t* trx,
3357  dict_table_t* old_table,
3359  dict_table_t* new_table,
3362  bool online,
3364  dict_index_t** indexes,
3365  const ulint* key_numbers,
3366  ulint n_indexes,
3367  struct TABLE* table,
3370  const dtuple_t* add_cols,
3372  const ulint* col_map,
3375  ulint add_autoinc,
3378  ib_sequence_t& sequence)
3380 {
3381  merge_file_t* merge_files;
3383  ulint block_size;
3384  ulint i;
3385  ulint j;
3386  dberr_t error;
3387  int tmpfd = -1;
3388  dict_index_t* fts_sort_idx = NULL;
3389  fts_psort_t* psort_info = NULL;
3390  fts_psort_t* merge_info = NULL;
3391  ib_int64_t sig_count = 0;
3392  DBUG_ENTER("row_merge_build_indexes");
3393 
3395  ut_ad((old_table == new_table) == !col_map);
3396  ut_ad(!add_cols || col_map);
3397 
3398  /* Allocate memory for merge file data structure and initialize
3399  fields */
3400 
 /* Three srv_sort_buf_size buffers: two inputs and one output for
 the merge passes. */
3401  block_size = 3 * srv_sort_buf_size;
3402  block = static_cast<row_merge_block_t*>(
3403  os_mem_alloc_large(&block_size));
3404 
3405  if (block == NULL) {
3406  DBUG_RETURN(DB_OUT_OF_MEMORY);
3407  }
3408 
3409  trx_start_if_not_started_xa(trx);
3410 
3411  merge_files = static_cast<merge_file_t*>(
3412  mem_alloc(n_indexes * sizeof *merge_files));
3413 
3414  /* Initialize all the merge file descriptors, so that we
3415  don't call row_merge_file_destroy() on uninitialized
3416  merge file descriptor */
3417 
3418  for (i = 0; i < n_indexes; i++) {
3419  merge_files[i].fd = -1;
3420  }
3421 
3422  for (i = 0; i < n_indexes; i++) {
3423  if (row_merge_file_create(&merge_files[i]) < 0) {
3424  error = DB_OUT_OF_MEMORY;
3425  goto func_exit;
3426  }
3427 
3428  if (indexes[i]->type & DICT_FTS) {
3429  ibool opt_doc_id_size = FALSE;
3430 
3431  /* To build FTS index, we would need to extract
3432  doc's word, Doc ID, and word's position, so
3433  we need to build a "fts sort index" indexing
3434  on above three 'fields' */
3435  fts_sort_idx = row_merge_create_fts_sort_index(
3436  indexes[i], old_table, &opt_doc_id_size);
3437 
3438  row_merge_dup_t* dup = static_cast<row_merge_dup_t*>(
3439  ut_malloc(sizeof *dup));
3440  dup->index = fts_sort_idx;
3441  dup->table = table;
3442  dup->col_map = col_map;
3443  dup->n_dup = 0;
3444 
 /* NOTE(review): the call line (orig. 3445, presumably
 row_fts_psort_info_init) was dropped by the extraction. */
3446  trx, dup, new_table, opt_doc_id_size,
3447  &psort_info, &merge_info);
3448  }
3449  }
3450 
3451  tmpfd = row_merge_file_create_low();
3452 
3453  if (tmpfd < 0) {
3454  error = DB_OUT_OF_MEMORY;
3455  goto func_exit;
3456  }
3457 
3458  /* Reset the MySQL row buffer that is used when reporting
3459  duplicate keys. */
3460  innobase_rec_reset(table);
3461 
3462  /* Read clustered index of the table and create files for
3463  secondary index entries for merge sort */
3464 
3465  error = row_merge_read_clustered_index(
3466  trx, table, old_table, new_table, online, indexes,
3467  fts_sort_idx, psort_info, merge_files, key_numbers,
3468  n_indexes, add_cols, col_map,
3469  add_autoinc, sequence, block);
3470 
3471  if (error != DB_SUCCESS) {
3472 
3473  goto func_exit;
3474  }
3475 
3476  DEBUG_SYNC_C("row_merge_after_scan");
3477 
3478  /* Now we have files containing index entries ready for
3479  sorting and inserting. */
3480 
3481  for (i = 0; i < n_indexes; i++) {
3482  dict_index_t* sort_idx = indexes[i];
3483 
3484  if (indexes[i]->type & DICT_FTS) {
3485  os_event_t fts_parallel_merge_event;
3486 
3487  sort_idx = fts_sort_idx;
3488 
3489  fts_parallel_merge_event
3490  = merge_info[0].psort_common->merge_event;
3491 
3492  if (FTS_PLL_MERGE) {
3493  ulint trial_count = 0;
3494  bool all_exit = false;
3495 
3496  os_event_reset(fts_parallel_merge_event);
3497  row_fts_start_parallel_merge(merge_info);
3498 wait_again:
 /* NOTE(review): the event-wait call line (orig. 3499)
 was dropped by the extraction. */
3500  fts_parallel_merge_event, 1000000,
3501  sig_count);
3502 
 /* Re-wait until every child merge thread reports
 COMPLETE or EXITING. */
3503  for (j = 0; j < FTS_NUM_AUX_INDEX; j++) {
3504  if (merge_info[j].child_status
3505  != FTS_CHILD_COMPLETE
3506  && merge_info[j].child_status
3507  != FTS_CHILD_EXITING) {
3508  sig_count = os_event_reset(
3509  fts_parallel_merge_event);
3510 
3511  goto wait_again;
3512  }
3513  }
3514 
3515  /* Now all children should complete, wait
3516  a bit until they all finish using event */
3517  while (!all_exit && trial_count < 10000) {
3518  all_exit = true;
3519 
3520  for (j = 0; j < FTS_NUM_AUX_INDEX;
3521  j++) {
3522  if (merge_info[j].child_status
3523  != FTS_CHILD_EXITING) {
3524  all_exit = false;
3525  os_thread_sleep(1000);
3526  break;
3527  }
3528  }
3529  trial_count++;
3530  }
3531 
3532  if (!all_exit) {
3533  ib_logf(IB_LOG_LEVEL_ERROR,
3534  "Not all child merge threads"
3535  " exited when creating FTS"
3536  " index '%s'",
3537  indexes[i]->name);
3538  }
3539  } else {
3540  /* This cannot report duplicates; an
3541  assertion would fail in that case. */
3542  error = row_fts_merge_insert(
3543  sort_idx, new_table,
3544  psort_info, 0);
3545  }
3546 
3547 #ifdef FTS_INTERNAL_DIAG_PRINT
3548  DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Insert\n");
3549 #endif
3550  } else {
3551  row_merge_dup_t dup = {
3552  sort_idx, table, col_map, 0};
3553 
3554  error = row_merge_sort(
3555  trx, &dup, &merge_files[i],
3556  block, &tmpfd);
3557 
3558  if (error == DB_SUCCESS) {
3559  error = row_merge_insert_index_tuples(
3560  trx->id, sort_idx, old_table,
3561  merge_files[i].fd, block);
3562  }
3563  }
3564 
3565  /* Close the temporary file to free up space. */
3566  row_merge_file_destroy(&merge_files[i]);
3567 
3568  if (indexes[i]->type & DICT_FTS) {
3569  row_fts_psort_info_destroy(psort_info, merge_info);
3570  } else if (error != DB_SUCCESS || !online) {
3571  /* Do not apply any online log. */
3572  } else if (old_table != new_table) {
3573  ut_ad(!sort_idx->online_log);
3574  ut_ad(sort_idx->online_status
3576  } else {
3577  DEBUG_SYNC_C("row_log_apply_before");
3578  error = row_log_apply(trx, sort_idx, table);
3579  DEBUG_SYNC_C("row_log_apply_after");
3580  }
3581 
3582  if (error != DB_SUCCESS) {
3583  trx->error_key_num = key_numbers[i];
3584  goto func_exit;
3585  }
3586 
3587  if (indexes[i]->type & DICT_FTS && fts_enable_diag_print) {
3588  char* name = (char*) indexes[i]->name;
3589 
 /* Skip the TEMP_INDEX_PREFIX byte when printing. */
3590  if (*name == TEMP_INDEX_PREFIX) {
3591  name++;
3592  }
3593 
3594  ut_print_timestamp(stderr);
3595  fprintf(stderr, " InnoDB: Finished building "
3596  "full-text index %s\n", name);
3597  }
3598  }
3599 
3600 func_exit:
3601  DBUG_EXECUTE_IF(
3602  "ib_build_indexes_too_many_concurrent_trxs",
3604  trx->error_state = error;);
3605 
3607 
3608  for (i = 0; i < n_indexes; i++) {
3609  row_merge_file_destroy(&merge_files[i]);
3610  }
3611 
3612  if (fts_sort_idx) {
3613  dict_mem_index_free(fts_sort_idx);
3614  }
3615 
3616  mem_free(merge_files);
3617  os_mem_free_large(block, block_size);
3618 
3619  DICT_TF2_FLAG_UNSET(new_table, DICT_TF2_FTS_ADD_DOC_ID);
3620 
3621  if (online && old_table == new_table && error != DB_SUCCESS) {
3622  /* On error, flag all online secondary index creation
3623  as aborted. */
3624  for (i = 0; i < n_indexes; i++) {
3625  ut_ad(!(indexes[i]->type & DICT_FTS));
3626  ut_ad(*indexes[i]->name == TEMP_INDEX_PREFIX);
3627  ut_ad(!dict_index_is_clust(indexes[i]));
3628 
3629  /* Completed indexes should be dropped as
3630  well, and indexes whose creation was aborted
3631  should be dropped from the persistent
3632  storage. However, at this point we can only
3633  set some flags in the not-yet-published
3634  indexes. These indexes will be dropped later
3635  in row_merge_drop_indexes(), called by
3636  rollback_inplace_alter_table(). */
3637 
3638  switch (dict_index_get_online_status(indexes[i])) {
3639  case ONLINE_INDEX_COMPLETE:
3640  break;
3641  case ONLINE_INDEX_CREATION:
3642  rw_lock_x_lock(
3643  dict_index_get_lock(indexes[i]));
3644  row_log_abort_sec(indexes[i]);
3645  indexes[i]->type |= DICT_CORRUPT;
3646  rw_lock_x_unlock(
3647  dict_index_get_lock(indexes[i]));
3648  new_table->drop_aborted = TRUE;
3649  /* fall through */
3651  case ONLINE_INDEX_ABORTED:
3653  &dict_sys->mutex,
3654  MONITOR_BACKGROUND_DROP_INDEX);
3655  }
3656  }
3657  }
3658 
3659  DBUG_RETURN(error);
3660 }