MySQL 5.6.14 Source Code Document
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
integer.cpp
1 /* Copyright (c) 2005, 2012, Oracle and/or its affiliates. All rights reserved.
2 
3  This program is free software; you can redistribute it and/or modify
4  it under the terms of the GNU General Public License as published by
5  the Free Software Foundation; version 2 of the License.
6 
7  This program is distributed in the hope that it will be useful,
8  but WITHOUT ANY WARRANTY; without even the implied warranty of
9  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10  GNU General Public License for more details.
11 
12  You should have received a copy of the GNU General Public License
13  along with this program; if not, write to the Free Software
14  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */
15 
16 /* based on Wei Dai's integer.cpp from CryptoPP */
17 
18 #include "runtime.hpp"
19 #include "integer.hpp"
20 #include "modarith.hpp"
21 #include "asn.hpp"
22 
23 
24 
25 #ifdef __DECCXX
26  #include <c_asm.h> // for asm overflow assembly
27 #endif
28 
29 #if defined(_M_X64) || defined(_M_IA64)
30  #include <intrin.h>
31 #pragma intrinsic(_umul128)
32 #endif
33 
34 
35 #ifdef __GNUC__
36  #include <signal.h>
37  #include <setjmp.h>
38 #endif
39 
40 
41 #ifdef SSE2_INTRINSICS_AVAILABLE
42  #ifdef __GNUC__
43  #include <xmmintrin.h>
44  #ifdef TAOCRYPT_MEMALIGN_AVAILABLE
45  #include <malloc.h>
46  #else
47  #include <stdlib.h>
48  #endif
49  #else
50  #include <emmintrin.h>
51  #endif
52 #elif defined(_MSC_VER) && defined(_M_IX86)
53  #pragma message("You do not seem to have the Visual C++ Processor Pack ")
54  #pragma message("installed, so use of SSE2 intrinsics will be disabled.")
55 #elif defined(__GNUC__) && defined(__i386__)
56 /* #warning You do not have GCC 3.3 or later, or did not specify the -msse2 \
57  compiler option. Use of SSE2 intrinsics will be disabled.
58 */
59 #endif
60 
61 
62 namespace TaoCrypt {
63 
64 
65 #ifdef SSE2_INTRINSICS_AVAILABLE
66 
67 template <class T>
68 CPP_TYPENAME AlignedAllocator<T>::pointer AlignedAllocator<T>::allocate(
69  size_type n, const void *)
70 {
71  if (n > max_size())
72  return 0;
73  if (n == 0)
74  return 0;
75  if (n >= 4)
76  {
77  void* p;
78  #ifdef TAOCRYPT_MM_MALLOC_AVAILABLE
79  p = _mm_malloc(sizeof(T)*n, 16);
80  #elif defined(TAOCRYPT_MEMALIGN_AVAILABLE)
81  p = memalign(16, sizeof(T)*n);
82  #elif defined(TAOCRYPT_MALLOC_ALIGNMENT_IS_16)
83  p = malloc(sizeof(T)*n);
84  #else
85  p = (byte *)malloc(sizeof(T)*n + 8);
86  // assume malloc alignment is at least 8
87  #endif
88 
89  #ifdef TAOCRYPT_NO_ALIGNED_ALLOC
90  m_pBlock = p;
91  if (!IsAlignedOn(p, 16))
92  {
93  p = (byte *)p + 8;
94  }
95  #endif
96 
97  return (T*)p;
98  }
99  return NEW_TC T[n];
100 }
101 
102 
103 template <class T>
104 void AlignedAllocator<T>::deallocate(void* p, size_type n)
105 {
106  memset(p, 0, n*sizeof(T));
107  if (n >= 4)
108  {
109  #ifdef TAOCRYPT_MM_MALLOC_AVAILABLE
110  _mm_free(p);
111  #elif defined(TAOCRYPT_NO_ALIGNED_ALLOC)
112  free(m_pBlock);
113  m_pBlock = 0;
114  #else
115  free(p);
116  #endif
117  }
118  else
119  tcArrayDelete((T *)p);
120 }
121 
122 #endif // SSE2
123 
124 
125 // ******** start of integer needs
126 
127 // start 5.2.1 adds DWord and Word ********
128 
129 // ********************************************************
130 
131 class DWord {
132 public:
133 DWord() {}
134 
135 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
136  explicit DWord(word low)
137  {
138  whole_ = low;
139  }
140 #else
141  explicit DWord(word low)
142  {
143  halfs_.low = low;
144  halfs_.high = 0;
145  }
146 #endif
147 
148  DWord(word low, word high)
149  {
150  halfs_.low = low;
151  halfs_.high = high;
152  }
153 
154  static DWord Multiply(word a, word b)
155  {
156  DWord r;
157 
158  #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
159  r.whole_ = (dword)a * b;
160 
161  #elif defined(_M_X64) || defined(_M_IA64)
162  r.halfs_.low = _umul128(a, b, &r.halfs_.high);
163 
164  #elif defined(__alpha__)
165  r.halfs_.low = a*b;
166  #ifdef __GNUC__
167  __asm__("umulh %1,%2,%0" : "=r" (r.halfs_.high)
168  : "r" (a), "r" (b));
169  #elif defined(__DECCXX)
170  r.halfs_.high = asm("umulh %a0, %a1, %v0", a, b);
171  #else
172  #error unknown alpha compiler
173  #endif
174 
175  #elif defined(__ia64__)
176  r.halfs_.low = a*b;
177  __asm__("xmpy.hu %0=%1,%2" : "=f" (r.halfs_.high)
178  : "f" (a), "f" (b));
179 
180  #elif defined(_ARCH_PPC64)
181  r.halfs_.low = a*b;
182  __asm__("mulhdu %0,%1,%2" : "=r" (r.halfs_.high)
183  : "r" (a), "r" (b) : "cc");
184 
185  #elif defined(__x86_64__)
186  __asm__("mulq %3" : "=d" (r.halfs_.high), "=a" (r.halfs_.low) :
187  "a" (a), "rm" (b) : "cc");
188 
189  #elif defined(__mips64)
190  __asm__("dmultu %2,%3" : "=h" (r.halfs_.high), "=l" (r.halfs_.low)
191  : "r" (a), "r" (b));
192 
193  #elif defined(_M_IX86)
194  // for testing
195  word64 t = (word64)a * b;
196  r.halfs_.high = ((word32 *)(&t))[1];
197  r.halfs_.low = (word32)t;
198  #else
199  #error can not implement DWord
200  #endif
201 
202  return r;
203  }
204 
205  static DWord MultiplyAndAdd(word a, word b, word c)
206  {
207  DWord r = Multiply(a, b);
208  return r += c;
209  }
210 
211  DWord & operator+=(word a)
212  {
213  #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
214  whole_ = whole_ + a;
215  #else
216  halfs_.low += a;
217  halfs_.high += (halfs_.low < a);
218  #endif
219  return *this;
220  }
221 
222  DWord operator+(word a)
223  {
224  DWord r;
225  #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
226  r.whole_ = whole_ + a;
227  #else
228  r.halfs_.low = halfs_.low + a;
229  r.halfs_.high = halfs_.high + (r.halfs_.low < a);
230  #endif
231  return r;
232  }
233 
234  DWord operator-(DWord a)
235  {
236  DWord r;
237  #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
238  r.whole_ = whole_ - a.whole_;
239  #else
240  r.halfs_.low = halfs_.low - a.halfs_.low;
241  r.halfs_.high = halfs_.high - a.halfs_.high -
242  (r.halfs_.low > halfs_.low);
243  #endif
244  return r;
245  }
246 
247  DWord operator-(word a)
248  {
249  DWord r;
250  #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
251  r.whole_ = whole_ - a;
252  #else
253  r.halfs_.low = halfs_.low - a;
254  r.halfs_.high = halfs_.high - (r.halfs_.low > halfs_.low);
255  #endif
256  return r;
257  }
258 
259  // returns quotient, which must fit in a word
260  word operator/(word divisor);
261 
262  word operator%(word a);
263 
264  bool operator!() const
265  {
266  #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
267  return !whole_;
268  #else
269  return !halfs_.high && !halfs_.low;
270  #endif
271  }
272 
273  word GetLowHalf() const {return halfs_.low;}
274  word GetHighHalf() const {return halfs_.high;}
275  word GetHighHalfAsBorrow() const {return 0-halfs_.high;}
276 
277 private:
278  struct dword_struct
279  {
280  #ifdef LITTLE_ENDIAN_ORDER
281  word low;
282  word high;
283  #else
284  word high;
285  word low;
286  #endif
287  };
288 
289  union
290  {
291  #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
292  dword whole_;
293  #endif
294  struct dword_struct halfs_;
295  };
296 };
297 
298 
299 class Word {
300 public:
301  Word() {}
302 
303  Word(word value)
304  {
305  whole_ = value;
306  }
307 
308  Word(hword low, hword high)
309  {
310  whole_ = low | (word(high) << (WORD_BITS/2));
311  }
312 
313  static Word Multiply(hword a, hword b)
314  {
315  Word r;
316  r.whole_ = (word)a * b;
317  return r;
318  }
319 
320  Word operator-(Word a)
321  {
322  Word r;
323  r.whole_ = whole_ - a.whole_;
324  return r;
325  }
326 
327  Word operator-(hword a)
328  {
329  Word r;
330  r.whole_ = whole_ - a;
331  return r;
332  }
333 
334  // returns quotient, which must fit in a word
335  hword operator/(hword divisor)
336  {
337  return hword(whole_ / divisor);
338  }
339 
340  bool operator!() const
341  {
342  return !whole_;
343  }
344 
345  word GetWhole() const {return whole_;}
346  hword GetLowHalf() const {return hword(whole_);}
347  hword GetHighHalf() const {return hword(whole_>>(WORD_BITS/2));}
348  hword GetHighHalfAsBorrow() const {return 0-hword(whole_>>(WORD_BITS/2));}
349 
350 private:
351  word whole_;
352 };
353 
354 
355 // dummy is VC60 compiler bug workaround
356 // do a 3 word by 2 word divide, returns quotient and leaves remainder in A
357 template <class S, class D>
358 S DivideThreeWordsByTwo(S* A, S B0, S B1, D* dummy_VC6_WorkAround = 0)
359 {
360  // estimate the quotient: do a 2 S by 1 S divide
361  S Q;
362  if (S(B1+1) == 0)
363  Q = A[2];
364  else
365  Q = D(A[1], A[2]) / S(B1+1);
366 
367  // now subtract Q*B from A
368  D p = D::Multiply(B0, Q);
369  D u = (D) A[0] - p.GetLowHalf();
370  A[0] = u.GetLowHalf();
371  u = (D) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() -
372  D::Multiply(B1, Q);
373  A[1] = u.GetLowHalf();
374  A[2] += u.GetHighHalf();
375 
376  // Q <= actual quotient, so fix it
377  while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
378  {
379  u = (D) A[0] - B0;
380  A[0] = u.GetLowHalf();
381  u = (D) A[1] - B1 - u.GetHighHalfAsBorrow();
382  A[1] = u.GetLowHalf();
383  A[2] += u.GetHighHalf();
384  Q++;
385  }
386 
387  return Q;
388 }
389 
390 
391 // do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
392 template <class S, class D>
393 inline D DivideFourWordsByTwo(S *T, const D &Al, const D &Ah, const D &B)
394 {
395  if (!B) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
396  return D(Ah.GetLowHalf(), Ah.GetHighHalf());
397  else
398  {
399  S Q[2];
400  T[0] = Al.GetLowHalf();
401  T[1] = Al.GetHighHalf();
402  T[2] = Ah.GetLowHalf();
403  T[3] = Ah.GetHighHalf();
404  Q[1] = DivideThreeWordsByTwo<S, D>(T+1, B.GetLowHalf(),
405  B.GetHighHalf());
406  Q[0] = DivideThreeWordsByTwo<S, D>(T, B.GetLowHalf(), B.GetHighHalf());
407  return D(Q[0], Q[1]);
408  }
409 }
410 
411 
412 // returns quotient, which must fit in a word
413 inline word DWord::operator/(word a)
414 {
415  #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
416  return word(whole_ / a);
417  #else
418  hword r[4];
419  return DivideFourWordsByTwo<hword, Word>(r, halfs_.low,
420  halfs_.high, a).GetWhole();
421  #endif
422 }
423 
424 inline word DWord::operator%(word a)
425 {
426  #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
427  return word(whole_ % a);
428  #else
429  if (a < (word(1) << (WORD_BITS/2)))
430  {
431  hword h = hword(a);
432  word r = halfs_.high % h;
433  r = ((halfs_.low >> (WORD_BITS/2)) + (r << (WORD_BITS/2))) % h;
434  return hword((hword(halfs_.low) + (r << (WORD_BITS/2))) % h);
435  }
436  else
437  {
438  hword r[4];
439  DivideFourWordsByTwo<hword, Word>(r, halfs_.low, halfs_.high, a);
440  return Word(r[0], r[1]).GetWhole();
441  }
442  #endif
443 }
444 
445 
446 
447 // end 5.2.1 DWord and Word adds
448 
449 
450 
451 
452 
453 static const unsigned int RoundupSizeTable[] = {2, 2, 2, 4, 4, 8, 8, 8, 8};
454 
455 static inline unsigned int RoundupSize(unsigned int n)
456 {
457  if (n<=8)
458  return RoundupSizeTable[n];
459  else if (n<=16)
460  return 16;
461  else if (n<=32)
462  return 32;
463  else if (n<=64)
464  return 64;
465  else return 1U << BitPrecision(n-1);
466 }
467 
468 
469 static int Compare(const word *A, const word *B, unsigned int N)
470 {
471  while (N--)
472  if (A[N] > B[N])
473  return 1;
474  else if (A[N] < B[N])
475  return -1;
476 
477  return 0;
478 }
479 
480 static word Increment(word *A, unsigned int N, word B=1)
481 {
482  word t = A[0];
483  A[0] = t+B;
484  if (A[0] >= t)
485  return 0;
486  for (unsigned i=1; i<N; i++)
487  if (++A[i])
488  return 0;
489  return 1;
490 }
491 
492 static word Decrement(word *A, unsigned int N, word B=1)
493 {
494  word t = A[0];
495  A[0] = t-B;
496  if (A[0] <= t)
497  return 0;
498  for (unsigned i=1; i<N; i++)
499  if (A[i]--)
500  return 0;
501  return 1;
502 }
503 
504 static void TwosComplement(word *A, unsigned int N)
505 {
506  Decrement(A, N);
507  for (unsigned i=0; i<N; i++)
508  A[i] = ~A[i];
509 }
510 
511 
512 static word LinearMultiply(word *C, const word *A, word B, unsigned int N)
513 {
514  word carry=0;
515  for(unsigned i=0; i<N; i++)
516  {
517  DWord p = DWord::MultiplyAndAdd(A[i], B, carry);
518  C[i] = p.GetLowHalf();
519  carry = p.GetHighHalf();
520  }
521  return carry;
522 }
523 
524 
525 static word AtomicInverseModPower2(word A)
526 {
527  word R=A%8;
528 
529  for (unsigned i=3; i<WORD_BITS; i*=2)
530  R = R*(2-R*A);
531 
532  return R;
533 }
534 
535 
536 // ********************************************************
537 
538 class Portable
539 {
540 public:
541  static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B,
542  unsigned int N);
543  static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word*B,
544  unsigned int N);
545  static void TAOCRYPT_CDECL Multiply2(word *C, const word *A, const word *B);
546  static word TAOCRYPT_CDECL Multiply2Add(word *C,
547  const word *A, const word *B);
548  static void TAOCRYPT_CDECL Multiply4(word *C, const word *A, const word *B);
549  static void TAOCRYPT_CDECL Multiply8(word *C, const word *A, const word *B);
550  static unsigned int TAOCRYPT_CDECL MultiplyRecursionLimit() {return 8;}
551 
552  static void TAOCRYPT_CDECL Multiply2Bottom(word *C, const word *A,
553  const word *B);
554  static void TAOCRYPT_CDECL Multiply4Bottom(word *C, const word *A,
555  const word *B);
556  static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A,
557  const word *B);
558  static unsigned int TAOCRYPT_CDECL MultiplyBottomRecursionLimit(){return 8;}
559 
560  static void TAOCRYPT_CDECL Square2(word *R, const word *A);
561  static void TAOCRYPT_CDECL Square4(word *R, const word *A);
562  static unsigned int TAOCRYPT_CDECL SquareRecursionLimit() {return 4;}
563 };
564 
565 word Portable::Add(word *C, const word *A, const word *B, unsigned int N)
566 {
567  DWord u(0, 0);
568  for (unsigned int i = 0; i < N; i+=2)
569  {
570  u = DWord(A[i]) + B[i] + u.GetHighHalf();
571  C[i] = u.GetLowHalf();
572  u = DWord(A[i+1]) + B[i+1] + u.GetHighHalf();
573  C[i+1] = u.GetLowHalf();
574  }
575  return u.GetHighHalf();
576 }
577 
578 word Portable::Subtract(word *C, const word *A, const word *B, unsigned int N)
579 {
580  DWord u(0, 0);
581  for (unsigned int i = 0; i < N; i+=2)
582  {
583  u = (DWord) A[i] - B[i] - u.GetHighHalfAsBorrow();
584  C[i] = u.GetLowHalf();
585  u = (DWord) A[i+1] - B[i+1] - u.GetHighHalfAsBorrow();
586  C[i+1] = u.GetLowHalf();
587  }
588  return 0-u.GetHighHalf();
589 }
590 
591 void Portable::Multiply2(word *C, const word *A, const word *B)
592 {
593 /*
594  word s;
595  dword d;
596 
597  if (A1 >= A0)
598  if (B0 >= B1)
599  {
600  s = 0;
601  d = (dword)(A1-A0)*(B0-B1);
602  }
603  else
604  {
605  s = (A1-A0);
606  d = (dword)s*(word)(B0-B1);
607  }
608  else
609  if (B0 > B1)
610  {
611  s = (B0-B1);
612  d = (word)(A1-A0)*(dword)s;
613  }
614  else
615  {
616  s = 0;
617  d = (dword)(A0-A1)*(B1-B0);
618  }
619 */
620  // this segment is the branchless equivalent of above
621  word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
622  unsigned int ai = A[1] < A[0];
623  unsigned int bi = B[0] < B[1];
624  unsigned int di = ai & bi;
625  DWord d = DWord::Multiply(D[di], D[di+2]);
626  D[1] = D[3] = 0;
627  unsigned int si = ai + !bi;
628  word s = D[si];
629 
630  DWord A0B0 = DWord::Multiply(A[0], B[0]);
631  C[0] = A0B0.GetLowHalf();
632 
633  DWord A1B1 = DWord::Multiply(A[1], B[1]);
634  DWord t = (DWord) A0B0.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf()
635  + A1B1.GetLowHalf();
636  C[1] = t.GetLowHalf();
637 
638  t = A1B1 + t.GetHighHalf() + A0B0.GetHighHalf() + d.GetHighHalf()
639  + A1B1.GetHighHalf() - s;
640  C[2] = t.GetLowHalf();
641  C[3] = t.GetHighHalf();
642 }
643 
644 void Portable::Multiply2Bottom(word *C, const word *A, const word *B)
645 {
646  DWord t = DWord::Multiply(A[0], B[0]);
647  C[0] = t.GetLowHalf();
648  C[1] = t.GetHighHalf() + A[0]*B[1] + A[1]*B[0];
649 }
650 
651 word Portable::Multiply2Add(word *C, const word *A, const word *B)
652 {
653  word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
654  unsigned int ai = A[1] < A[0];
655  unsigned int bi = B[0] < B[1];
656  unsigned int di = ai & bi;
657  DWord d = DWord::Multiply(D[di], D[di+2]);
658  D[1] = D[3] = 0;
659  unsigned int si = ai + !bi;
660  word s = D[si];
661 
662  DWord A0B0 = DWord::Multiply(A[0], B[0]);
663  DWord t = A0B0 + C[0];
664  C[0] = t.GetLowHalf();
665 
666  DWord A1B1 = DWord::Multiply(A[1], B[1]);
667  t = (DWord) t.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() +
668  A1B1.GetLowHalf() + C[1];
669  C[1] = t.GetLowHalf();
670 
671  t = (DWord) t.GetHighHalf() + A1B1.GetLowHalf() + A0B0.GetHighHalf() +
672  d.GetHighHalf() + A1B1.GetHighHalf() - s + C[2];
673  C[2] = t.GetLowHalf();
674 
675  t = (DWord) t.GetHighHalf() + A1B1.GetHighHalf() + C[3];
676  C[3] = t.GetLowHalf();
677  return t.GetHighHalf();
678 }
679 
680 
681 #define MulAcc(x, y) \
682  p = DWord::MultiplyAndAdd(A[x], B[y], c); \
683  c = p.GetLowHalf(); \
684  p = (DWord) d + p.GetHighHalf(); \
685  d = p.GetLowHalf(); \
686  e += p.GetHighHalf();
687 
688 #define SaveMulAcc(s, x, y) \
689  R[s] = c; \
690  p = DWord::MultiplyAndAdd(A[x], B[y], d); \
691  c = p.GetLowHalf(); \
692  p = (DWord) e + p.GetHighHalf(); \
693  d = p.GetLowHalf(); \
694  e = p.GetHighHalf();
695 
696 #define SquAcc(x, y) \
697  q = DWord::Multiply(A[x], A[y]); \
698  p = q + c; \
699  c = p.GetLowHalf(); \
700  p = (DWord) d + p.GetHighHalf(); \
701  d = p.GetLowHalf(); \
702  e += p.GetHighHalf(); \
703  p = q + c; \
704  c = p.GetLowHalf(); \
705  p = (DWord) d + p.GetHighHalf(); \
706  d = p.GetLowHalf(); \
707  e += p.GetHighHalf();
708 
709 #define SaveSquAcc(s, x, y) \
710  R[s] = c; \
711  q = DWord::Multiply(A[x], A[y]); \
712  p = q + d; \
713  c = p.GetLowHalf(); \
714  p = (DWord) e + p.GetHighHalf(); \
715  d = p.GetLowHalf(); \
716  e = p.GetHighHalf(); \
717  p = q + c; \
718  c = p.GetLowHalf(); \
719  p = (DWord) d + p.GetHighHalf(); \
720  d = p.GetLowHalf(); \
721  e += p.GetHighHalf();
722 
723 
724 void Portable::Multiply4(word *R, const word *A, const word *B)
725 {
726  DWord p;
727  word c, d, e;
728 
729  p = DWord::Multiply(A[0], B[0]);
730  R[0] = p.GetLowHalf();
731  c = p.GetHighHalf();
732  d = e = 0;
733 
734  MulAcc(0, 1);
735  MulAcc(1, 0);
736 
737  SaveMulAcc(1, 2, 0);
738  MulAcc(1, 1);
739  MulAcc(0, 2);
740 
741  SaveMulAcc(2, 0, 3);
742  MulAcc(1, 2);
743  MulAcc(2, 1);
744  MulAcc(3, 0);
745 
746  SaveMulAcc(3, 3, 1);
747  MulAcc(2, 2);
748  MulAcc(1, 3);
749 
750  SaveMulAcc(4, 2, 3);
751  MulAcc(3, 2);
752 
753  R[5] = c;
754  p = DWord::MultiplyAndAdd(A[3], B[3], d);
755  R[6] = p.GetLowHalf();
756  R[7] = e + p.GetHighHalf();
757 }
758 
759 void Portable::Square2(word *R, const word *A)
760 {
761  DWord p, q;
762  word c, d, e;
763 
764  p = DWord::Multiply(A[0], A[0]);
765  R[0] = p.GetLowHalf();
766  c = p.GetHighHalf();
767  d = e = 0;
768 
769  SquAcc(0, 1);
770 
771  R[1] = c;
772  p = DWord::MultiplyAndAdd(A[1], A[1], d);
773  R[2] = p.GetLowHalf();
774  R[3] = e + p.GetHighHalf();
775 }
776 
777 void Portable::Square4(word *R, const word *A)
778 {
779 #ifdef _MSC_VER
780  // VC60 workaround: MSVC 6.0 has an optimization bug that makes
781  // (dword)A*B where either A or B has been cast to a dword before
782  // very expensive. Revisit this function when this
783  // bug is fixed.
784  Multiply4(R, A, A);
785 #else
786  const word *B = A;
787  DWord p, q;
788  word c, d, e;
789 
790  p = DWord::Multiply(A[0], A[0]);
791  R[0] = p.GetLowHalf();
792  c = p.GetHighHalf();
793  d = e = 0;
794 
795  SquAcc(0, 1);
796 
797  SaveSquAcc(1, 2, 0);
798  MulAcc(1, 1);
799 
800  SaveSquAcc(2, 0, 3);
801  SquAcc(1, 2);
802 
803  SaveSquAcc(3, 3, 1);
804  MulAcc(2, 2);
805 
806  SaveSquAcc(4, 2, 3);
807 
808  R[5] = c;
809  p = DWord::MultiplyAndAdd(A[3], A[3], d);
810  R[6] = p.GetLowHalf();
811  R[7] = e + p.GetHighHalf();
812 #endif
813 }
814 
815 void Portable::Multiply8(word *R, const word *A, const word *B)
816 {
817  DWord p;
818  word c, d, e;
819 
820  p = DWord::Multiply(A[0], B[0]);
821  R[0] = p.GetLowHalf();
822  c = p.GetHighHalf();
823  d = e = 0;
824 
825  MulAcc(0, 1);
826  MulAcc(1, 0);
827 
828  SaveMulAcc(1, 2, 0);
829  MulAcc(1, 1);
830  MulAcc(0, 2);
831 
832  SaveMulAcc(2, 0, 3);
833  MulAcc(1, 2);
834  MulAcc(2, 1);
835  MulAcc(3, 0);
836 
837  SaveMulAcc(3, 0, 4);
838  MulAcc(1, 3);
839  MulAcc(2, 2);
840  MulAcc(3, 1);
841  MulAcc(4, 0);
842 
843  SaveMulAcc(4, 0, 5);
844  MulAcc(1, 4);
845  MulAcc(2, 3);
846  MulAcc(3, 2);
847  MulAcc(4, 1);
848  MulAcc(5, 0);
849 
850  SaveMulAcc(5, 0, 6);
851  MulAcc(1, 5);
852  MulAcc(2, 4);
853  MulAcc(3, 3);
854  MulAcc(4, 2);
855  MulAcc(5, 1);
856  MulAcc(6, 0);
857 
858  SaveMulAcc(6, 0, 7);
859  MulAcc(1, 6);
860  MulAcc(2, 5);
861  MulAcc(3, 4);
862  MulAcc(4, 3);
863  MulAcc(5, 2);
864  MulAcc(6, 1);
865  MulAcc(7, 0);
866 
867  SaveMulAcc(7, 1, 7);
868  MulAcc(2, 6);
869  MulAcc(3, 5);
870  MulAcc(4, 4);
871  MulAcc(5, 3);
872  MulAcc(6, 2);
873  MulAcc(7, 1);
874 
875  SaveMulAcc(8, 2, 7);
876  MulAcc(3, 6);
877  MulAcc(4, 5);
878  MulAcc(5, 4);
879  MulAcc(6, 3);
880  MulAcc(7, 2);
881 
882  SaveMulAcc(9, 3, 7);
883  MulAcc(4, 6);
884  MulAcc(5, 5);
885  MulAcc(6, 4);
886  MulAcc(7, 3);
887 
888  SaveMulAcc(10, 4, 7);
889  MulAcc(5, 6);
890  MulAcc(6, 5);
891  MulAcc(7, 4);
892 
893  SaveMulAcc(11, 5, 7);
894  MulAcc(6, 6);
895  MulAcc(7, 5);
896 
897  SaveMulAcc(12, 6, 7);
898  MulAcc(7, 6);
899 
900  R[13] = c;
901  p = DWord::MultiplyAndAdd(A[7], B[7], d);
902  R[14] = p.GetLowHalf();
903  R[15] = e + p.GetHighHalf();
904 }
905 
906 void Portable::Multiply4Bottom(word *R, const word *A, const word *B)
907 {
908  DWord p;
909  word c, d, e;
910 
911  p = DWord::Multiply(A[0], B[0]);
912  R[0] = p.GetLowHalf();
913  c = p.GetHighHalf();
914  d = e = 0;
915 
916  MulAcc(0, 1);
917  MulAcc(1, 0);
918 
919  SaveMulAcc(1, 2, 0);
920  MulAcc(1, 1);
921  MulAcc(0, 2);
922 
923  R[2] = c;
924  R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0];
925 }
926 
927 void Portable::Multiply8Bottom(word *R, const word *A, const word *B)
928 {
929  DWord p;
930  word c, d, e;
931 
932  p = DWord::Multiply(A[0], B[0]);
933  R[0] = p.GetLowHalf();
934  c = p.GetHighHalf();
935  d = e = 0;
936 
937  MulAcc(0, 1);
938  MulAcc(1, 0);
939 
940  SaveMulAcc(1, 2, 0);
941  MulAcc(1, 1);
942  MulAcc(0, 2);
943 
944  SaveMulAcc(2, 0, 3);
945  MulAcc(1, 2);
946  MulAcc(2, 1);
947  MulAcc(3, 0);
948 
949  SaveMulAcc(3, 0, 4);
950  MulAcc(1, 3);
951  MulAcc(2, 2);
952  MulAcc(3, 1);
953  MulAcc(4, 0);
954 
955  SaveMulAcc(4, 0, 5);
956  MulAcc(1, 4);
957  MulAcc(2, 3);
958  MulAcc(3, 2);
959  MulAcc(4, 1);
960  MulAcc(5, 0);
961 
962  SaveMulAcc(5, 0, 6);
963  MulAcc(1, 5);
964  MulAcc(2, 4);
965  MulAcc(3, 3);
966  MulAcc(4, 2);
967  MulAcc(5, 1);
968  MulAcc(6, 0);
969 
970  R[6] = c;
971  R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +
972  A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];
973 }
974 
975 
976 #undef MulAcc
977 #undef SaveMulAcc
978 #undef SquAcc
979 #undef SaveSquAcc
980 
981 // optimized
982 
983 #ifdef TAOCRYPT_X86ASM_AVAILABLE
984 
985 // ************** x86 feature detection ***************
986 
987 
988 #ifdef SSE2_INTRINSICS_AVAILABLE
989 
990 #ifndef _MSC_VER
991  static jmp_buf s_env;
992  static void SigIllHandler(int)
993  {
994  longjmp(s_env, 1);
995  }
996 #endif
997 
998 static bool HasSSE2()
999 {
1000  if (!IsPentium())
1001  return false;
1002 
1003  word32 cpuid[4];
1004  CpuId(1, cpuid);
1005  if ((cpuid[3] & (1 << 26)) == 0)
1006  return false;
1007 
1008 #ifdef _MSC_VER
1009  __try
1010  {
1011  __asm xorpd xmm0, xmm0 // executing SSE2 instruction
1012  }
1013  __except (1)
1014  {
1015  return false;
1016  }
1017  return true;
1018 #else
1019  typedef void (*SigHandler)(int);
1020 
1021  SigHandler oldHandler = signal(SIGILL, SigIllHandler);
1022  if (oldHandler == SIG_ERR)
1023  return false;
1024 
1025  bool result = true;
1026  if (setjmp(s_env))
1027  result = false;
1028  else
1029  __asm __volatile ("xorpd %xmm0, %xmm0");
1030 
1031  signal(SIGILL, oldHandler);
1032  return result;
1033 #endif
1034 }
1035 #endif // SSE2_INTRINSICS_AVAILABLE
1036 
1037 
1038 static bool IsP4()
1039 {
1040  if (!IsPentium())
1041  return false;
1042 
1043  word32 cpuid[4];
1044 
1045  CpuId(1, cpuid);
1046  return ((cpuid[0] >> 8) & 0xf) == 0xf;
1047 }
1048 
1049 // ************** Pentium/P4 optimizations ***************
1050 
1051 class PentiumOptimized : public Portable
1052 {
1053 public:
1054  static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B,
1055  unsigned int N);
1056  static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word *B,
1057  unsigned int N);
1058  static void TAOCRYPT_CDECL Multiply4(word *C, const word *A,
1059  const word *B);
1060  static void TAOCRYPT_CDECL Multiply8(word *C, const word *A,
1061  const word *B);
1062  static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A,
1063  const word *B);
1064 };
1065 
1066 class P4Optimized
1067 {
1068 public:
1069  static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B,
1070  unsigned int N);
1071  static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word *B,
1072  unsigned int N);
1073 #ifdef SSE2_INTRINSICS_AVAILABLE
1074  static void TAOCRYPT_CDECL Multiply4(word *C, const word *A,
1075  const word *B);
1076  static void TAOCRYPT_CDECL Multiply8(word *C, const word *A,
1077  const word *B);
1078  static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A,
1079  const word *B);
1080 #endif
1081 };
1082 
1083 typedef word (TAOCRYPT_CDECL * PAddSub)(word *C, const word *A, const word *B,
1084  unsigned int N);
1085 typedef void (TAOCRYPT_CDECL * PMul)(word *C, const word *A, const word *B);
1086 
1087 static PAddSub s_pAdd, s_pSub;
1088 #ifdef SSE2_INTRINSICS_AVAILABLE
1089 static PMul s_pMul4, s_pMul8, s_pMul8B;
1090 #endif
1091 
1092 static void SetPentiumFunctionPointers()
1093 {
1094  if (!IsPentium())
1095  {
1096  s_pAdd = &Portable::Add;
1097  s_pSub = &Portable::Subtract;
1098  }
1099  else if (IsP4())
1100  {
1101  s_pAdd = &P4Optimized::Add;
1102  s_pSub = &P4Optimized::Subtract;
1103  }
1104  else
1105  {
1106  s_pAdd = &PentiumOptimized::Add;
1107  s_pSub = &PentiumOptimized::Subtract;
1108  }
1109 
1110 #ifdef SSE2_INTRINSICS_AVAILABLE
1111  if (!IsPentium())
1112  {
1113  s_pMul4 = &Portable::Multiply4;
1114  s_pMul8 = &Portable::Multiply8;
1115  s_pMul8B = &Portable::Multiply8Bottom;
1116  }
1117  else if (HasSSE2())
1118  {
1119  s_pMul4 = &P4Optimized::Multiply4;
1120  s_pMul8 = &P4Optimized::Multiply8;
1121  s_pMul8B = &P4Optimized::Multiply8Bottom;
1122  }
1123  else
1124  {
1125  s_pMul4 = &PentiumOptimized::Multiply4;
1126  s_pMul8 = &PentiumOptimized::Multiply8;
1127  s_pMul8B = &PentiumOptimized::Multiply8Bottom;
1128  }
1129 #endif
1130 }
1131 
1132 static const char s_RunAtStartupSetPentiumFunctionPointers =
1133  (SetPentiumFunctionPointers(), 0);
1134 
1135 
1136 class LowLevel : public PentiumOptimized
1137 {
1138 public:
1139  inline static word Add(word *C, const word *A, const word *B,
1140  unsigned int N)
1141  {return s_pAdd(C, A, B, N);}
1142  inline static word Subtract(word *C, const word *A, const word *B,
1143  unsigned int N)
1144  {return s_pSub(C, A, B, N);}
1145  inline static void Square4(word *R, const word *A)
1146  {Multiply4(R, A, A);}
1147 #ifdef SSE2_INTRINSICS_AVAILABLE
1148  inline static void Multiply4(word *C, const word *A, const word *B)
1149  {s_pMul4(C, A, B);}
1150  inline static void Multiply8(word *C, const word *A, const word *B)
1151  {s_pMul8(C, A, B);}
1152  inline static void Multiply8Bottom(word *C, const word *A, const word *B)
1153  {s_pMul8B(C, A, B);}
1154 #endif
1155 };
1156 
1157 // use some tricks to share assembly code between MSVC and GCC
1158 #ifdef _MSC_VER
1159  #define TAOCRYPT_NAKED __declspec(naked)
1160  #define AS1(x) __asm x
1161  #define AS2(x, y) __asm x, y
1162  #define AddPrologue \
1163  __asm push ebp \
1164  __asm push ebx \
1165  __asm push esi \
1166  __asm push edi \
1167  __asm mov ecx, [esp+20] \
1168  __asm mov edx, [esp+24] \
1169  __asm mov ebx, [esp+28] \
1170  __asm mov esi, [esp+32]
1171  #define AddEpilogue \
1172  __asm pop edi \
1173  __asm pop esi \
1174  __asm pop ebx \
1175  __asm pop ebp \
1176  __asm ret
1177  #define MulPrologue \
1178  __asm push ebp \
1179  __asm push ebx \
1180  __asm push esi \
1181  __asm push edi \
1182  __asm mov ecx, [esp+28] \
1183  __asm mov esi, [esp+24] \
1184  __asm push [esp+20]
1185  #define MulEpilogue \
1186  __asm add esp, 4 \
1187  __asm pop edi \
1188  __asm pop esi \
1189  __asm pop ebx \
1190  __asm pop ebp \
1191  __asm ret
1192 #else
1193  #define TAOCRYPT_NAKED
1194  #define AS1(x) #x ";"
1195  #define AS2(x, y) #x ", " #y ";"
1196  #define AddPrologue \
1197  word res; \
1198  __asm__ __volatile__ \
1199  ( \
1200  "push %%ebx;" /* save this manually, in case of -fPIC */ \
1201  "mov %3, %%ebx;" \
1202  ".intel_syntax noprefix;" \
1203  "push ebp;"
1204  #define AddEpilogue \
1205  "pop ebp;" \
1206  ".att_syntax prefix;" \
1207  "pop %%ebx;" \
1208  "mov %%eax, %0;" \
1209  : "=g" (res) \
1210  : "c" (C), "d" (A), "m" (B), "S" (N) \
1211  : "%edi", "memory", "cc" \
1212  ); \
1213  return res;
1214 
1215  #define MulPrologue \
1216  __asm__ __volatile__ \
1217  ( \
1218  "push %%ebx;" /* save this manually, in case of -fPIC */ \
1219  "push %%ebp;" \
1220  "push %0;" \
1221  ".intel_syntax noprefix;"
1222  #define MulEpilogue \
1223  "add esp, 4;" \
1224  "pop ebp;" \
1225  "pop ebx;" \
1226  ".att_syntax prefix;" \
1227  : \
1228  : "rm" (Z), "S" (X), "c" (Y) \
1229  : "%eax", "%edx", "%edi", "memory", "cc" \
1230  );
1231 #endif
1232 
1233 TAOCRYPT_NAKED word PentiumOptimized::Add(word *C, const word *A,
1234  const word *B, unsigned int N)
1235 {
1236  AddPrologue
1237 
1238  // now: ebx = B, ecx = C, edx = A, esi = N
1239  AS2( sub ecx, edx) // hold the distance between C & A so we
1240  // can add this to A to get C
1241  AS2( xor eax, eax) // clear eax
1242 
1243  AS2( sub eax, esi) // eax is a negative index from end of B
1244  AS2( lea ebx, [ebx+4*esi]) // ebx is end of B
1245 
1246  AS2( sar eax, 1) // unit of eax is now dwords; this also
1247  // clears the carry flag
1248  AS1( jz loopendAdd) // if no dwords then nothing to do
1249 
1250  AS1(loopstartAdd:)
1251  AS2( mov esi,[edx]) // load lower word of A
1252  AS2( mov ebp,[edx+4]) // load higher word of A
1253 
1254  AS2( mov edi,[ebx+8*eax]) // load lower word of B
1255  AS2( lea edx,[edx+8]) // advance A and C
1256 
1257  AS2( adc esi,edi) // add lower words
1258  AS2( mov edi,[ebx+8*eax+4]) // load higher word of B
1259 
1260  AS2( adc ebp,edi) // add higher words
1261  AS1( inc eax) // advance B
1262 
1263  AS2( mov [edx+ecx-8],esi) // store lower word result
1264  AS2( mov [edx+ecx-4],ebp) // store higher word result
1265 
1266  AS1( jnz loopstartAdd) // loop until eax overflows and becomes zero
1267 
1268  AS1(loopendAdd:)
1269  AS2( adc eax, 0) // store carry into eax (return result register)
1270 
1271  AddEpilogue
1272 }
1273 
1274 TAOCRYPT_NAKED word PentiumOptimized::Subtract(word *C, const word *A,
1275  const word *B, unsigned int N)
1276 {
1277  AddPrologue
1278 
1279  // now: ebx = B, ecx = C, edx = A, esi = N
1280  AS2( sub ecx, edx) // hold the distance between C & A so we
1281  // can add this to A to get C
1282  AS2( xor eax, eax) // clear eax
1283 
1284  AS2( sub eax, esi) // eax is a negative index from end of B
1285  AS2( lea ebx, [ebx+4*esi]) // ebx is end of B
1286 
1287  AS2( sar eax, 1) // unit of eax is now dwords; this also
1288  // clears the carry flag
1289  AS1( jz loopendSub) // if no dwords then nothing to do
1290 
1291  AS1(loopstartSub:)
1292  AS2( mov esi,[edx]) // load lower word of A
1293  AS2( mov ebp,[edx+4]) // load higher word of A
1294 
1295  AS2( mov edi,[ebx+8*eax]) // load lower word of B
1296  AS2( lea edx,[edx+8]) // advance A and C
1297 
1298  AS2( sbb esi,edi) // subtract lower words
1299  AS2( mov edi,[ebx+8*eax+4]) // load higher word of B
1300 
1301  AS2( sbb ebp,edi) // subtract higher words
1302  AS1( inc eax) // advance B
1303 
1304  AS2( mov [edx+ecx-8],esi) // store lower word result
1305  AS2( mov [edx+ecx-4],ebp) // store higher word result
1306 
1307  AS1( jnz loopstartSub) // loop until eax overflows and becomes zero
1308 
1309  AS1(loopendSub:)
1310  AS2( adc eax, 0) // store carry into eax (return result register)
1311 
1312  AddEpilogue
1313 }
1314 
1315 // On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.
1316 
1317 TAOCRYPT_NAKED word P4Optimized::Add(word *C, const word *A, const word *B,
1318  unsigned int N)
1319 {
1320  AddPrologue
1321 
1322  // now: ebx = B, ecx = C, edx = A, esi = N
1323  AS2( xor eax, eax)
1324  AS1( neg esi)
1325  AS1( jz loopendAddP4) // if no dwords then nothing to do
1326 
1327  AS2( mov edi, [edx])
1328  AS2( mov ebp, [ebx])
1329  AS1( jmp carry1AddP4)
1330 
1331  AS1(loopstartAddP4:)
1332  AS2( mov edi, [edx+8])
1333  AS2( add ecx, 8)
1334  AS2( add edx, 8)
1335  AS2( mov ebp, [ebx])
1336  AS2( add edi, eax)
1337  AS1( jc carry1AddP4)
1338  AS2( xor eax, eax)
1339 
1340  AS1(carry1AddP4:)
1341  AS2( add edi, ebp)
1342  AS2( mov ebp, 1)
1343  AS2( mov [ecx], edi)
1344  AS2( mov edi, [edx+4])
1345  AS2( cmovc eax, ebp)
1346  AS2( mov ebp, [ebx+4])
1347  AS2( add ebx, 8)
1348  AS2( add edi, eax)
1349  AS1( jc carry2AddP4)
1350  AS2( xor eax, eax)
1351 
1352  AS1(carry2AddP4:)
1353  AS2( add edi, ebp)
1354  AS2( mov ebp, 1)
1355  AS2( cmovc eax, ebp)
1356  AS2( mov [ecx+4], edi)
1357  AS2( add esi, 2)
1358  AS1( jnz loopstartAddP4)
1359 
1360  AS1(loopendAddP4:)
1361 
1362  AddEpilogue
1363 }
1364 
1365 TAOCRYPT_NAKED word P4Optimized::Subtract(word *C, const word *A,
1366  const word *B, unsigned int N)
1367 {
1368  AddPrologue
1369 
1370  // now: ebx = B, ecx = C, edx = A, esi = N
1371  AS2( xor eax, eax)
1372  AS1( neg esi)
1373  AS1( jz loopendSubP4) // if no dwords then nothing to do
1374 
1375  AS2( mov edi, [edx])
1376  AS2( mov ebp, [ebx])
1377  AS1( jmp carry1SubP4)
1378 
1379  AS1(loopstartSubP4:)
1380  AS2( mov edi, [edx+8])
1381  AS2( add edx, 8)
1382  AS2( add ecx, 8)
1383  AS2( mov ebp, [ebx])
1384  AS2( sub edi, eax)
1385  AS1( jc carry1SubP4)
1386  AS2( xor eax, eax)
1387 
1388  AS1(carry1SubP4:)
1389  AS2( sub edi, ebp)
1390  AS2( mov ebp, 1)
1391  AS2( mov [ecx], edi)
1392  AS2( mov edi, [edx+4])
1393  AS2( cmovc eax, ebp)
1394  AS2( mov ebp, [ebx+4])
1395  AS2( add ebx, 8)
1396  AS2( sub edi, eax)
1397  AS1( jc carry2SubP4)
1398  AS2( xor eax, eax)
1399 
1400  AS1(carry2SubP4:)
1401  AS2( sub edi, ebp)
1402  AS2( mov ebp, 1)
1403  AS2( cmovc eax, ebp)
1404  AS2( mov [ecx+4], edi)
1405  AS2( add esi, 2)
1406  AS1( jnz loopstartSubP4)
1407 
1408  AS1(loopendSubP4:)
1409 
1410  AddEpilogue
1411 }
1412 
1413 // multiply assembly code originally contributed by Leonard Janke
1414 
1415 #define MulStartup \
1416  AS2(xor ebp, ebp) \
1417  AS2(xor edi, edi) \
1418  AS2(xor ebx, ebx)
1419 
1420 #define MulShiftCarry \
1421  AS2(mov ebp, edx) \
1422  AS2(mov edi, ebx) \
1423  AS2(xor ebx, ebx)
1424 
1425 #define MulAccumulateBottom(i,j) \
1426  AS2(mov eax, [ecx+4*j]) \
1427  AS2(imul eax, dword ptr [esi+4*i]) \
1428  AS2(add ebp, eax)
1429 
1430 #define MulAccumulate(i,j) \
1431  AS2(mov eax, [ecx+4*j]) \
1432  AS1(mul dword ptr [esi+4*i]) \
1433  AS2(add ebp, eax) \
1434  AS2(adc edi, edx) \
1435  AS2(adc bl, bh)
1436 
1437 #define MulStoreDigit(i) \
1438  AS2(mov edx, edi) \
1439  AS2(mov edi, [esp]) \
1440  AS2(mov [edi+4*i], ebp)
1441 
1442 #define MulLastDiagonal(digits) \
1443  AS2(mov eax, [ecx+4*(digits-1)]) \
1444  AS1(mul dword ptr [esi+4*(digits-1)]) \
1445  AS2(add ebp, eax) \
1446  AS2(adc edx, edi) \
1447  AS2(mov edi, [esp]) \
1448  AS2(mov [edi+4*(2*digits-2)], ebp) \
1449  AS2(mov [edi+4*(2*digits-1)], edx)
1450 
1451 TAOCRYPT_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X,
1452  const word* Y)
1453 {
1454  MulPrologue
1455  // now: [esp] = Z, esi = X, ecx = Y
1456  MulStartup
1457  MulAccumulate(0,0)
1458  MulStoreDigit(0)
1459  MulShiftCarry
1460 
1461  MulAccumulate(1,0)
1462  MulAccumulate(0,1)
1463  MulStoreDigit(1)
1464  MulShiftCarry
1465 
1466  MulAccumulate(2,0)
1467  MulAccumulate(1,1)
1468  MulAccumulate(0,2)
1469  MulStoreDigit(2)
1470  MulShiftCarry
1471 
1472  MulAccumulate(3,0)
1473  MulAccumulate(2,1)
1474  MulAccumulate(1,2)
1475  MulAccumulate(0,3)
1476  MulStoreDigit(3)
1477  MulShiftCarry
1478 
1479  MulAccumulate(3,1)
1480  MulAccumulate(2,2)
1481  MulAccumulate(1,3)
1482  MulStoreDigit(4)
1483  MulShiftCarry
1484 
1485  MulAccumulate(3,2)
1486  MulAccumulate(2,3)
1487  MulStoreDigit(5)
1488  MulShiftCarry
1489 
1490  MulLastDiagonal(4)
1491  MulEpilogue
1492 }
1493 
1494 TAOCRYPT_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X,
1495  const word* Y)
1496 {
1497  MulPrologue
1498  // now: [esp] = Z, esi = X, ecx = Y
1499  MulStartup
1500  MulAccumulate(0,0)
1501  MulStoreDigit(0)
1502  MulShiftCarry
1503 
1504  MulAccumulate(1,0)
1505  MulAccumulate(0,1)
1506  MulStoreDigit(1)
1507  MulShiftCarry
1508 
1509  MulAccumulate(2,0)
1510  MulAccumulate(1,1)
1511  MulAccumulate(0,2)
1512  MulStoreDigit(2)
1513  MulShiftCarry
1514 
1515  MulAccumulate(3,0)
1516  MulAccumulate(2,1)
1517  MulAccumulate(1,2)
1518  MulAccumulate(0,3)
1519  MulStoreDigit(3)
1520  MulShiftCarry
1521 
1522  MulAccumulate(4,0)
1523  MulAccumulate(3,1)
1524  MulAccumulate(2,2)
1525  MulAccumulate(1,3)
1526  MulAccumulate(0,4)
1527  MulStoreDigit(4)
1528  MulShiftCarry
1529 
1530  MulAccumulate(5,0)
1531  MulAccumulate(4,1)
1532  MulAccumulate(3,2)
1533  MulAccumulate(2,3)
1534  MulAccumulate(1,4)
1535  MulAccumulate(0,5)
1536  MulStoreDigit(5)
1537  MulShiftCarry
1538 
1539  MulAccumulate(6,0)
1540  MulAccumulate(5,1)
1541  MulAccumulate(4,2)
1542  MulAccumulate(3,3)
1543  MulAccumulate(2,4)
1544  MulAccumulate(1,5)
1545  MulAccumulate(0,6)
1546  MulStoreDigit(6)
1547  MulShiftCarry
1548 
1549  MulAccumulate(7,0)
1550  MulAccumulate(6,1)
1551  MulAccumulate(5,2)
1552  MulAccumulate(4,3)
1553  MulAccumulate(3,4)
1554  MulAccumulate(2,5)
1555  MulAccumulate(1,6)
1556  MulAccumulate(0,7)
1557  MulStoreDigit(7)
1558  MulShiftCarry
1559 
1560  MulAccumulate(7,1)
1561  MulAccumulate(6,2)
1562  MulAccumulate(5,3)
1563  MulAccumulate(4,4)
1564  MulAccumulate(3,5)
1565  MulAccumulate(2,6)
1566  MulAccumulate(1,7)
1567  MulStoreDigit(8)
1568  MulShiftCarry
1569 
1570  MulAccumulate(7,2)
1571  MulAccumulate(6,3)
1572  MulAccumulate(5,4)
1573  MulAccumulate(4,5)
1574  MulAccumulate(3,6)
1575  MulAccumulate(2,7)
1576  MulStoreDigit(9)
1577  MulShiftCarry
1578 
1579  MulAccumulate(7,3)
1580  MulAccumulate(6,4)
1581  MulAccumulate(5,5)
1582  MulAccumulate(4,6)
1583  MulAccumulate(3,7)
1584  MulStoreDigit(10)
1585  MulShiftCarry
1586 
1587  MulAccumulate(7,4)
1588  MulAccumulate(6,5)
1589  MulAccumulate(5,6)
1590  MulAccumulate(4,7)
1591  MulStoreDigit(11)
1592  MulShiftCarry
1593 
1594  MulAccumulate(7,5)
1595  MulAccumulate(6,6)
1596  MulAccumulate(5,7)
1597  MulStoreDigit(12)
1598  MulShiftCarry
1599 
1600  MulAccumulate(7,6)
1601  MulAccumulate(6,7)
1602  MulStoreDigit(13)
1603  MulShiftCarry
1604 
1605  MulLastDiagonal(8)
1606  MulEpilogue
1607 }
1608 
1609 TAOCRYPT_NAKED void PentiumOptimized::Multiply8Bottom(word* Z, const word* X,
1610  const word* Y)
1611 {
1612  MulPrologue
1613  // now: [esp] = Z, esi = X, ecx = Y
1614  MulStartup
1615  MulAccumulate(0,0)
1616  MulStoreDigit(0)
1617  MulShiftCarry
1618 
1619  MulAccumulate(1,0)
1620  MulAccumulate(0,1)
1621  MulStoreDigit(1)
1622  MulShiftCarry
1623 
1624  MulAccumulate(2,0)
1625  MulAccumulate(1,1)
1626  MulAccumulate(0,2)
1627  MulStoreDigit(2)
1628  MulShiftCarry
1629 
1630  MulAccumulate(3,0)
1631  MulAccumulate(2,1)
1632  MulAccumulate(1,2)
1633  MulAccumulate(0,3)
1634  MulStoreDigit(3)
1635  MulShiftCarry
1636 
1637  MulAccumulate(4,0)
1638  MulAccumulate(3,1)
1639  MulAccumulate(2,2)
1640  MulAccumulate(1,3)
1641  MulAccumulate(0,4)
1642  MulStoreDigit(4)
1643  MulShiftCarry
1644 
1645  MulAccumulate(5,0)
1646  MulAccumulate(4,1)
1647  MulAccumulate(3,2)
1648  MulAccumulate(2,3)
1649  MulAccumulate(1,4)
1650  MulAccumulate(0,5)
1651  MulStoreDigit(5)
1652  MulShiftCarry
1653 
1654  MulAccumulate(6,0)
1655  MulAccumulate(5,1)
1656  MulAccumulate(4,2)
1657  MulAccumulate(3,3)
1658  MulAccumulate(2,4)
1659  MulAccumulate(1,5)
1660  MulAccumulate(0,6)
1661  MulStoreDigit(6)
1662  MulShiftCarry
1663 
1664  MulAccumulateBottom(7,0)
1665  MulAccumulateBottom(6,1)
1666  MulAccumulateBottom(5,2)
1667  MulAccumulateBottom(4,3)
1668  MulAccumulateBottom(3,4)
1669  MulAccumulateBottom(2,5)
1670  MulAccumulateBottom(1,6)
1671  MulAccumulateBottom(0,7)
1672  MulStoreDigit(7)
1673  MulEpilogue
1674 }
1675 
1676 #undef AS1
1677 #undef AS2
1678 
1679 #else // not x86 - no processor specific code at this layer
1680 
1681 typedef Portable LowLevel;
1682 
1683 #endif
1684 
1685 #ifdef SSE2_INTRINSICS_AVAILABLE
1686 
1687 #ifdef __GNUC__
1688 #define TAOCRYPT_FASTCALL
1689 #else
1690 #define TAOCRYPT_FASTCALL __fastcall
1691 #endif
1692 
1693 static void TAOCRYPT_FASTCALL P4_Mul(__m128i *C, const __m128i *A,
1694  const __m128i *B)
1695 {
1696  __m128i a3210 = _mm_load_si128(A);
1697  __m128i b3210 = _mm_load_si128(B);
1698 
1699  __m128i sum;
1700 
1701  __m128i z = _mm_setzero_si128();
1702  __m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210);
1703  C[0] = a2b2_a0b0;
1704 
1705  __m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0));
1706  __m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1));
1707  __m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021);
1708  __m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z);
1709  __m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z);
1710  C[1] = _mm_add_epi64(a1b0, a0b1);
1711 
1712  __m128i a31 = _mm_srli_epi64(a3210, 32);
1713  __m128i b31 = _mm_srli_epi64(b3210, 32);
1714  __m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31);
1715  C[6] = a3b3_a1b1;
1716 
1717  __m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z);
1718  __m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2));
1719  __m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012);
1720  __m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z);
1721  __m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z);
1722  sum = _mm_add_epi64(a1b1, a0b2);
1723  C[2] = _mm_add_epi64(sum, a2b0);
1724 
1725  __m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1));
1726  __m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3));
1727  __m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012);
1728  __m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103);
1729  __m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z);
1730  __m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z);
1731  __m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z);
1732  __m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z);
1733  __m128i sum1 = _mm_add_epi64(a3b0, a1b2);
1734  sum = _mm_add_epi64(a2b1, a0b3);
1735  C[3] = _mm_add_epi64(sum, sum1);
1736 
1737  __m128i a3b1_a1b3 = _mm_mul_epu32(a2301, b2103);
1738  __m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z);
1739  __m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z);
1740  __m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z);
1741  sum = _mm_add_epi64(a2b2, a3b1);
1742  C[4] = _mm_add_epi64(sum, a1b3);
1743 
1744  __m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2));
1745  __m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3));
1746  __m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203);
1747  __m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z);
1748  __m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z);
1749  C[5] = _mm_add_epi64(a3b2, a2b3);
1750 }
1751 
1752 void P4Optimized::Multiply4(word *C, const word *A, const word *B)
1753 {
1754  __m128i temp[7];
1755  const word *w = (word *)temp;
1756  const __m64 *mw = (__m64 *)w;
1757 
1758  P4_Mul(temp, (__m128i *)A, (__m128i *)B);
1759 
1760  C[0] = w[0];
1761 
1762  __m64 s1, s2;
1763 
1764  __m64 w1 = _mm_cvtsi32_si64(w[1]);
1765  __m64 w4 = mw[2];
1766  __m64 w6 = mw[3];
1767  __m64 w8 = mw[4];
1768  __m64 w10 = mw[5];
1769  __m64 w12 = mw[6];
1770  __m64 w14 = mw[7];
1771  __m64 w16 = mw[8];
1772  __m64 w18 = mw[9];
1773  __m64 w20 = mw[10];
1774  __m64 w22 = mw[11];
1775  __m64 w26 = _mm_cvtsi32_si64(w[26]);
1776 
1777  s1 = _mm_add_si64(w1, w4);
1778  C[1] = _mm_cvtsi64_si32(s1);
1779  s1 = _mm_srli_si64(s1, 32);
1780 
1781  s2 = _mm_add_si64(w6, w8);
1782  s1 = _mm_add_si64(s1, s2);
1783  C[2] = _mm_cvtsi64_si32(s1);
1784  s1 = _mm_srli_si64(s1, 32);
1785 
1786  s2 = _mm_add_si64(w10, w12);
1787  s1 = _mm_add_si64(s1, s2);
1788  C[3] = _mm_cvtsi64_si32(s1);
1789  s1 = _mm_srli_si64(s1, 32);
1790 
1791  s2 = _mm_add_si64(w14, w16);
1792  s1 = _mm_add_si64(s1, s2);
1793  C[4] = _mm_cvtsi64_si32(s1);
1794  s1 = _mm_srli_si64(s1, 32);
1795 
1796  s2 = _mm_add_si64(w18, w20);
1797  s1 = _mm_add_si64(s1, s2);
1798  C[5] = _mm_cvtsi64_si32(s1);
1799  s1 = _mm_srli_si64(s1, 32);
1800 
1801  s2 = _mm_add_si64(w22, w26);
1802  s1 = _mm_add_si64(s1, s2);
1803  C[6] = _mm_cvtsi64_si32(s1);
1804  s1 = _mm_srli_si64(s1, 32);
1805 
1806  C[7] = _mm_cvtsi64_si32(s1) + w[27];
1807  _mm_empty();
1808 }
1809 
1810 void P4Optimized::Multiply8(word *C, const word *A, const word *B)
1811 {
1812  __m128i temp[28];
1813  const word *w = (word *)temp;
1814  const __m64 *mw = (__m64 *)w;
1815  const word *x = (word *)temp+7*4;
1816  const __m64 *mx = (__m64 *)x;
1817  const word *y = (word *)temp+7*4*2;
1818  const __m64 *my = (__m64 *)y;
1819  const word *z = (word *)temp+7*4*3;
1820  const __m64 *mz = (__m64 *)z;
1821 
1822  P4_Mul(temp, (__m128i *)A, (__m128i *)B);
1823 
1824  P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
1825 
1826  P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
1827 
1828  P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1);
1829 
1830  C[0] = w[0];
1831 
1832  __m64 s1, s2, s3, s4;
1833 
1834  __m64 w1 = _mm_cvtsi32_si64(w[1]);
1835  __m64 w4 = mw[2];
1836  __m64 w6 = mw[3];
1837  __m64 w8 = mw[4];
1838  __m64 w10 = mw[5];
1839  __m64 w12 = mw[6];
1840  __m64 w14 = mw[7];
1841  __m64 w16 = mw[8];
1842  __m64 w18 = mw[9];
1843  __m64 w20 = mw[10];
1844  __m64 w22 = mw[11];
1845  __m64 w26 = _mm_cvtsi32_si64(w[26]);
1846  __m64 w27 = _mm_cvtsi32_si64(w[27]);
1847 
1848  __m64 x0 = _mm_cvtsi32_si64(x[0]);
1849  __m64 x1 = _mm_cvtsi32_si64(x[1]);
1850  __m64 x4 = mx[2];
1851  __m64 x6 = mx[3];
1852  __m64 x8 = mx[4];
1853  __m64 x10 = mx[5];
1854  __m64 x12 = mx[6];
1855  __m64 x14 = mx[7];
1856  __m64 x16 = mx[8];
1857  __m64 x18 = mx[9];
1858  __m64 x20 = mx[10];
1859  __m64 x22 = mx[11];
1860  __m64 x26 = _mm_cvtsi32_si64(x[26]);
1861  __m64 x27 = _mm_cvtsi32_si64(x[27]);
1862 
1863  __m64 y0 = _mm_cvtsi32_si64(y[0]);
1864  __m64 y1 = _mm_cvtsi32_si64(y[1]);
1865  __m64 y4 = my[2];
1866  __m64 y6 = my[3];
1867  __m64 y8 = my[4];
1868  __m64 y10 = my[5];
1869  __m64 y12 = my[6];
1870  __m64 y14 = my[7];
1871  __m64 y16 = my[8];
1872  __m64 y18 = my[9];
1873  __m64 y20 = my[10];
1874  __m64 y22 = my[11];
1875  __m64 y26 = _mm_cvtsi32_si64(y[26]);
1876  __m64 y27 = _mm_cvtsi32_si64(y[27]);
1877 
1878  __m64 z0 = _mm_cvtsi32_si64(z[0]);
1879  __m64 z1 = _mm_cvtsi32_si64(z[1]);
1880  __m64 z4 = mz[2];
1881  __m64 z6 = mz[3];
1882  __m64 z8 = mz[4];
1883  __m64 z10 = mz[5];
1884  __m64 z12 = mz[6];
1885  __m64 z14 = mz[7];
1886  __m64 z16 = mz[8];
1887  __m64 z18 = mz[9];
1888  __m64 z20 = mz[10];
1889  __m64 z22 = mz[11];
1890  __m64 z26 = _mm_cvtsi32_si64(z[26]);
1891 
1892  s1 = _mm_add_si64(w1, w4);
1893  C[1] = _mm_cvtsi64_si32(s1);
1894  s1 = _mm_srli_si64(s1, 32);
1895 
1896  s2 = _mm_add_si64(w6, w8);
1897  s1 = _mm_add_si64(s1, s2);
1898  C[2] = _mm_cvtsi64_si32(s1);
1899  s1 = _mm_srli_si64(s1, 32);
1900 
1901  s2 = _mm_add_si64(w10, w12);
1902  s1 = _mm_add_si64(s1, s2);
1903  C[3] = _mm_cvtsi64_si32(s1);
1904  s1 = _mm_srli_si64(s1, 32);
1905 
1906  s3 = _mm_add_si64(x0, y0);
1907  s2 = _mm_add_si64(w14, w16);
1908  s1 = _mm_add_si64(s1, s3);
1909  s1 = _mm_add_si64(s1, s2);
1910  C[4] = _mm_cvtsi64_si32(s1);
1911  s1 = _mm_srli_si64(s1, 32);
1912 
1913  s3 = _mm_add_si64(x1, y1);
1914  s4 = _mm_add_si64(x4, y4);
1915  s1 = _mm_add_si64(s1, w18);
1916  s3 = _mm_add_si64(s3, s4);
1917  s1 = _mm_add_si64(s1, w20);
1918  s1 = _mm_add_si64(s1, s3);
1919  C[5] = _mm_cvtsi64_si32(s1);
1920  s1 = _mm_srli_si64(s1, 32);
1921 
1922  s3 = _mm_add_si64(x6, y6);
1923  s4 = _mm_add_si64(x8, y8);
1924  s1 = _mm_add_si64(s1, w22);
1925  s3 = _mm_add_si64(s3, s4);
1926  s1 = _mm_add_si64(s1, w26);
1927  s1 = _mm_add_si64(s1, s3);
1928  C[6] = _mm_cvtsi64_si32(s1);
1929  s1 = _mm_srli_si64(s1, 32);
1930 
1931  s3 = _mm_add_si64(x10, y10);
1932  s4 = _mm_add_si64(x12, y12);
1933  s1 = _mm_add_si64(s1, w27);
1934  s3 = _mm_add_si64(s3, s4);
1935  s1 = _mm_add_si64(s1, s3);
1936  C[7] = _mm_cvtsi64_si32(s1);
1937  s1 = _mm_srli_si64(s1, 32);
1938 
1939  s3 = _mm_add_si64(x14, y14);
1940  s4 = _mm_add_si64(x16, y16);
1941  s1 = _mm_add_si64(s1, z0);
1942  s3 = _mm_add_si64(s3, s4);
1943  s1 = _mm_add_si64(s1, s3);
1944  C[8] = _mm_cvtsi64_si32(s1);
1945  s1 = _mm_srli_si64(s1, 32);
1946 
1947  s3 = _mm_add_si64(x18, y18);
1948  s4 = _mm_add_si64(x20, y20);
1949  s1 = _mm_add_si64(s1, z1);
1950  s3 = _mm_add_si64(s3, s4);
1951  s1 = _mm_add_si64(s1, z4);
1952  s1 = _mm_add_si64(s1, s3);
1953  C[9] = _mm_cvtsi64_si32(s1);
1954  s1 = _mm_srli_si64(s1, 32);
1955 
1956  s3 = _mm_add_si64(x22, y22);
1957  s4 = _mm_add_si64(x26, y26);
1958  s1 = _mm_add_si64(s1, z6);
1959  s3 = _mm_add_si64(s3, s4);
1960  s1 = _mm_add_si64(s1, z8);
1961  s1 = _mm_add_si64(s1, s3);
1962  C[10] = _mm_cvtsi64_si32(s1);
1963  s1 = _mm_srli_si64(s1, 32);
1964 
1965  s3 = _mm_add_si64(x27, y27);
1966  s1 = _mm_add_si64(s1, z10);
1967  s1 = _mm_add_si64(s1, z12);
1968  s1 = _mm_add_si64(s1, s3);
1969  C[11] = _mm_cvtsi64_si32(s1);
1970  s1 = _mm_srli_si64(s1, 32);
1971 
1972  s3 = _mm_add_si64(z14, z16);
1973  s1 = _mm_add_si64(s1, s3);
1974  C[12] = _mm_cvtsi64_si32(s1);
1975  s1 = _mm_srli_si64(s1, 32);
1976 
1977  s3 = _mm_add_si64(z18, z20);
1978  s1 = _mm_add_si64(s1, s3);
1979  C[13] = _mm_cvtsi64_si32(s1);
1980  s1 = _mm_srli_si64(s1, 32);
1981 
1982  s3 = _mm_add_si64(z22, z26);
1983  s1 = _mm_add_si64(s1, s3);
1984  C[14] = _mm_cvtsi64_si32(s1);
1985  s1 = _mm_srli_si64(s1, 32);
1986 
1987  C[15] = z[27] + _mm_cvtsi64_si32(s1);
1988  _mm_empty();
1989 }
1990 
1991 void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
1992 {
1993  __m128i temp[21];
1994  const word *w = (word *)temp;
1995  const __m64 *mw = (__m64 *)w;
1996  const word *x = (word *)temp+7*4;
1997  const __m64 *mx = (__m64 *)x;
1998  const word *y = (word *)temp+7*4*2;
1999  const __m64 *my = (__m64 *)y;
2000 
2001  P4_Mul(temp, (__m128i *)A, (__m128i *)B);
2002 
2003  P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
2004 
2005  P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
2006 
2007  C[0] = w[0];
2008 
2009  __m64 s1, s2, s3, s4;
2010 
2011  __m64 w1 = _mm_cvtsi32_si64(w[1]);
2012  __m64 w4 = mw[2];
2013  __m64 w6 = mw[3];
2014  __m64 w8 = mw[4];
2015  __m64 w10 = mw[5];
2016  __m64 w12 = mw[6];
2017  __m64 w14 = mw[7];
2018  __m64 w16 = mw[8];
2019  __m64 w18 = mw[9];
2020  __m64 w20 = mw[10];
2021  __m64 w22 = mw[11];
2022  __m64 w26 = _mm_cvtsi32_si64(w[26]);
2023 
2024  __m64 x0 = _mm_cvtsi32_si64(x[0]);
2025  __m64 x1 = _mm_cvtsi32_si64(x[1]);
2026  __m64 x4 = mx[2];
2027  __m64 x6 = mx[3];
2028  __m64 x8 = mx[4];
2029 
2030  __m64 y0 = _mm_cvtsi32_si64(y[0]);
2031  __m64 y1 = _mm_cvtsi32_si64(y[1]);
2032  __m64 y4 = my[2];
2033  __m64 y6 = my[3];
2034  __m64 y8 = my[4];
2035 
2036  s1 = _mm_add_si64(w1, w4);
2037  C[1] = _mm_cvtsi64_si32(s1);
2038  s1 = _mm_srli_si64(s1, 32);
2039 
2040  s2 = _mm_add_si64(w6, w8);
2041  s1 = _mm_add_si64(s1, s2);
2042  C[2] = _mm_cvtsi64_si32(s1);
2043  s1 = _mm_srli_si64(s1, 32);
2044 
2045  s2 = _mm_add_si64(w10, w12);
2046  s1 = _mm_add_si64(s1, s2);
2047  C[3] = _mm_cvtsi64_si32(s1);
2048  s1 = _mm_srli_si64(s1, 32);
2049 
2050  s3 = _mm_add_si64(x0, y0);
2051  s2 = _mm_add_si64(w14, w16);
2052  s1 = _mm_add_si64(s1, s3);
2053  s1 = _mm_add_si64(s1, s2);
2054  C[4] = _mm_cvtsi64_si32(s1);
2055  s1 = _mm_srli_si64(s1, 32);
2056 
2057  s3 = _mm_add_si64(x1, y1);
2058  s4 = _mm_add_si64(x4, y4);
2059  s1 = _mm_add_si64(s1, w18);
2060  s3 = _mm_add_si64(s3, s4);
2061  s1 = _mm_add_si64(s1, w20);
2062  s1 = _mm_add_si64(s1, s3);
2063  C[5] = _mm_cvtsi64_si32(s1);
2064  s1 = _mm_srli_si64(s1, 32);
2065 
2066  s3 = _mm_add_si64(x6, y6);
2067  s4 = _mm_add_si64(x8, y8);
2068  s1 = _mm_add_si64(s1, w22);
2069  s3 = _mm_add_si64(s3, s4);
2070  s1 = _mm_add_si64(s1, w26);
2071  s1 = _mm_add_si64(s1, s3);
2072  C[6] = _mm_cvtsi64_si32(s1);
2073  s1 = _mm_srli_si64(s1, 32);
2074 
2075  C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
2076  _mm_empty();
2077 }
2078 
2079 #endif // #ifdef SSE2_INTRINSICS_AVAILABLE
2080 
2081 // end optimized
2082 
2083 // ********************************************************
2084 
2085 #define A0 A
2086 #define A1 (A+N2)
2087 #define B0 B
2088 #define B1 (B+N2)
2089 
2090 #define T0 T
2091 #define T1 (T+N2)
2092 #define T2 (T+N)
2093 #define T3 (T+N+N2)
2094 
2095 #define R0 R
2096 #define R1 (R+N2)
2097 #define R2 (R+N)
2098 #define R3 (R+N+N2)
2099 
2100 //VC60 workaround: compiler bug triggered without the extra dummy parameters
2101 
2102 // R[2*N] - result = A*B
2103 // T[2*N] - temporary work space
2104 // A[N] --- multiplier
2105 // B[N] --- multiplicant
2106 
2107 
2108 void RecursiveMultiply(word *R, word *T, const word *A, const word *B,
2109  unsigned int N)
2110 {
2111  if (LowLevel::MultiplyRecursionLimit() >= 8 && N==8)
2112  LowLevel::Multiply8(R, A, B);
2113  else if (LowLevel::MultiplyRecursionLimit() >= 4 && N==4)
2114  LowLevel::Multiply4(R, A, B);
2115  else if (N==2)
2116  LowLevel::Multiply2(R, A, B);
2117  else
2118  {
2119  const unsigned int N2 = N/2;
2120  int carry;
2121 
2122  int aComp = Compare(A0, A1, N2);
2123  int bComp = Compare(B0, B1, N2);
2124 
2125  switch (2*aComp + aComp + bComp)
2126  {
2127  case -4:
2128  LowLevel::Subtract(R0, A1, A0, N2);
2129  LowLevel::Subtract(R1, B0, B1, N2);
2130  RecursiveMultiply(T0, T2, R0, R1, N2);
2131  LowLevel::Subtract(T1, T1, R0, N2);
2132  carry = -1;
2133  break;
2134  case -2:
2135  LowLevel::Subtract(R0, A1, A0, N2);
2136  LowLevel::Subtract(R1, B0, B1, N2);
2137  RecursiveMultiply(T0, T2, R0, R1, N2);
2138  carry = 0;
2139  break;
2140  case 2:
2141  LowLevel::Subtract(R0, A0, A1, N2);
2142  LowLevel::Subtract(R1, B1, B0, N2);
2143  RecursiveMultiply(T0, T2, R0, R1, N2);
2144  carry = 0;
2145  break;
2146  case 4:
2147  LowLevel::Subtract(R0, A1, A0, N2);
2148  LowLevel::Subtract(R1, B0, B1, N2);
2149  RecursiveMultiply(T0, T2, R0, R1, N2);
2150  LowLevel::Subtract(T1, T1, R1, N2);
2151  carry = -1;
2152  break;
2153  default:
2154  SetWords(T0, 0, N);
2155  carry = 0;
2156  }
2157 
2158  RecursiveMultiply(R0, T2, A0, B0, N2);
2159  RecursiveMultiply(R2, T2, A1, B1, N2);
2160 
2161  // now T[01] holds (A1-A0)*(B0-B1),R[01] holds A0*B0, R[23] holds A1*B1
2162 
2163  carry += LowLevel::Add(T0, T0, R0, N);
2164  carry += LowLevel::Add(T0, T0, R2, N);
2165  carry += LowLevel::Add(R1, R1, T0, N);
2166 
2167  Increment(R3, N2, carry);
2168  }
2169 }
2170 
2171 
2172 void RecursiveSquare(word *R, word *T, const word *A, unsigned int N)
2173 {
2174  if (LowLevel::SquareRecursionLimit() >= 4 && N==4)
2175  LowLevel::Square4(R, A);
2176  else if (N==2)
2177  LowLevel::Square2(R, A);
2178  else
2179  {
2180  const unsigned int N2 = N/2;
2181 
2182  RecursiveSquare(R0, T2, A0, N2);
2183  RecursiveSquare(R2, T2, A1, N2);
2184  RecursiveMultiply(T0, T2, A0, A1, N2);
2185 
2186  word carry = LowLevel::Add(R1, R1, T0, N);
2187  carry += LowLevel::Add(R1, R1, T0, N);
2188  Increment(R3, N2, carry);
2189  }
2190 }
2191 
2192 
2193 // R[N] - bottom half of A*B
2194 // T[N] - temporary work space
2195 // A[N] - multiplier
2196 // B[N] - multiplicant
2197 
2198 
2199 void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B,
2200  unsigned int N)
2201 {
2202  if (LowLevel::MultiplyBottomRecursionLimit() >= 8 && N==8)
2203  LowLevel::Multiply8Bottom(R, A, B);
2204  else if (LowLevel::MultiplyBottomRecursionLimit() >= 4 && N==4)
2205  LowLevel::Multiply4Bottom(R, A, B);
2206  else if (N==2)
2207  LowLevel::Multiply2Bottom(R, A, B);
2208  else
2209  {
2210  const unsigned int N2 = N/2;
2211 
2212  RecursiveMultiply(R, T, A0, B0, N2);
2213  RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
2214  LowLevel::Add(R1, R1, T0, N2);
2215  RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
2216  LowLevel::Add(R1, R1, T0, N2);
2217  }
2218 }
2219 
2220 
2221 void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A,
2222  const word *B, unsigned int N)
2223 {
2224  if (N==4)
2225  {
2226  LowLevel::Multiply4(T, A, B);
2227  memcpy(R, T+4, 4*WORD_SIZE);
2228  }
2229  else if (N==2)
2230  {
2231  LowLevel::Multiply2(T, A, B);
2232  memcpy(R, T+2, 2*WORD_SIZE);
2233  }
2234  else
2235  {
2236  const unsigned int N2 = N/2;
2237  int carry;
2238 
2239  int aComp = Compare(A0, A1, N2);
2240  int bComp = Compare(B0, B1, N2);
2241 
2242  switch (2*aComp + aComp + bComp)
2243  {
2244  case -4:
2245  LowLevel::Subtract(R0, A1, A0, N2);
2246  LowLevel::Subtract(R1, B0, B1, N2);
2247  RecursiveMultiply(T0, T2, R0, R1, N2);
2248  LowLevel::Subtract(T1, T1, R0, N2);
2249  carry = -1;
2250  break;
2251  case -2:
2252  LowLevel::Subtract(R0, A1, A0, N2);
2253  LowLevel::Subtract(R1, B0, B1, N2);
2254  RecursiveMultiply(T0, T2, R0, R1, N2);
2255  carry = 0;
2256  break;
2257  case 2:
2258  LowLevel::Subtract(R0, A0, A1, N2);
2259  LowLevel::Subtract(R1, B1, B0, N2);
2260  RecursiveMultiply(T0, T2, R0, R1, N2);
2261  carry = 0;
2262  break;
2263  case 4:
2264  LowLevel::Subtract(R0, A1, A0, N2);
2265  LowLevel::Subtract(R1, B0, B1, N2);
2266  RecursiveMultiply(T0, T2, R0, R1, N2);
2267  LowLevel::Subtract(T1, T1, R1, N2);
2268  carry = -1;
2269  break;
2270  default:
2271  SetWords(T0, 0, N);
2272  carry = 0;
2273  }
2274 
2275  RecursiveMultiply(T2, R0, A1, B1, N2);
2276 
2277  // now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1
2278 
2279  word c2 = LowLevel::Subtract(R0, L+N2, L, N2);
2280  c2 += LowLevel::Subtract(R0, R0, T0, N2);
2281  word t = (Compare(R0, T2, N2) == -1);
2282 
2283  carry += t;
2284  carry += Increment(R0, N2, c2+t);
2285  carry += LowLevel::Add(R0, R0, T1, N2);
2286  carry += LowLevel::Add(R0, R0, T3, N2);
2287 
2288  CopyWords(R1, T3, N2);
2289  Increment(R1, N2, carry);
2290  }
2291 }
2292 
2293 
2294 inline word Add(word *C, const word *A, const word *B, unsigned int N)
2295 {
2296  return LowLevel::Add(C, A, B, N);
2297 }
2298 
2299 inline word Subtract(word *C, const word *A, const word *B, unsigned int N)
2300 {
2301  return LowLevel::Subtract(C, A, B, N);
2302 }
2303 
2304 inline void Multiply(word *R, word *T, const word *A, const word *B,
2305  unsigned int N)
2306 {
2307  RecursiveMultiply(R, T, A, B, N);
2308 }
2309 
2310 inline void Square(word *R, word *T, const word *A, unsigned int N)
2311 {
2312  RecursiveSquare(R, T, A, N);
2313 }
2314 
2315 
2316 void AsymmetricMultiply(word *R, word *T, const word *A, unsigned int NA,
2317  const word *B, unsigned int NB)
2318 {
2319  if (NA == NB)
2320  {
2321  if (A == B)
2322  Square(R, T, A, NA);
2323  else
2324  Multiply(R, T, A, B, NA);
2325 
2326  return;
2327  }
2328 
2329  if (NA > NB)
2330  {
2331  STL::swap(A, B);
2332  STL::swap(NA, NB);
2333  }
2334 
2335  if (NA==2 && !A[1])
2336  {
2337  switch (A[0])
2338  {
2339  case 0:
2340  SetWords(R, 0, NB+2);
2341  return;
2342  case 1:
2343  CopyWords(R, B, NB);
2344  R[NB] = R[NB+1] = 0;
2345  return;
2346  default:
2347  R[NB] = LinearMultiply(R, B, A[0], NB);
2348  R[NB+1] = 0;
2349  return;
2350  }
2351  }
2352 
2353  Multiply(R, T, A, B, NA);
2354  CopyWords(T+2*NA, R+NA, NA);
2355 
2356  unsigned i;
2357 
2358  for (i=2*NA; i<NB; i+=2*NA)
2359  Multiply(T+NA+i, T, A, B+i, NA);
2360  for (i=NA; i<NB; i+=2*NA)
2361  Multiply(R+i, T, A, B+i, NA);
2362 
2363  if (Add(R+NA, R+NA, T+2*NA, NB-NA))
2364  Increment(R+NB, NA);
2365 }
2366 
2367 
2368 void PositiveMultiply(Integer& product, const Integer& a, const Integer& b)
2369 {
2370  unsigned int aSize = RoundupSize(a.WordCount());
2371  unsigned int bSize = RoundupSize(b.WordCount());
2372 
2373  product.reg_.CleanNew(RoundupSize(aSize + bSize));
2374  product.sign_ = Integer::POSITIVE;
2375 
2376  AlignedWordBlock workspace(aSize + bSize);
2377  AsymmetricMultiply(product.reg_.get_buffer(), workspace.get_buffer(),
2378  a.reg_.get_buffer(), aSize, b.reg_.get_buffer(), bSize);
2379 }
2380 
2381 void Multiply(Integer &product, const Integer &a, const Integer &b)
2382 {
2383  PositiveMultiply(product, a, b);
2384 
2385  if (a.NotNegative() != b.NotNegative())
2386  product.Negate();
2387 }
2388 
2389 
2390 static inline unsigned int EvenWordCount(const word *X, unsigned int N)
2391 {
2392  while (N && X[N-2]==0 && X[N-1]==0)
2393  N-=2;
2394  return N;
2395 }
2396 
2397 
2398 unsigned int AlmostInverse(word *R, word *T, const word *A, unsigned int NA,
2399  const word *M, unsigned int N)
2400 {
2401  word *b = T;
2402  word *c = T+N;
2403  word *f = T+2*N;
2404  word *g = T+3*N;
2405  unsigned int bcLen=2, fgLen=EvenWordCount(M, N);
2406  unsigned int k=0, s=0;
2407 
2408  SetWords(T, 0, 3*N);
2409  b[0]=1;
2410  CopyWords(f, A, NA);
2411  CopyWords(g, M, N);
2412 
2413  while (1)
2414  {
2415  word t=f[0];
2416  while (!t)
2417  {
2418  if (EvenWordCount(f, fgLen)==0)
2419  {
2420  SetWords(R, 0, N);
2421  return 0;
2422  }
2423 
2424  ShiftWordsRightByWords(f, fgLen, 1);
2425  if (c[bcLen-1]) bcLen+=2;
2426  ShiftWordsLeftByWords(c, bcLen, 1);
2427  k+=WORD_BITS;
2428  t=f[0];
2429  }
2430 
2431  unsigned int i=0;
2432  while (t%2 == 0)
2433  {
2434  t>>=1;
2435  i++;
2436  }
2437  k+=i;
2438 
2439  if (t==1 && f[1]==0 && EvenWordCount(f, fgLen)==2)
2440  {
2441  if (s%2==0)
2442  CopyWords(R, b, N);
2443  else
2444  Subtract(R, M, b, N);
2445  return k;
2446  }
2447 
2448  ShiftWordsRightByBits(f, fgLen, i);
2449  t=ShiftWordsLeftByBits(c, bcLen, i);
2450  if (t)
2451  {
2452  c[bcLen] = t;
2453  bcLen+=2;
2454  }
2455 
2456  if (f[fgLen-2]==0 && g[fgLen-2]==0 && f[fgLen-1]==0 && g[fgLen-1]==0)
2457  fgLen-=2;
2458 
2459  if (Compare(f, g, fgLen)==-1)
2460  {
2461  STL::swap(f, g);
2462  STL::swap(b, c);
2463  s++;
2464  }
2465 
2466  Subtract(f, f, g, fgLen);
2467 
2468  if (Add(b, b, c, bcLen))
2469  {
2470  b[bcLen] = 1;
2471  bcLen+=2;
2472  }
2473  }
2474 }
2475 
2476 // R[N] - result = A/(2^k) mod M
2477 // A[N] - input
2478 // M[N] - modulus
2479 
2480 void DivideByPower2Mod(word *R, const word *A, unsigned int k, const word *M,
2481  unsigned int N)
2482 {
2483  CopyWords(R, A, N);
2484 
2485  while (k--)
2486  {
2487  if (R[0]%2==0)
2488  ShiftWordsRightByBits(R, N, 1);
2489  else
2490  {
2491  word carry = Add(R, R, M, N);
2492  ShiftWordsRightByBits(R, N, 1);
2493  R[N-1] += carry<<(WORD_BITS-1);
2494  }
2495  }
2496 }
2497 
2498 // R[N] - result = A*(2^k) mod M
2499 // A[N] - input
2500 // M[N] - modulus
2501 
2502 void MultiplyByPower2Mod(word *R, const word *A, unsigned int k, const word *M,
2503  unsigned int N)
2504 {
2505  CopyWords(R, A, N);
2506 
2507  while (k--)
2508  if (ShiftWordsLeftByBits(R, N, 1) || Compare(R, M, N)>=0)
2509  Subtract(R, R, M, N);
2510 }
2511 
2512 
2513 // ********** end of integer needs
2514 
2515 
2516 Integer::Integer()
2517  : reg_(2), sign_(POSITIVE)
2518 {
2519  reg_[0] = reg_[1] = 0;
2520 }
2521 
2522 
2523 Integer::Integer(const Integer& t)
2524  : reg_(RoundupSize(t.WordCount())), sign_(t.sign_)
2525 {
2526  CopyWords(reg_.get_buffer(), t.reg_.get_buffer(), reg_.size());
2527 }
2528 
2529 
2530 Integer::Integer(signed long value)
2531  : reg_(2)
2532 {
2533  if (value >= 0)
2534  sign_ = POSITIVE;
2535  else
2536  {
2537  sign_ = NEGATIVE;
2538  value = -value;
2539  }
2540  reg_[0] = word(value);
2541  reg_[1] = word(SafeRightShift<WORD_BITS, unsigned long>(value));
2542 }
2543 
2544 
2545 Integer::Integer(Sign s, word high, word low)
2546  : reg_(2), sign_(s)
2547 {
2548  reg_[0] = low;
2549  reg_[1] = high;
2550 }
2551 
2552 
2553 Integer::Integer(word value, unsigned int length)
2554  : reg_(RoundupSize(length)), sign_(POSITIVE)
2555 {
2556  reg_[0] = value;
2557  SetWords(reg_ + 1, 0, reg_.size() - 1);
2558 }
2559 
2560 
2561 Integer::Integer(const byte *encodedInteger, unsigned int byteCount,
2562  Signedness s)
2563 {
2564  Decode(encodedInteger, byteCount, s);
2565 }
2566 
2567 class BadBER {};
2568 
2569 // BER Decode Source
2570 Integer::Integer(Source& source)
2571  : reg_(2), sign_(POSITIVE)
2572 {
2573  Decode(source);
2574 }
2575 
2576 void Integer::Decode(Source& source)
2577 {
2578  byte b = source.next();
2579  if (b != INTEGER) {
2580  source.SetError(INTEGER_E);
2581  return;
2582  }
2583 
2584  word32 length = GetLength(source);
2585  if (length == 0 || source.GetError().What()) return;
2586 
2587  if ( (b = source.next()) == 0x00)
2588  length--;
2589  else
2590  source.prev();
2591 
2592  if (source.IsLeft(length) == false) return;
2593 
2594  unsigned int words = (length + WORD_SIZE - 1) / WORD_SIZE;
2595  words = RoundupSize(words);
2596  if (words > reg_.size()) reg_.CleanNew(words);
2597 
2598  for (int j = length; j > 0; j--) {
2599  b = source.next();
2600  reg_ [(j-1) / WORD_SIZE] |= (word)b << ((j-1) % WORD_SIZE) * 8;
2601  }
2602 }
2603 
2604 
2605 void Integer::Decode(const byte* input, unsigned int inputLen, Signedness s)
2606 {
2607  unsigned int idx(0);
2608  byte b = input[idx++];
2609  sign_ = ((s==SIGNED) && (b & 0x80)) ? NEGATIVE : POSITIVE;
2610 
2611  while (inputLen>0 && (sign_==POSITIVE ? b==0 : b==0xff))
2612  {
2613  inputLen--;
2614  b = input[idx++];
2615  }
2616 
2617  reg_.CleanNew(RoundupSize(BytesToWords(inputLen)));
2618 
2619  --idx;
2620  for (unsigned int i=inputLen; i > 0; i--)
2621  {
2622  b = input[idx++];
2623  reg_[(i-1)/WORD_SIZE] |= (word)b << ((i-1)%WORD_SIZE)*8;
2624  }
2625 
2626  if (sign_ == NEGATIVE)
2627  {
2628  for (unsigned i=inputLen; i<reg_.size()*WORD_SIZE; i++)
2629  reg_[i/WORD_SIZE] |= (word)0xff << (i%WORD_SIZE)*8;
2630  TwosComplement(reg_.get_buffer(), reg_.size());
2631  }
2632 }
2633 
2634 
2635 unsigned int Integer::Encode(byte* output, unsigned int outputLen,
2636  Signedness signedness) const
2637 {
2638  unsigned int idx(0);
2639  if (signedness == UNSIGNED || NotNegative())
2640  {
2641  for (unsigned int i=outputLen; i > 0; i--)
2642  output[idx++] = GetByte(i-1);
2643  }
2644  else
2645  {
2646  // take two's complement of *this
2647  Integer temp = Integer::Power2(8*max(ByteCount(), outputLen)) + *this;
2648  for (unsigned i=0; i<outputLen; i++)
2649  output[idx++] = temp.GetByte(outputLen-i-1);
2650  }
2651  return outputLen;
2652 }
2653 
2654 
2655 static Integer* zero = 0;
2656 
2657 const Integer &Integer::Zero()
2658 {
2659  if (!zero)
2660  zero = NEW_TC Integer;
2661  return *zero;
2662 }
2663 
2664 
2665 static Integer* one = 0;
2666 
2667 const Integer &Integer::One()
2668 {
2669  if (!one)
2670  one = NEW_TC Integer(1,2);
2671  return *one;
2672 }
2673 
2674 
2675 // Clean up static singleton holders, not a leak, but helpful to have gone
2676 // when checking for leaks
2677 void CleanUp()
2678 {
2679  tcDelete(one);
2680  tcDelete(zero);
2681 
2682  // In case user calls more than once, prevent seg fault
2683  one = 0;
2684  zero = 0;
2685 }
2686 
2687 Integer::Integer(RandomNumberGenerator& rng, const Integer& min,
2688  const Integer& max)
2689 {
2690  Randomize(rng, min, max);
2691 }
2692 
2693 
2694 void Integer::Randomize(RandomNumberGenerator& rng, unsigned int nbits)
2695 {
2696  const unsigned int nbytes = nbits/8 + 1;
2697  ByteBlock buf(nbytes);
2698  rng.GenerateBlock(buf.get_buffer(), nbytes);
2699  if (nbytes)
2700  buf[0] = (byte)Crop(buf[0], nbits % 8);
2701  Decode(buf.get_buffer(), nbytes, UNSIGNED);
2702 }
2703 
2704 void Integer::Randomize(RandomNumberGenerator& rng, const Integer& min,
2705  const Integer& max)
2706 {
2707  Integer range = max - min;
2708  const unsigned int nbits = range.BitCount();
2709 
2710  do
2711  {
2712  Randomize(rng, nbits);
2713  }
2714  while (*this > range);
2715 
2716  *this += min;
2717 }
2718 
2719 
2720 Integer Integer::Power2(unsigned int e)
2721 {
2722  Integer r((word)0, BitsToWords(e + 1));
2723  r.SetBit(e);
2724  return r;
2725 }
2726 
2727 
2728 void Integer::SetBit(unsigned int n, bool value)
2729 {
2730  if (value)
2731  {
2732  reg_.CleanGrow(RoundupSize(BitsToWords(n + 1)));
2733  reg_[n / WORD_BITS] |= (word(1) << (n % WORD_BITS));
2734  }
2735  else
2736  {
2737  if (n / WORD_BITS < reg_.size())
2738  reg_[n / WORD_BITS] &= ~(word(1) << (n % WORD_BITS));
2739  }
2740 }
2741 
2742 
2743 void Integer::SetByte(unsigned int n, byte value)
2744 {
2745  reg_.CleanGrow(RoundupSize(BytesToWords(n+1)));
2746  reg_[n/WORD_SIZE] &= ~(word(0xff) << 8*(n%WORD_SIZE));
2747  reg_[n/WORD_SIZE] |= (word(value) << 8*(n%WORD_SIZE));
2748 }
2749 
2750 
2751 void Integer::Negate()
2752 {
2753  if (!!(*this)) // don't flip sign if *this==0
2754  sign_ = Sign(1 - sign_);
2755 }
2756 
2757 
2758 bool Integer::operator!() const
2759 {
2760  return IsNegative() ? false : (reg_[0]==0 && WordCount()==0);
2761 }
2762 
2763 
2764 Integer& Integer::operator=(const Integer& t)
2765 {
2766  if (this != &t)
2767  {
2768  reg_.New(RoundupSize(t.WordCount()));
2769  CopyWords(reg_.get_buffer(), t.reg_.get_buffer(), reg_.size());
2770  sign_ = t.sign_;
2771  }
2772  return *this;
2773 }
2774 
2775 
2776 Integer& Integer::operator+=(const Integer& t)
2777 {
2778  reg_.CleanGrow(t.reg_.size());
2779  if (NotNegative())
2780  {
2781  if (t.NotNegative())
2782  PositiveAdd(*this, *this, t);
2783  else
2784  PositiveSubtract(*this, *this, t);
2785  }
2786  else
2787  {
2788  if (t.NotNegative())
2789  PositiveSubtract(*this, t, *this);
2790  else
2791  {
2792  PositiveAdd(*this, *this, t);
2793  sign_ = Integer::NEGATIVE;
2794  }
2795  }
2796  return *this;
2797 }
2798 
2799 
2800 Integer Integer::operator-() const
2801 {
2802  Integer result(*this);
2803  result.Negate();
2804  return result;
2805 }
2806 
2807 
2808 Integer& Integer::operator-=(const Integer& t)
2809 {
2810  reg_.CleanGrow(t.reg_.size());
2811  if (NotNegative())
2812  {
2813  if (t.NotNegative())
2814  PositiveSubtract(*this, *this, t);
2815  else
2816  PositiveAdd(*this, *this, t);
2817  }
2818  else
2819  {
2820  if (t.NotNegative())
2821  {
2822  PositiveAdd(*this, *this, t);
2823  sign_ = Integer::NEGATIVE;
2824  }
2825  else
2826  PositiveSubtract(*this, t, *this);
2827  }
2828  return *this;
2829 }
2830 
2831 
2832 Integer& Integer::operator++()
2833 {
2834  if (NotNegative())
2835  {
2836  if (Increment(reg_.get_buffer(), reg_.size()))
2837  {
2838  reg_.CleanGrow(2*reg_.size());
2839  reg_[reg_.size()/2]=1;
2840  }
2841  }
2842  else
2843  {
2844  word borrow = Decrement(reg_.get_buffer(), reg_.size());
2845  (void)borrow; // shut up compiler
2846  if (WordCount()==0)
2847  *this = Zero();
2848  }
2849  return *this;
2850 }
2851 
2852 Integer& Integer::operator--()
2853 {
2854  if (IsNegative())
2855  {
2856  if (Increment(reg_.get_buffer(), reg_.size()))
2857  {
2858  reg_.CleanGrow(2*reg_.size());
2859  reg_[reg_.size()/2]=1;
2860  }
2861  }
2862  else
2863  {
2864  if (Decrement(reg_.get_buffer(), reg_.size()))
2865  *this = -One();
2866  }
2867  return *this;
2868 }
2869 
2870 
2871 Integer& Integer::operator<<=(unsigned int n)
2872 {
2873  const unsigned int wordCount = WordCount();
2874  const unsigned int shiftWords = n / WORD_BITS;
2875  const unsigned int shiftBits = n % WORD_BITS;
2876 
2877  reg_.CleanGrow(RoundupSize(wordCount+BitsToWords(n)));
2878  ShiftWordsLeftByWords(reg_.get_buffer(), wordCount + shiftWords,
2879  shiftWords);
2880  ShiftWordsLeftByBits(reg_+shiftWords, wordCount+BitsToWords(shiftBits),
2881  shiftBits);
2882  return *this;
2883 }
2884 
2885 Integer& Integer::operator>>=(unsigned int n)
2886 {
2887  const unsigned int wordCount = WordCount();
2888  const unsigned int shiftWords = n / WORD_BITS;
2889  const unsigned int shiftBits = n % WORD_BITS;
2890 
2891  ShiftWordsRightByWords(reg_.get_buffer(), wordCount, shiftWords);
2892  if (wordCount > shiftWords)
2893  ShiftWordsRightByBits(reg_.get_buffer(), wordCount-shiftWords,
2894  shiftBits);
2895  if (IsNegative() && WordCount()==0) // avoid -0
2896  *this = Zero();
2897  return *this;
2898 }
2899 
2900 
2901 void PositiveAdd(Integer& sum, const Integer& a, const Integer& b)
2902 {
2903  word carry;
2904  if (a.reg_.size() == b.reg_.size())
2905  carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(),
2906  b.reg_.get_buffer(), a.reg_.size());
2907  else if (a.reg_.size() > b.reg_.size())
2908  {
2909  carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(),
2910  b.reg_.get_buffer(), b.reg_.size());
2911  CopyWords(sum.reg_+b.reg_.size(), a.reg_+b.reg_.size(),
2912  a.reg_.size()-b.reg_.size());
2913  carry = Increment(sum.reg_+b.reg_.size(), a.reg_.size()-b.reg_.size(),
2914  carry);
2915  }
2916  else
2917  {
2918  carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(),
2919  b.reg_.get_buffer(), a.reg_.size());
2920  CopyWords(sum.reg_+a.reg_.size(), b.reg_+a.reg_.size(),
2921  b.reg_.size()-a.reg_.size());
2922  carry = Increment(sum.reg_+a.reg_.size(), b.reg_.size()-a.reg_.size(),
2923  carry);
2924  }
2925 
2926  if (carry)
2927  {
2928  sum.reg_.CleanGrow(2*sum.reg_.size());
2929  sum.reg_[sum.reg_.size()/2] = 1;
2930  }
2931  sum.sign_ = Integer::POSITIVE;
2932 }
2933 
2934 void PositiveSubtract(Integer &diff, const Integer &a, const Integer& b)
2935 {
2936  unsigned aSize = a.WordCount();
2937  aSize += aSize%2;
2938  unsigned bSize = b.WordCount();
2939  bSize += bSize%2;
2940 
2941  if (aSize == bSize)
2942  {
2943  if (Compare(a.reg_.get_buffer(), b.reg_.get_buffer(), aSize) >= 0)
2944  {
2945  Subtract(diff.reg_.get_buffer(), a.reg_.get_buffer(),
2946  b.reg_.get_buffer(), aSize);
2947  diff.sign_ = Integer::POSITIVE;
2948  }
2949  else
2950  {
2951  Subtract(diff.reg_.get_buffer(), b.reg_.get_buffer(),
2952  a.reg_.get_buffer(), aSize);
2953  diff.sign_ = Integer::NEGATIVE;
2954  }
2955  }
2956  else if (aSize > bSize)
2957  {
2958  word borrow = Subtract(diff.reg_.get_buffer(), a.reg_.get_buffer(),
2959  b.reg_.get_buffer(), bSize);
2960  CopyWords(diff.reg_+bSize, a.reg_+bSize, aSize-bSize);
2961  borrow = Decrement(diff.reg_+bSize, aSize-bSize, borrow);
2962  diff.sign_ = Integer::POSITIVE;
2963  }
2964  else
2965  {
2966  word borrow = Subtract(diff.reg_.get_buffer(), b.reg_.get_buffer(),
2967  a.reg_.get_buffer(), aSize);
2968  CopyWords(diff.reg_+aSize, b.reg_+aSize, bSize-aSize);
2969  borrow = Decrement(diff.reg_+aSize, bSize-aSize, borrow);
2970  diff.sign_ = Integer::NEGATIVE;
2971  }
2972 }
2973 
2974 
2975 unsigned int Integer::MinEncodedSize(Signedness signedness) const
2976 {
2977  unsigned int outputLen = max(1U, ByteCount());
2978  if (signedness == UNSIGNED)
2979  return outputLen;
2980  if (NotNegative() && (GetByte(outputLen-1) & 0x80))
2981  outputLen++;
2982  if (IsNegative() && *this < -Power2(outputLen*8-1))
2983  outputLen++;
2984  return outputLen;
2985 }
2986 
2987 
2988 int Integer::Compare(const Integer& t) const
2989 {
2990  if (NotNegative())
2991  {
2992  if (t.NotNegative())
2993  return PositiveCompare(t);
2994  else
2995  return 1;
2996  }
2997  else
2998  {
2999  if (t.NotNegative())
3000  return -1;
3001  else
3002  return -PositiveCompare(t);
3003  }
3004 }
3005 
3006 
3007 int Integer::PositiveCompare(const Integer& t) const
3008 {
3009  unsigned size = WordCount(), tSize = t.WordCount();
3010 
3011  if (size == tSize)
3012  return TaoCrypt::Compare(reg_.get_buffer(), t.reg_.get_buffer(), size);
3013  else
3014  return size > tSize ? 1 : -1;
3015 }
3016 
3017 
3018 bool Integer::GetBit(unsigned int n) const
3019 {
3020  if (n/WORD_BITS >= reg_.size())
3021  return 0;
3022  else
3023  return bool((reg_[n/WORD_BITS] >> (n % WORD_BITS)) & 1);
3024 }
3025 
3026 
3027 unsigned long Integer::GetBits(unsigned int i, unsigned int n) const
3028 {
3029  unsigned long v = 0;
3030  for (unsigned int j=0; j<n; j++)
3031  v |= GetBit(i+j) << j;
3032  return v;
3033 }
3034 
3035 
3036 byte Integer::GetByte(unsigned int n) const
3037 {
3038  if (n/WORD_SIZE >= reg_.size())
3039  return 0;
3040  else
3041  return byte(reg_[n/WORD_SIZE] >> ((n%WORD_SIZE)*8));
3042 }
3043 
3044 
3045 unsigned int Integer::BitCount() const
3046 {
3047  unsigned wordCount = WordCount();
3048  if (wordCount)
3049  return (wordCount-1)*WORD_BITS + BitPrecision(reg_[wordCount-1]);
3050  else
3051  return 0;
3052 }
3053 
3054 
3055 unsigned int Integer::ByteCount() const
3056 {
3057  unsigned wordCount = WordCount();
3058  if (wordCount)
3059  return (wordCount-1)*WORD_SIZE + BytePrecision(reg_[wordCount-1]);
3060  else
3061  return 0;
3062 }
3063 
3064 
3065 unsigned int Integer::WordCount() const
3066 {
3067  return CountWords(reg_.get_buffer(), reg_.size());
3068 }
3069 
3070 
3071 bool Integer::IsConvertableToLong() const
3072 {
3073  if (ByteCount() > sizeof(long))
3074  return false;
3075 
3076  unsigned long value = reg_[0];
3077  value += SafeLeftShift<WORD_BITS, unsigned long>(reg_[1]);
3078 
3079  if (sign_ == POSITIVE)
3080  return (signed long)value >= 0;
3081  else
3082  return -(signed long)value < 0;
3083 }
3084 
3085 
3086 signed long Integer::ConvertToLong() const
3087 {
3088  unsigned long value = reg_[0];
3089  value += SafeLeftShift<WORD_BITS, unsigned long>(reg_[1]);
3090  return sign_ == POSITIVE ? value : -(signed long)value;
3091 }
3092 
3093 
3094 void Integer::Swap(Integer& a)
3095 {
3096  reg_.Swap(a.reg_);
3097  STL::swap(sign_, a.sign_);
3098 }
3099 
3100 
3101 Integer Integer::Plus(const Integer& b) const
3102 {
3103  Integer sum((word)0, max(reg_.size(), b.reg_.size()));
3104  if (NotNegative())
3105  {
3106  if (b.NotNegative())
3107  PositiveAdd(sum, *this, b);
3108  else
3109  PositiveSubtract(sum, *this, b);
3110  }
3111  else
3112  {
3113  if (b.NotNegative())
3114  PositiveSubtract(sum, b, *this);
3115  else
3116  {
3117  PositiveAdd(sum, *this, b);
3118  sum.sign_ = Integer::NEGATIVE;
3119  }
3120  }
3121  return sum;
3122 }
3123 
3124 
3125 Integer Integer::Minus(const Integer& b) const
3126 {
3127  Integer diff((word)0, max(reg_.size(), b.reg_.size()));
3128  if (NotNegative())
3129  {
3130  if (b.NotNegative())
3131  PositiveSubtract(diff, *this, b);
3132  else
3133  PositiveAdd(diff, *this, b);
3134  }
3135  else
3136  {
3137  if (b.NotNegative())
3138  {
3139  PositiveAdd(diff, *this, b);
3140  diff.sign_ = Integer::NEGATIVE;
3141  }
3142  else
3143  PositiveSubtract(diff, b, *this);
3144  }
3145  return diff;
3146 }
3147 
3148 
3149 Integer Integer::Times(const Integer &b) const
3150 {
3151  Integer product;
3152  Multiply(product, *this, b);
3153  return product;
3154 }
3155 
3156 
3157 #undef A0
3158 #undef A1
3159 #undef B0
3160 #undef B1
3161 
3162 #undef T0
3163 #undef T1
3164 #undef T2
3165 #undef T3
3166 
3167 #undef R0
3168 #undef R1
3169 #undef R2
3170 #undef R3
3171 
3172 
3173 static inline void AtomicDivide(word *Q, const word *A, const word *B)
3174 {
3175  word T[4];
3176  DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]),
3177  DWord(A[2], A[3]), DWord(B[0], B[1]));
3178  Q[0] = q.GetLowHalf();
3179  Q[1] = q.GetHighHalf();
3180 
3181 #ifndef NDEBUG
3182  if (B[0] || B[1])
3183  {
3184  // multiply quotient and divisor and add remainder, make sure it
3185  // equals dividend
3186  word P[4];
3187  Portable::Multiply2(P, Q, B);
3188  Add(P, P, T, 4);
3189  }
3190 #endif
3191 }
3192 
3193 
3194 // for use by Divide(), corrects the underestimated quotient {Q1,Q0}
3195 static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B,
3196  unsigned int N)
3197 {
3198  if (Q[1])
3199  {
3200  T[N] = T[N+1] = 0;
3201  unsigned i;
3202  for (i=0; i<N; i+=4)
3203  LowLevel::Multiply2(T+i, Q, B+i);
3204  for (i=2; i<N; i+=4)
3205  if (LowLevel::Multiply2Add(T+i, Q, B+i))
3206  T[i+5] += (++T[i+4]==0);
3207  }
3208  else
3209  {
3210  T[N] = LinearMultiply(T, B, Q[0], N);
3211  T[N+1] = 0;
3212  }
3213 
3214  word borrow = Subtract(R, R, T, N+2);
3215  (void)borrow; // shut up compiler
3216 
3217  while (R[N] || Compare(R, B, N) >= 0)
3218  {
3219  R[N] -= Subtract(R, R, B, N);
3220  Q[1] += (++Q[0]==0);
3221  }
3222 }
3223 
3224 // R[NB] -------- remainder = A%B
3225 // Q[NA-NB+2] --- quotient = A/B
3226 // T[NA+2*NB+4] - temp work space
3227 // A[NA] -------- dividend
3228 // B[NB] -------- divisor
3229 
3230 
3231 void Divide(word* R, word* Q, word* T, const word* A, unsigned int NA,
3232  const word* B, unsigned int NB)
3233 {
3234  // set up temporary work space
3235  word *const TA=T;
3236  word *const TB=T+NA+2;
3237  word *const TP=T+NA+2+NB;
3238 
3239  // copy B into TB and normalize it so that TB has highest bit set to 1
3240  unsigned shiftWords = (B[NB-1]==0);
3241  TB[0] = TB[NB-1] = 0;
3242  CopyWords(TB+shiftWords, B, NB-shiftWords);
3243  unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]);
3244  ShiftWordsLeftByBits(TB, NB, shiftBits);
3245 
3246  // copy A into TA and normalize it
3247  TA[0] = TA[NA] = TA[NA+1] = 0;
3248  CopyWords(TA+shiftWords, A, NA);
3249  ShiftWordsLeftByBits(TA, NA+2, shiftBits);
3250 
3251  if (TA[NA+1]==0 && TA[NA] <= 1)
3252  {
3253  Q[NA-NB+1] = Q[NA-NB] = 0;
3254  while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0)
3255  {
3256  TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB);
3257  ++Q[NA-NB];
3258  }
3259  }
3260  else
3261  {
3262  NA+=2;
3263  }
3264 
3265  word BT[2];
3266  BT[0] = TB[NB-2] + 1;
3267  BT[1] = TB[NB-1] + (BT[0]==0);
3268 
3269  // start reducing TA mod TB, 2 words at a time
3270  for (unsigned i=NA-2; i>=NB; i-=2)
3271  {
3272  AtomicDivide(Q+i-NB, TA+i-2, BT);
3273  CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);
3274  }
3275 
3276  // copy TA into R, and denormalize it
3277  CopyWords(R, TA+shiftWords, NB);
3278  ShiftWordsRightByBits(R, NB, shiftBits);
3279 }
3280 
3281 
3282 void PositiveDivide(Integer& remainder, Integer& quotient,
3283  const Integer& a, const Integer& b)
3284 {
3285  unsigned aSize = a.WordCount();
3286  unsigned bSize = b.WordCount();
3287 
3288  if (a.PositiveCompare(b) == -1)
3289  {
3290  remainder = a;
3291  remainder.sign_ = Integer::POSITIVE;
3292  quotient = Integer::Zero();
3293  return;
3294  }
3295 
3296  aSize += aSize%2; // round up to next even number
3297  bSize += bSize%2;
3298 
3299  remainder.reg_.CleanNew(RoundupSize(bSize));
3300  remainder.sign_ = Integer::POSITIVE;
3301  quotient.reg_.CleanNew(RoundupSize(aSize-bSize+2));
3302  quotient.sign_ = Integer::POSITIVE;
3303 
3304  AlignedWordBlock T(aSize+2*bSize+4);
3305  Divide(remainder.reg_.get_buffer(), quotient.reg_.get_buffer(),
3306  T.get_buffer(), a.reg_.get_buffer(), aSize, b.reg_.get_buffer(),
3307  bSize);
3308 }
3309 
3310 void Integer::Divide(Integer &remainder, Integer &quotient,
3311  const Integer &dividend, const Integer &divisor)
3312 {
3313  PositiveDivide(remainder, quotient, dividend, divisor);
3314 
3315  if (dividend.IsNegative())
3316  {
3317  quotient.Negate();
3318  if (remainder.NotZero())
3319  {
3320  --quotient;
3321  remainder = divisor.AbsoluteValue() - remainder;
3322  }
3323  }
3324 
3325  if (divisor.IsNegative())
3326  quotient.Negate();
3327 }
3328 
3329 void Integer::DivideByPowerOf2(Integer &r, Integer &q, const Integer &a,
3330  unsigned int n)
3331 {
3332  q = a;
3333  q >>= n;
3334 
3335  const unsigned int wordCount = BitsToWords(n);
3336  if (wordCount <= a.WordCount())
3337  {
3338  r.reg_.resize(RoundupSize(wordCount));
3339  CopyWords(r.reg_.get_buffer(), a.reg_.get_buffer(), wordCount);
3340  SetWords(r.reg_+wordCount, 0, r.reg_.size()-wordCount);
3341  if (n % WORD_BITS != 0)
3342  r.reg_[wordCount-1] %= (word(1) << (n % WORD_BITS));
3343  }
3344  else
3345  {
3346  r.reg_.resize(RoundupSize(a.WordCount()));
3347  CopyWords(r.reg_.get_buffer(), a.reg_.get_buffer(), r.reg_.size());
3348  }
3349  r.sign_ = POSITIVE;
3350 
3351  if (a.IsNegative() && r.NotZero())
3352  {
3353  --q;
3354  r = Power2(n) - r;
3355  }
3356 }
3357 
3358 Integer Integer::DividedBy(const Integer &b) const
3359 {
3360  Integer remainder, quotient;
3361  Integer::Divide(remainder, quotient, *this, b);
3362  return quotient;
3363 }
3364 
3365 Integer Integer::Modulo(const Integer &b) const
3366 {
3367  Integer remainder, quotient;
3368  Integer::Divide(remainder, quotient, *this, b);
3369  return remainder;
3370 }
3371 
3372 void Integer::Divide(word &remainder, Integer &quotient,
3373  const Integer &dividend, word divisor)
3374 {
3375  if ((divisor & (divisor-1)) == 0) // divisor is a power of 2
3376  {
3377  quotient = dividend >> (BitPrecision(divisor)-1);
3378  remainder = dividend.reg_[0] & (divisor-1);
3379  return;
3380  }
3381 
3382  unsigned int i = dividend.WordCount();
3383  quotient.reg_.CleanNew(RoundupSize(i));
3384  remainder = 0;
3385  while (i--)
3386  {
3387  quotient.reg_[i] = DWord(dividend.reg_[i], remainder) / divisor;
3388  remainder = DWord(dividend.reg_[i], remainder) % divisor;
3389  }
3390 
3391  if (dividend.NotNegative())
3392  quotient.sign_ = POSITIVE;
3393  else
3394  {
3395  quotient.sign_ = NEGATIVE;
3396  if (remainder)
3397  {
3398  --quotient;
3399  remainder = divisor - remainder;
3400  }
3401  }
3402 }
3403 
3404 Integer Integer::DividedBy(word b) const
3405 {
3406  word remainder;
3407  Integer quotient;
3408  Integer::Divide(remainder, quotient, *this, b);
3409  return quotient;
3410 }
3411 
3412 word Integer::Modulo(word divisor) const
3413 {
3414  word remainder;
3415 
3416  if ((divisor & (divisor-1)) == 0) // divisor is a power of 2
3417  remainder = reg_[0] & (divisor-1);
3418  else
3419  {
3420  unsigned int i = WordCount();
3421 
3422  if (divisor <= 5)
3423  {
3424  DWord sum(0, 0);
3425  while (i--)
3426  sum += reg_[i];
3427  remainder = sum % divisor;
3428  }
3429  else
3430  {
3431  remainder = 0;
3432  while (i--)
3433  remainder = DWord(reg_[i], remainder) % divisor;
3434  }
3435  }
3436 
3437  if (IsNegative() && remainder)
3438  remainder = divisor - remainder;
3439 
3440  return remainder;
3441 }
3442 
3443 
3444 Integer Integer::AbsoluteValue() const
3445 {
3446  Integer result(*this);
3447  result.sign_ = POSITIVE;
3448  return result;
3449 }
3450 
3451 
3452 Integer Integer::SquareRoot() const
3453 {
3454  if (!IsPositive())
3455  return Zero();
3456 
3457  // overestimate square root
3458  Integer x, y = Power2((BitCount()+1)/2);
3459 
3460  do
3461  {
3462  x = y;
3463  y = (x + *this/x) >> 1;
3464  } while (y<x);
3465 
3466  return x;
3467 }
3468 
3469 bool Integer::IsSquare() const
3470 {
3471  Integer r = SquareRoot();
3472  return *this == r.Squared();
3473 }
3474 
3475 bool Integer::IsUnit() const
3476 {
3477  return (WordCount() == 1) && (reg_[0] == 1);
3478 }
3479 
3480 Integer Integer::MultiplicativeInverse() const
3481 {
3482  return IsUnit() ? *this : Zero();
3483 }
3484 
3485 Integer a_times_b_mod_c(const Integer &x, const Integer& y, const Integer& m)
3486 {
3487  return x*y%m;
3488 }
3489 
3490 Integer a_exp_b_mod_c(const Integer &x, const Integer& e, const Integer& m)
3491 {
3492  ModularArithmetic mr(m);
3493  return mr.Exponentiate(x, e);
3494 }
3495 
3496 Integer Integer::Gcd(const Integer &a, const Integer &b)
3497 {
3498  return EuclideanDomainOf().Gcd(a, b);
3499 }
3500 
3501 Integer Integer::InverseMod(const Integer &m) const
3502 {
3503  if (IsNegative() || *this>=m)
3504  return (*this%m).InverseMod(m);
3505 
3506  if (m.IsEven())
3507  {
3508  if (!m || IsEven())
3509  return Zero(); // no inverse
3510  if (*this == One())
3511  return One();
3512 
3513  Integer u = m.InverseMod(*this);
3514  return !u ? Zero() : (m*(*this-u)+1)/(*this);
3515  }
3516 
3517  AlignedWordBlock T(m.reg_.size() * 4);
3518  Integer r((word)0, m.reg_.size());
3519  unsigned k = AlmostInverse(r.reg_.get_buffer(), T.get_buffer(),
3520  reg_.get_buffer(), reg_.size(),
3521  m.reg_.get_buffer(), m.reg_.size());
3522  DivideByPower2Mod(r.reg_.get_buffer(), r.reg_.get_buffer(), k,
3523  m.reg_.get_buffer(), m.reg_.size());
3524  return r;
3525 }
3526 
3527 word Integer::InverseMod(const word mod) const
3528 {
3529  word g0 = mod, g1 = *this % mod;
3530  word v0 = 0, v1 = 1;
3531  word y;
3532 
3533  while (g1)
3534  {
3535  if (g1 == 1)
3536  return v1;
3537  y = g0 / g1;
3538  g0 = g0 % g1;
3539  v0 += y * v1;
3540 
3541  if (!g0)
3542  break;
3543  if (g0 == 1)
3544  return mod-v0;
3545  y = g1 / g0;
3546  g1 = g1 % g0;
3547  v1 += y * v0;
3548  }
3549  return 0;
3550 }
3551 
3552 // ********* ModArith stuff
3553 
3554 const Integer& ModularArithmetic::Half(const Integer &a) const
3555 {
3556  if (a.reg_.size()==modulus.reg_.size())
3557  {
3558  TaoCrypt::DivideByPower2Mod(result.reg_.begin(), a.reg_.begin(), 1,
3559  modulus.reg_.begin(), a.reg_.size());
3560  return result;
3561  }
3562  else
3563  return result1 = (a.IsEven() ? (a >> 1) : ((a+modulus) >> 1));
3564 }
3565 
3566 const Integer& ModularArithmetic::Add(const Integer &a, const Integer &b) const
3567 {
3568  if (a.reg_.size()==modulus.reg_.size() &&
3569  b.reg_.size()==modulus.reg_.size())
3570  {
3571  if (TaoCrypt::Add(result.reg_.begin(), a.reg_.begin(), b.reg_.begin(),
3572  a.reg_.size())
3573  || Compare(result.reg_.get_buffer(), modulus.reg_.get_buffer(),
3574  a.reg_.size()) >= 0)
3575  {
3576  TaoCrypt::Subtract(result.reg_.begin(), result.reg_.begin(),
3577  modulus.reg_.begin(), a.reg_.size());
3578  }
3579  return result;
3580  }
3581  else
3582  {
3583  result1 = a+b;
3584  if (result1 >= modulus)
3585  result1 -= modulus;
3586  return result1;
3587  }
3588 }
3589 
3590 Integer& ModularArithmetic::Accumulate(Integer &a, const Integer &b) const
3591 {
3592  if (a.reg_.size()==modulus.reg_.size() &&
3593  b.reg_.size()==modulus.reg_.size())
3594  {
3595  if (TaoCrypt::Add(a.reg_.get_buffer(), a.reg_.get_buffer(),
3596  b.reg_.get_buffer(), a.reg_.size())
3597  || Compare(a.reg_.get_buffer(), modulus.reg_.get_buffer(),
3598  a.reg_.size()) >= 0)
3599  {
3600  TaoCrypt::Subtract(a.reg_.get_buffer(), a.reg_.get_buffer(),
3601  modulus.reg_.get_buffer(), a.reg_.size());
3602  }
3603  }
3604  else
3605  {
3606  a+=b;
3607  if (a>=modulus)
3608  a-=modulus;
3609  }
3610 
3611  return a;
3612 }
3613 
3614 const Integer& ModularArithmetic::Subtract(const Integer &a,
3615  const Integer &b) const
3616 {
3617  if (a.reg_.size()==modulus.reg_.size() &&
3618  b.reg_.size()==modulus.reg_.size())
3619  {
3620  if (TaoCrypt::Subtract(result.reg_.begin(), a.reg_.begin(),
3621  b.reg_.begin(), a.reg_.size()))
3622  TaoCrypt::Add(result.reg_.begin(), result.reg_.begin(),
3623  modulus.reg_.begin(), a.reg_.size());
3624  return result;
3625  }
3626  else
3627  {
3628  result1 = a-b;
3629  if (result1.IsNegative())
3630  result1 += modulus;
3631  return result1;
3632  }
3633 }
3634 
3635 Integer& ModularArithmetic::Reduce(Integer &a, const Integer &b) const
3636 {
3637  if (a.reg_.size()==modulus.reg_.size() &&
3638  b.reg_.size()==modulus.reg_.size())
3639  {
3640  if (TaoCrypt::Subtract(a.reg_.get_buffer(), a.reg_.get_buffer(),
3641  b.reg_.get_buffer(), a.reg_.size()))
3642  TaoCrypt::Add(a.reg_.get_buffer(), a.reg_.get_buffer(),
3643  modulus.reg_.get_buffer(), a.reg_.size());
3644  }
3645  else
3646  {
3647  a-=b;
3648  if (a.IsNegative())
3649  a+=modulus;
3650  }
3651 
3652  return a;
3653 }
3654 
3655 const Integer& ModularArithmetic::Inverse(const Integer &a) const
3656 {
3657  if (!a)
3658  return a;
3659 
3660  CopyWords(result.reg_.begin(), modulus.reg_.begin(), modulus.reg_.size());
3661  if (TaoCrypt::Subtract(result.reg_.begin(), result.reg_.begin(),
3662  a.reg_.begin(), a.reg_.size()))
3663  Decrement(result.reg_.begin()+a.reg_.size(), 1,
3664  modulus.reg_.size()-a.reg_.size());
3665 
3666  return result;
3667 }
3668 
3669 Integer ModularArithmetic::CascadeExponentiate(const Integer &x,
3670  const Integer &e1, const Integer &y, const Integer &e2) const
3671 {
3672  if (modulus.IsOdd())
3673  {
3674  MontgomeryRepresentation dr(modulus);
3675  return dr.ConvertOut(dr.CascadeExponentiate(dr.ConvertIn(x), e1,
3676  dr.ConvertIn(y), e2));
3677  }
3678  else
3679  return AbstractRing::CascadeExponentiate(x, e1, y, e2);
3680 }
3681 
3682 void ModularArithmetic::SimultaneousExponentiate(Integer *results,
3683  const Integer &base, const Integer *exponents,
3684  unsigned int exponentsCount) const
3685 {
3686  if (modulus.IsOdd())
3687  {
3688  MontgomeryRepresentation dr(modulus);
3689  dr.SimultaneousExponentiate(results, dr.ConvertIn(base), exponents,
3690  exponentsCount);
3691  for (unsigned int i=0; i<exponentsCount; i++)
3692  results[i] = dr.ConvertOut(results[i]);
3693  }
3694  else
3695  AbstractRing::SimultaneousExponentiate(results, base,
3696  exponents, exponentsCount);
3697 }
3698 
3699 
3700 // ********************************************************
3701 
3702 #define A0 A
3703 #define A1 (A+N2)
3704 #define B0 B
3705 #define B1 (B+N2)
3706 
3707 #define T0 T
3708 #define T1 (T+N2)
3709 #define T2 (T+N)
3710 #define T3 (T+N+N2)
3711 
3712 #define R0 R
3713 #define R1 (R+N2)
3714 #define R2 (R+N)
3715 #define R3 (R+N+N2)
3716 
3717 
3718 inline void MultiplyBottom(word *R, word *T, const word *A, const word *B,
3719  unsigned int N)
3720 {
3721  RecursiveMultiplyBottom(R, T, A, B, N);
3722 }
3723 
3724 inline void MultiplyTop(word *R, word *T, const word *L, const word *A,
3725  const word *B, unsigned int N)
3726 {
3727  RecursiveMultiplyTop(R, T, L, A, B, N);
3728 }
3729 
3730 
3731 // R[N] --- result = X/(2**(WORD_BITS*N)) mod M
3732 // T[3*N] - temporary work space
3733 // X[2*N] - number to be reduced
3734 // M[N] --- modulus
3735 // U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)
3736 
3737 void MontgomeryReduce(word *R, word *T, const word *X, const word *M,
3738  const word *U, unsigned int N)
3739 {
3740  MultiplyBottom(R, T, X, U, N);
3741  MultiplyTop(T, T+N, X, R, M, N);
3742  word borrow = Subtract(T, X+N, T, N);
3743  // defend against timing attack by doing this Add even when not needed
3744  word carry = Add(T+N, T, M, N);
3745  (void)carry; // shut up compiler
3746  CopyWords(R, T + (borrow ? N : 0), N);
3747 }
3748 
3749 // R[N] ----- result = A inverse mod 2**(WORD_BITS*N)
3750 // T[3*N/2] - temporary work space
3751 // A[N] ----- an odd number as input
3752 
3753 void RecursiveInverseModPower2(word *R, word *T, const word *A, unsigned int N)
3754 {
3755  if (N==2)
3756  {
3757  T[0] = AtomicInverseModPower2(A[0]);
3758  T[1] = 0;
3759  LowLevel::Multiply2Bottom(T+2, T, A);
3760  TwosComplement(T+2, 2);
3761  Increment(T+2, 2, 2);
3762  LowLevel::Multiply2Bottom(R, T, T+2);
3763  }
3764  else
3765  {
3766  const unsigned int N2 = N/2;
3767  RecursiveInverseModPower2(R0, T0, A0, N2);
3768  T0[0] = 1;
3769  SetWords(T0+1, 0, N2-1);
3770  MultiplyTop(R1, T1, T0, R0, A0, N2);
3771  MultiplyBottom(T0, T1, R0, A1, N2);
3772  Add(T0, R1, T0, N2);
3773  TwosComplement(T0, N2);
3774  MultiplyBottom(R1, T1, R0, T0, N2);
3775  }
3776 }
3777 
3778 
3779 #undef A0
3780 #undef A1
3781 #undef B0
3782 #undef B1
3783 
3784 #undef T0
3785 #undef T1
3786 #undef T2
3787 #undef T3
3788 
3789 #undef R0
3790 #undef R1
3791 #undef R2
3792 #undef R3
3793 
3794 
3795 // modulus must be odd
3796 MontgomeryRepresentation::MontgomeryRepresentation(const Integer &m)
3797  : ModularArithmetic(m),
3798  u((word)0, modulus.reg_.size()),
3799  workspace(5*modulus.reg_.size())
3800 {
3801  RecursiveInverseModPower2(u.reg_.get_buffer(), workspace.get_buffer(),
3802  modulus.reg_.get_buffer(), modulus.reg_.size());
3803 }
3804 
3805 const Integer& MontgomeryRepresentation::Multiply(const Integer &a,
3806  const Integer &b) const
3807 {
3808  word *const T = workspace.begin();
3809  word *const R = result.reg_.begin();
3810  const unsigned int N = modulus.reg_.size();
3811 
3812  AsymmetricMultiply(T, T+2*N, a.reg_.get_buffer(), a.reg_.size(),
3813  b.reg_.get_buffer(), b.reg_.size());
3814  SetWords(T+a.reg_.size()+b.reg_.size(),0, 2*N-a.reg_.size()-b.reg_.size());
3815  MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
3816  u.reg_.get_buffer(), N);
3817  return result;
3818 }
3819 
3820 const Integer& MontgomeryRepresentation::Square(const Integer &a) const
3821 {
3822  word *const T = workspace.begin();
3823  word *const R = result.reg_.begin();
3824  const unsigned int N = modulus.reg_.size();
3825 
3826  TaoCrypt::Square(T, T+2*N, a.reg_.get_buffer(), a.reg_.size());
3827  SetWords(T+2*a.reg_.size(), 0, 2*N-2*a.reg_.size());
3828  MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
3829  u.reg_.get_buffer(), N);
3830  return result;
3831 }
3832 
3833 Integer MontgomeryRepresentation::ConvertOut(const Integer &a) const
3834 {
3835  word *const T = workspace.begin();
3836  word *const R = result.reg_.begin();
3837  const unsigned int N = modulus.reg_.size();
3838 
3839  CopyWords(T, a.reg_.get_buffer(), a.reg_.size());
3840  SetWords(T+a.reg_.size(), 0, 2*N-a.reg_.size());
3841  MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
3842  u.reg_.get_buffer(), N);
3843  return result;
3844 }
3845 
3846 const Integer& MontgomeryRepresentation::MultiplicativeInverse(
3847  const Integer &a) const
3848 {
3849 // return (EuclideanMultiplicativeInverse(a, modulus)<<
3850 // (2*WORD_BITS*modulus.reg_.size()))%modulus;
3851  word *const T = workspace.begin();
3852  word *const R = result.reg_.begin();
3853  const unsigned int N = modulus.reg_.size();
3854 
3855  CopyWords(T, a.reg_.get_buffer(), a.reg_.size());
3856  SetWords(T+a.reg_.size(), 0, 2*N-a.reg_.size());
3857  MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
3858  u.reg_.get_buffer(), N);
3859  unsigned k = AlmostInverse(R, T, R, N, modulus.reg_.get_buffer(), N);
3860 
3861 // cout << "k=" << k << " N*32=" << 32*N << endl;
3862 
3863  if (k>N*WORD_BITS)
3864  DivideByPower2Mod(R, R, k-N*WORD_BITS, modulus.reg_.get_buffer(), N);
3865  else
3866  MultiplyByPower2Mod(R, R, N*WORD_BITS-k, modulus.reg_.get_buffer(), N);
3867 
3868  return result;
3869 }
3870 
3871 
3872 // mod Root stuff
3873 Integer ModularRoot(const Integer &a, const Integer &dp, const Integer &dq,
3874  const Integer &p, const Integer &q, const Integer &u)
3875 {
3876  Integer p2 = ModularExponentiation((a % p), dp, p);
3877  Integer q2 = ModularExponentiation((a % q), dq, q);
3878  return CRT(p2, p, q2, q, u);
3879 }
3880 
3881 Integer CRT(const Integer &xp, const Integer &p, const Integer &xq,
3882  const Integer &q, const Integer &u)
3883 {
3884  // isn't operator overloading great?
3885  return p * (u * (xq-xp) % q) + xp;
3886 }
3887 
3888 
3889 #ifdef HAVE_EXPLICIT_TEMPLATE_INSTANTIATION
3890 #ifndef TAOCRYPT_NATIVE_DWORD_AVAILABLE
3891 template hword DivideThreeWordsByTwo<hword, Word>(hword*, hword, hword, Word*);
3892 #endif
3893 template word DivideThreeWordsByTwo<word, DWord>(word*, word, word, DWord*);
3894 #ifdef SSE2_INTRINSICS_AVAILABLE
3895 template class AlignedAllocator<word>;
3896 #endif
3897 #endif
3898 
3899 
3900 } // namespace
3901