monotone

monotone Mtn Source Tree

Root/cryptopp/integer.cpp

1// integer.cpp - written and placed in the public domain by Wei Dai
2// contains public domain code contributed by Alister Lee and Leonard Janke
3
4#include "pch.h"
5#include "integer.h"
6#include "modarith.h"
7#include "nbtheory.h"
8#include "asn.h"
9#include "oids.h"
10#include "words.h"
11#include "algparam.h"
12#include "pubkey.h"// for P1363_KDF2
13#include "sha.h"
14
15#include <iostream>
16
17#ifdef SSE2_INTRINSICS_AVAILABLE
18#include <emmintrin.h>
19#endif
20
21#include "algebra.cpp"
22#include "eprecomp.cpp"
23
24NAMESPACE_BEGIN(CryptoPP)
25
26bool FunctionAssignIntToInteger(const std::type_info &valueType, void *pInteger, const void *pInt)
27{
28if (valueType != typeid(Integer))
29return false;
30*reinterpret_cast<Integer *>(pInteger) = *reinterpret_cast<const int *>(pInt);
31return true;
32}
33
34static int DummyAssignIntToInteger = (AssignIntToInteger = FunctionAssignIntToInteger, 0);
35
36#ifdef SSE2_INTRINSICS_AVAILABLE
37template <class T>
38AllocatorBase<T>::pointer AlignedAllocator<T>::allocate(size_type n, const void *)
39{
40if (n < 4)
41return new T[n];
42else
43return (T *)_mm_malloc(sizeof(T)*n, 16);
44
45}
46
47template <class T>
48void AlignedAllocator<T>::deallocate(void *p, size_type n)
49{
50memset(p, 0, n*sizeof(T));
51if (n < 4)
52delete [] p;
53else
54_mm_free(p);
55}
56
57template class AlignedAllocator<word>;
58#endif
59
60#define MAKE_DWORD(lowWord, highWord) ((dword(highWord)<<WORD_BITS) | (lowWord))
61
62static int Compare(const word *A, const word *B, unsigned int N)
63{
64while (N--)
65if (A[N] > B[N])
66return 1;
67else if (A[N] < B[N])
68return -1;
69
70return 0;
71}
72
73static word Increment(word *A, unsigned int N, word B=1)
74{
75assert(N);
76word t = A[0];
77A[0] = t+B;
78if (A[0] >= t)
79return 0;
80for (unsigned i=1; i<N; i++)
81if (++A[i])
82return 0;
83return 1;
84}
85
86static word Decrement(word *A, unsigned int N, word B=1)
87{
88assert(N);
89word t = A[0];
90A[0] = t-B;
91if (A[0] <= t)
92return 0;
93for (unsigned i=1; i<N; i++)
94if (A[i]--)
95return 0;
96return 1;
97}
98
99static void TwosComplement(word *A, unsigned int N)
100{
101Decrement(A, N);
102for (unsigned i=0; i<N; i++)
103A[i] = ~A[i];
104}
105
106static word LinearMultiply(word *C, const word *A, word B, unsigned int N)
107{
108word carry=0;
109for(unsigned i=0; i<N; i++)
110{
111dword p = (dword)A[i] * B + carry;
112C[i] = LOW_WORD(p);
113carry = HIGH_WORD(p);
114}
115return carry;
116}
117
118static void AtomicInverseModPower2(word *C, word A0, word A1)
119{
120assert(A0%2==1);
121
122dword A=MAKE_DWORD(A0, A1), R=A0%8;
123
124for (unsigned i=3; i<2*WORD_BITS; i*=2)
125R = R*(2-R*A);
126
127assert(R*A==1);
128
129C[0] = LOW_WORD(R);
130C[1] = HIGH_WORD(R);
131}
132
133// ********************************************************
134
135class Portable
136{
137public:
138static word Add(word *C, const word *A, const word *B, unsigned int N);
139static word Subtract(word *C, const word *A, const word *B, unsigned int N);
140
141static inline void Multiply2(word *C, const word *A, const word *B);
142static inline word Multiply2Add(word *C, const word *A, const word *B);
143static void Multiply4(word *C, const word *A, const word *B);
144static void Multiply8(word *C, const word *A, const word *B);
145static inline unsigned int MultiplyRecursionLimit() {return 8;}
146
147static inline void Multiply2Bottom(word *C, const word *A, const word *B);
148static void Multiply4Bottom(word *C, const word *A, const word *B);
149static void Multiply8Bottom(word *C, const word *A, const word *B);
150static inline unsigned int MultiplyBottomRecursionLimit() {return 8;}
151
152static void Square2(word *R, const word *A);
153static void Square4(word *R, const word *A);
154static void Square8(word *R, const word *A) {assert(false);}
155static inline unsigned int SquareRecursionLimit() {return 4;}
156};
157
158word Portable::Add(word *C, const word *A, const word *B, unsigned int N)
159{
160assert (N%2 == 0);
161
162#ifdef IS_LITTLE_ENDIAN
163if (sizeof(dword) == sizeof(size_t))// dword is only register size
164{
165dword carry = 0;
166N >>= 1;
167for (unsigned int i = 0; i < N; i++)
168{
169dword a = ((const dword *)A)[i] + carry;
170dword c = a + ((const dword *)B)[i];
171((dword *)C)[i] = c;
172carry = (a < carry) | (c < a);
173}
174return (word)carry;
175}
176else
177#endif
178{
179word carry = 0;
180for (unsigned int i = 0; i < N; i+=2)
181{
182dword u = (dword) carry + A[i] + B[i];
183C[i] = LOW_WORD(u);
184u = (dword) HIGH_WORD(u) + A[i+1] + B[i+1];
185C[i+1] = LOW_WORD(u);
186carry = HIGH_WORD(u);
187}
188return carry;
189}
190}
191
192word Portable::Subtract(word *C, const word *A, const word *B, unsigned int N)
193{
194assert (N%2 == 0);
195
196#ifdef IS_LITTLE_ENDIAN
197if (sizeof(dword) == sizeof(size_t))// dword is only register size
198{
199dword borrow = 0;
200N >>= 1;
201for (unsigned int i = 0; i < N; i++)
202{
203dword a = ((const dword *)A)[i];
204dword b = a - borrow;
205dword c = b - ((const dword *)B)[i];
206((dword *)C)[i] = c;
207borrow = (b > a) | (c > b);
208}
209return (word)borrow;
210}
211else
212#endif
213{
214word borrow=0;
215for (unsigned i = 0; i < N; i+=2)
216{
217dword u = (dword) A[i] - B[i] - borrow;
218C[i] = LOW_WORD(u);
219u = (dword) A[i+1] - B[i+1] - (word)(0-HIGH_WORD(u));
220C[i+1] = LOW_WORD(u);
221borrow = 0-HIGH_WORD(u);
222}
223return borrow;
224}
225}
226
227void Portable::Multiply2(word *C, const word *A, const word *B)
228{
229/*
230word s;
231dword d;
232
233if (A1 >= A0)
234if (B0 >= B1)
235{
236s = 0;
237d = (dword)(A1-A0)*(B0-B1);
238}
239else
240{
241s = (A1-A0);
242d = (dword)s*(word)(B0-B1);
243}
244else
245if (B0 > B1)
246{
247s = (B0-B1);
248d = (word)(A1-A0)*(dword)s;
249}
250else
251{
252s = 0;
253d = (dword)(A0-A1)*(B1-B0);
254}
255*/
256// this segment is the branchless equivalent of above
257word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
258unsigned int ai = A[1] < A[0];
259unsigned int bi = B[0] < B[1];
260unsigned int di = ai & bi;
261dword d = (dword)D[di]*D[di+2];
262D[1] = D[3] = 0;
263unsigned int si = ai + !bi;
264word s = D[si];
265
266dword A0B0 = (dword)A[0]*B[0];
267C[0] = LOW_WORD(A0B0);
268
269dword A1B1 = (dword)A[1]*B[1];
270dword t = (dword) HIGH_WORD(A0B0) + LOW_WORD(A0B0) + LOW_WORD(d) + LOW_WORD(A1B1);
271C[1] = LOW_WORD(t);
272
273t = A1B1 + HIGH_WORD(t) + HIGH_WORD(A0B0) + HIGH_WORD(d) + HIGH_WORD(A1B1) - s;
274C[2] = LOW_WORD(t);
275C[3] = HIGH_WORD(t);
276}
277
278inline void Portable::Multiply2Bottom(word *C, const word *A, const word *B)
279{
280#ifdef IS_LITTLE_ENDIAN
281if (sizeof(dword) == sizeof(size_t))
282{
283dword a = *(const dword *)A, b = *(const dword *)B;
284((dword *)C)[0] = a*b;
285}
286else
287#endif
288{
289dword t = (dword)A[0]*B[0];
290C[0] = LOW_WORD(t);
291C[1] = HIGH_WORD(t) + A[0]*B[1] + A[1]*B[0];
292}
293}
294
295word Portable::Multiply2Add(word *C, const word *A, const word *B)
296{
297word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
298unsigned int ai = A[1] < A[0];
299unsigned int bi = B[0] < B[1];
300unsigned int di = ai & bi;
301dword d = (dword)D[di]*D[di+2];
302D[1] = D[3] = 0;
303unsigned int si = ai + !bi;
304word s = D[si];
305
306dword A0B0 = (dword)A[0]*B[0];
307dword t = A0B0 + C[0];
308C[0] = LOW_WORD(t);
309
310dword A1B1 = (dword)A[1]*B[1];
311t = (dword) HIGH_WORD(t) + LOW_WORD(A0B0) + LOW_WORD(d) + LOW_WORD(A1B1) + C[1];
312C[1] = LOW_WORD(t);
313
314t = (dword) HIGH_WORD(t) + LOW_WORD(A1B1) + HIGH_WORD(A0B0) + HIGH_WORD(d) + HIGH_WORD(A1B1) - s + C[2];
315C[2] = LOW_WORD(t);
316
317t = (dword) HIGH_WORD(t) + HIGH_WORD(A1B1) + C[3];
318C[3] = LOW_WORD(t);
319return HIGH_WORD(t);
320}
321
322#define MulAcc(x, y)\
323p = (dword)A[x] * B[y] + c; \
324c = LOW_WORD(p);\
325p = (dword)d + HIGH_WORD(p);\
326d = LOW_WORD(p);\
327e += HIGH_WORD(p);
328
329#define SaveMulAcc(s, x, y) \
330R[s] = c;\
331p = (dword)A[x] * B[y] + d; \
332c = LOW_WORD(p);\
333p = (dword)e + HIGH_WORD(p);\
334d = LOW_WORD(p);\
335e = HIGH_WORD(p);
336
337#define SquAcc(x, y)\
338q = (dword)A[x] * A[y];\
339p = q + c; \
340c = LOW_WORD(p);\
341p = (dword)d + HIGH_WORD(p);\
342d = LOW_WORD(p);\
343e += HIGH_WORD(p);\
344p = q + c; \
345c = LOW_WORD(p);\
346p = (dword)d + HIGH_WORD(p);\
347d = LOW_WORD(p);\
348e += HIGH_WORD(p);
349
350#define SaveSquAcc(s, x, y) \
351R[s] = c;\
352q = (dword)A[x] * A[y];\
353p = q + d; \
354c = LOW_WORD(p);\
355p = (dword)e + HIGH_WORD(p);\
356d = LOW_WORD(p);\
357e = HIGH_WORD(p);\
358p = q + c; \
359c = LOW_WORD(p);\
360p = (dword)d + HIGH_WORD(p);\
361d = LOW_WORD(p);\
362e += HIGH_WORD(p);
363
364void Portable::Multiply4(word *R, const word *A, const word *B)
365{
366dword p;
367word c, d, e;
368
369p = (dword)A[0] * B[0];
370R[0] = LOW_WORD(p);
371c = HIGH_WORD(p);
372d = e = 0;
373
374MulAcc(0, 1);
375MulAcc(1, 0);
376
377SaveMulAcc(1, 2, 0);
378MulAcc(1, 1);
379MulAcc(0, 2);
380
381SaveMulAcc(2, 0, 3);
382MulAcc(1, 2);
383MulAcc(2, 1);
384MulAcc(3, 0);
385
386SaveMulAcc(3, 3, 1);
387MulAcc(2, 2);
388MulAcc(1, 3);
389
390SaveMulAcc(4, 2, 3);
391MulAcc(3, 2);
392
393R[5] = c;
394p = (dword)A[3] * B[3] + d;
395R[6] = LOW_WORD(p);
396R[7] = e + HIGH_WORD(p);
397}
398
399void Portable::Square2(word *R, const word *A)
400{
401dword p, q;
402word c, d, e;
403
404p = (dword)A[0] * A[0];
405R[0] = LOW_WORD(p);
406c = HIGH_WORD(p);
407d = e = 0;
408
409SquAcc(0, 1);
410
411R[1] = c;
412p = (dword)A[1] * A[1] + d;
413R[2] = LOW_WORD(p);
414R[3] = e + HIGH_WORD(p);
415}
416
417void Portable::Square4(word *R, const word *A)
418{
419const word *B = A;
420dword p, q;
421word c, d, e;
422
423p = (dword)A[0] * A[0];
424R[0] = LOW_WORD(p);
425c = HIGH_WORD(p);
426d = e = 0;
427
428SquAcc(0, 1);
429
430SaveSquAcc(1, 2, 0);
431MulAcc(1, 1);
432
433SaveSquAcc(2, 0, 3);
434SquAcc(1, 2);
435
436SaveSquAcc(3, 3, 1);
437MulAcc(2, 2);
438
439SaveSquAcc(4, 2, 3);
440
441R[5] = c;
442p = (dword)A[3] * A[3] + d;
443R[6] = LOW_WORD(p);
444R[7] = e + HIGH_WORD(p);
445}
446
447void Portable::Multiply8(word *R, const word *A, const word *B)
448{
449dword p;
450word c, d, e;
451
452p = (dword)A[0] * B[0];
453R[0] = LOW_WORD(p);
454c = HIGH_WORD(p);
455d = e = 0;
456
457MulAcc(0, 1);
458MulAcc(1, 0);
459
460SaveMulAcc(1, 2, 0);
461MulAcc(1, 1);
462MulAcc(0, 2);
463
464SaveMulAcc(2, 0, 3);
465MulAcc(1, 2);
466MulAcc(2, 1);
467MulAcc(3, 0);
468
469SaveMulAcc(3, 0, 4);
470MulAcc(1, 3);
471MulAcc(2, 2);
472MulAcc(3, 1);
473MulAcc(4, 0);
474
475SaveMulAcc(4, 0, 5);
476MulAcc(1, 4);
477MulAcc(2, 3);
478MulAcc(3, 2);
479MulAcc(4, 1);
480MulAcc(5, 0);
481
482SaveMulAcc(5, 0, 6);
483MulAcc(1, 5);
484MulAcc(2, 4);
485MulAcc(3, 3);
486MulAcc(4, 2);
487MulAcc(5, 1);
488MulAcc(6, 0);
489
490SaveMulAcc(6, 0, 7);
491MulAcc(1, 6);
492MulAcc(2, 5);
493MulAcc(3, 4);
494MulAcc(4, 3);
495MulAcc(5, 2);
496MulAcc(6, 1);
497MulAcc(7, 0);
498
499SaveMulAcc(7, 1, 7);
500MulAcc(2, 6);
501MulAcc(3, 5);
502MulAcc(4, 4);
503MulAcc(5, 3);
504MulAcc(6, 2);
505MulAcc(7, 1);
506
507SaveMulAcc(8, 2, 7);
508MulAcc(3, 6);
509MulAcc(4, 5);
510MulAcc(5, 4);
511MulAcc(6, 3);
512MulAcc(7, 2);
513
514SaveMulAcc(9, 3, 7);
515MulAcc(4, 6);
516MulAcc(5, 5);
517MulAcc(6, 4);
518MulAcc(7, 3);
519
520SaveMulAcc(10, 4, 7);
521MulAcc(5, 6);
522MulAcc(6, 5);
523MulAcc(7, 4);
524
525SaveMulAcc(11, 5, 7);
526MulAcc(6, 6);
527MulAcc(7, 5);
528
529SaveMulAcc(12, 6, 7);
530MulAcc(7, 6);
531
532R[13] = c;
533p = (dword)A[7] * B[7] + d;
534R[14] = LOW_WORD(p);
535R[15] = e + HIGH_WORD(p);
536}
537
538void Portable::Multiply4Bottom(word *R, const word *A, const word *B)
539{
540dword p;
541word c, d, e;
542
543p = (dword)A[0] * B[0];
544R[0] = LOW_WORD(p);
545c = HIGH_WORD(p);
546d = e = 0;
547
548MulAcc(0, 1);
549MulAcc(1, 0);
550
551SaveMulAcc(1, 2, 0);
552MulAcc(1, 1);
553MulAcc(0, 2);
554
555R[2] = c;
556R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0];
557}
558
559void Portable::Multiply8Bottom(word *R, const word *A, const word *B)
560{
561dword p;
562word c, d, e;
563
564p = (dword)A[0] * B[0];
565R[0] = LOW_WORD(p);
566c = HIGH_WORD(p);
567d = e = 0;
568
569MulAcc(0, 1);
570MulAcc(1, 0);
571
572SaveMulAcc(1, 2, 0);
573MulAcc(1, 1);
574MulAcc(0, 2);
575
576SaveMulAcc(2, 0, 3);
577MulAcc(1, 2);
578MulAcc(2, 1);
579MulAcc(3, 0);
580
581SaveMulAcc(3, 0, 4);
582MulAcc(1, 3);
583MulAcc(2, 2);
584MulAcc(3, 1);
585MulAcc(4, 0);
586
587SaveMulAcc(4, 0, 5);
588MulAcc(1, 4);
589MulAcc(2, 3);
590MulAcc(3, 2);
591MulAcc(4, 1);
592MulAcc(5, 0);
593
594SaveMulAcc(5, 0, 6);
595MulAcc(1, 5);
596MulAcc(2, 4);
597MulAcc(3, 3);
598MulAcc(4, 2);
599MulAcc(5, 1);
600MulAcc(6, 0);
601
602R[6] = c;
603R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +
604A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];
605}
606
607#undef MulAcc
608#undef SaveMulAcc
609#undef SquAcc
610#undef SaveSquAcc
611
612// CodeWarrior defines _MSC_VER
613#if defined(_MSC_VER) && !defined(__MWERKS__) && defined(_M_IX86) && (_M_IX86<=700)
614
615class PentiumOptimized : public Portable
616{
617public:
618static word __fastcall Add(word *C, const word *A, const word *B, unsigned int N);
619static word __fastcall Subtract(word *C, const word *A, const word *B, unsigned int N);
620static inline void Square4(word *R, const word *A)
621{
622// VC60 workaround: MSVC 6.0 has an optimization bug that makes
623// (dword)A*B where either A or B has been cast to a dword before
624// very expensive. Revisit this function when this
625// bug is fixed.
626Multiply4(R, A, A);
627}
628};
629
630typedef PentiumOptimized LowLevel;
631
632__declspec(naked) word __fastcall PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N)
633{
634__asm
635{
636push ebp
637push ebx
638push esi
639push edi
640
641mov esi, [esp+24]; N
642mov ebx, [esp+20]; B
643
644// now: ebx = B, ecx = C, edx = A, esi = N
645
646sub ecx, edx// hold the distance between C & A so we can add this to A to get C
647xor eax, eax// clear eax
648
649sub eax, esi// eax is a negative index from end of B
650lea ebx, [ebx+4*esi]// ebx is end of B
651
652sar eax, 1// unit of eax is now dwords; this also clears the carry flag
653jzloopend// if no dwords then nothing to do
654
655loopstart:
656mov esi,[edx]// load lower word of A
657mov ebp,[edx+4]// load higher word of A
658
659mov edi,[ebx+8*eax]// load lower word of B
660lea edx,[edx+8]// advance A and C
661
662adc esi,edi// add lower words
663mov edi,[ebx+8*eax+4]// load higher word of B
664
665adc ebp,edi// add higher words
666inc eax// advance B
667
668mov [edx+ecx-8],esi// store lower word result
669mov [edx+ecx-4],ebp// store higher word result
670
671jnz loopstart// loop until eax overflows and becomes zero
672
673loopend:
674adc eax, 0// store carry into eax (return result register)
675pop edi
676pop esi
677pop ebx
678pop ebp
679ret 8
680}
681}
682
683__declspec(naked) word __fastcall PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
684{
685__asm
686{
687push ebp
688push ebx
689push esi
690push edi
691
692mov esi, [esp+24]; N
693mov ebx, [esp+20]; B
694
695sub ecx, edx
696xor eax, eax
697
698sub eax, esi
699lea ebx, [ebx+4*esi]
700
701sar eax, 1
702jzloopend
703
704loopstart:
705mov esi,[edx]
706mov ebp,[edx+4]
707
708mov edi,[ebx+8*eax]
709lea edx,[edx+8]
710
711sbb esi,edi
712mov edi,[ebx+8*eax+4]
713
714sbb ebp,edi
715inc eax
716
717mov [edx+ecx-8],esi
718mov [edx+ecx-4],ebp
719
720jnz loopstart
721
722loopend:
723adc eax, 0
724pop edi
725pop esi
726pop ebx
727pop ebp
728ret 8
729}
730}
731
732#ifdef SSE2_INTRINSICS_AVAILABLE
733
734static bool GetSSE2Capability()
735{
736word32 b;
737
738__asm
739{
740moveax, 1
741cpuid
742movb, edx
743}
744
745return (b & (1 << 26)) != 0;
746}
747
748bool g_sse2DetectionDone = false, g_sse2Detected, g_sse2Enabled = true;
749
750static inline bool HasSSE2()
751{
752if (g_sse2Enabled && !g_sse2DetectionDone)
753{
754g_sse2Detected = GetSSE2Capability();
755g_sse2DetectionDone = true;
756}
757return g_sse2Enabled && g_sse2Detected;
758}
759
760class P4Optimized : public PentiumOptimized
761{
762public:
763static word __fastcall Add(word *C, const word *A, const word *B, unsigned int N);
764static word __fastcall Subtract(word *C, const word *A, const word *B, unsigned int N);
765static void Multiply4(word *C, const word *A, const word *B);
766static void Multiply8(word *C, const word *A, const word *B);
767static inline void Square4(word *R, const word *A)
768{
769Multiply4(R, A, A);
770}
771static void Multiply8Bottom(word *C, const word *A, const word *B);
772};
773
774static void __fastcall P4_Mul(__m128i *C, const __m128i *A, const __m128i *B)
775{
776__m128i a3210 = _mm_load_si128(A);
777__m128i b3210 = _mm_load_si128(B);
778
779__m128i sum;
780
781__m128i z = _mm_setzero_si128();
782__m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210);
783C[0] = a2b2_a0b0;
784
785__m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0));
786__m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1));
787__m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021);
788__m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z);
789__m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z);
790C[1] = _mm_add_epi64(a1b0, a0b1);
791
792__m128i a31 = _mm_srli_epi64(a3210, 32);
793__m128i b31 = _mm_srli_epi64(b3210, 32);
794__m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31);
795C[6] = a3b3_a1b1;
796
797__m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z);
798__m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2));
799__m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012);
800__m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z);
801__m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z);
802sum = _mm_add_epi64(a1b1, a0b2);
803C[2] = _mm_add_epi64(sum, a2b0);
804
805__m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1));
806__m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3));
807__m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012);
808__m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103);
809__m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z);
810__m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z);
811__m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z);
812__m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z);
813__m128i sum1 = _mm_add_epi64(a3b0, a1b2);
814sum = _mm_add_epi64(a2b1, a0b3);
815C[3] = _mm_add_epi64(sum, sum1);
816
817__m128ia3b1_a1b3 = _mm_mul_epu32(a2301, b2103);
818__m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z);
819__m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z);
820__m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z);
821sum = _mm_add_epi64(a2b2, a3b1);
822C[4] = _mm_add_epi64(sum, a1b3);
823
824__m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2));
825__m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3));
826__m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203);
827__m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z);
828__m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z);
829C[5] = _mm_add_epi64(a3b2, a2b3);
830}
831
832void P4Optimized::Multiply4(word *C, const word *A, const word *B)
833{
834__m128i temp[7];
835const word *w = (word *)temp;
836const __m64 *mw = (__m64 *)w;
837
838P4_Mul(temp, (__m128i *)A, (__m128i *)B);
839
840C[0] = w[0];
841
842__m64 s1, s2;
843
844__m64 w1 = _m_from_int(w[1]);
845__m64 w4 = mw[2];
846__m64 w6 = mw[3];
847__m64 w8 = mw[4];
848__m64 w10 = mw[5];
849__m64 w12 = mw[6];
850__m64 w14 = mw[7];
851__m64 w16 = mw[8];
852__m64 w18 = mw[9];
853__m64 w20 = mw[10];
854__m64 w22 = mw[11];
855__m64 w26 = _m_from_int(w[26]);
856
857s1 = _mm_add_si64(w1, w4);
858C[1] = _m_to_int(s1);
859s1 = _m_psrlqi(s1, 32);
860
861s2 = _mm_add_si64(w6, w8);
862s1 = _mm_add_si64(s1, s2);
863C[2] = _m_to_int(s1);
864s1 = _m_psrlqi(s1, 32);
865
866s2 = _mm_add_si64(w10, w12);
867s1 = _mm_add_si64(s1, s2);
868C[3] = _m_to_int(s1);
869s1 = _m_psrlqi(s1, 32);
870
871s2 = _mm_add_si64(w14, w16);
872s1 = _mm_add_si64(s1, s2);
873C[4] = _m_to_int(s1);
874s1 = _m_psrlqi(s1, 32);
875
876s2 = _mm_add_si64(w18, w20);
877s1 = _mm_add_si64(s1, s2);
878C[5] = _m_to_int(s1);
879s1 = _m_psrlqi(s1, 32);
880
881s2 = _mm_add_si64(w22, w26);
882s1 = _mm_add_si64(s1, s2);
883C[6] = _m_to_int(s1);
884s1 = _m_psrlqi(s1, 32);
885
886C[7] = _m_to_int(s1) + w[27];
887_mm_empty();
888}
889
890void P4Optimized::Multiply8(word *C, const word *A, const word *B)
891{
892__m128i temp[28];
893const word *w = (word *)temp;
894const __m64 *mw = (__m64 *)w;
895const word *x = (word *)temp+7*4;
896const __m64 *mx = (__m64 *)x;
897const word *y = (word *)temp+7*4*2;
898const __m64 *my = (__m64 *)y;
899const word *z = (word *)temp+7*4*3;
900const __m64 *mz = (__m64 *)z;
901
902P4_Mul(temp, (__m128i *)A, (__m128i *)B);
903
904P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
905
906P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
907
908P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1);
909
910C[0] = w[0];
911
912__m64 s1, s2, s3, s4;
913
914__m64 w1 = _m_from_int(w[1]);
915__m64 w4 = mw[2];
916__m64 w6 = mw[3];
917__m64 w8 = mw[4];
918__m64 w10 = mw[5];
919__m64 w12 = mw[6];
920__m64 w14 = mw[7];
921__m64 w16 = mw[8];
922__m64 w18 = mw[9];
923__m64 w20 = mw[10];
924__m64 w22 = mw[11];
925__m64 w26 = _m_from_int(w[26]);
926__m64 w27 = _m_from_int(w[27]);
927
928__m64 x0 = _m_from_int(x[0]);
929__m64 x1 = _m_from_int(x[1]);
930__m64 x4 = mx[2];
931__m64 x6 = mx[3];
932__m64 x8 = mx[4];
933__m64 x10 = mx[5];
934__m64 x12 = mx[6];
935__m64 x14 = mx[7];
936__m64 x16 = mx[8];
937__m64 x18 = mx[9];
938__m64 x20 = mx[10];
939__m64 x22 = mx[11];
940__m64 x26 = _m_from_int(x[26]);
941__m64 x27 = _m_from_int(x[27]);
942
943__m64 y0 = _m_from_int(y[0]);
944__m64 y1 = _m_from_int(y[1]);
945__m64 y4 = my[2];
946__m64 y6 = my[3];
947__m64 y8 = my[4];
948__m64 y10 = my[5];
949__m64 y12 = my[6];
950__m64 y14 = my[7];
951__m64 y16 = my[8];
952__m64 y18 = my[9];
953__m64 y20 = my[10];
954__m64 y22 = my[11];
955__m64 y26 = _m_from_int(y[26]);
956__m64 y27 = _m_from_int(y[27]);
957
958__m64 z0 = _m_from_int(z[0]);
959__m64 z1 = _m_from_int(z[1]);
960__m64 z4 = mz[2];
961__m64 z6 = mz[3];
962__m64 z8 = mz[4];
963__m64 z10 = mz[5];
964__m64 z12 = mz[6];
965__m64 z14 = mz[7];
966__m64 z16 = mz[8];
967__m64 z18 = mz[9];
968__m64 z20 = mz[10];
969__m64 z22 = mz[11];
970__m64 z26 = _m_from_int(z[26]);
971
972s1 = _mm_add_si64(w1, w4);
973C[1] = _m_to_int(s1);
974s1 = _m_psrlqi(s1, 32);
975
976s2 = _mm_add_si64(w6, w8);
977s1 = _mm_add_si64(s1, s2);
978C[2] = _m_to_int(s1);
979s1 = _m_psrlqi(s1, 32);
980
981s2 = _mm_add_si64(w10, w12);
982s1 = _mm_add_si64(s1, s2);
983C[3] = _m_to_int(s1);
984s1 = _m_psrlqi(s1, 32);
985
986s3 = _mm_add_si64(x0, y0);
987s2 = _mm_add_si64(w14, w16);
988s1 = _mm_add_si64(s1, s3);
989s1 = _mm_add_si64(s1, s2);
990C[4] = _m_to_int(s1);
991s1 = _m_psrlqi(s1, 32);
992
993s3 = _mm_add_si64(x1, y1);
994s4 = _mm_add_si64(x4, y4);
995s1 = _mm_add_si64(s1, w18);
996s3 = _mm_add_si64(s3, s4);
997s1 = _mm_add_si64(s1, w20);
998s1 = _mm_add_si64(s1, s3);
999C[5] = _m_to_int(s1);
1000s1 = _m_psrlqi(s1, 32);
1001
1002s3 = _mm_add_si64(x6, y6);
1003s4 = _mm_add_si64(x8, y8);
1004s1 = _mm_add_si64(s1, w22);
1005s3 = _mm_add_si64(s3, s4);
1006s1 = _mm_add_si64(s1, w26);
1007s1 = _mm_add_si64(s1, s3);
1008C[6] = _m_to_int(s1);
1009s1 = _m_psrlqi(s1, 32);
1010
1011s3 = _mm_add_si64(x10, y10);
1012s4 = _mm_add_si64(x12, y12);
1013s1 = _mm_add_si64(s1, w27);
1014s3 = _mm_add_si64(s3, s4);
1015s1 = _mm_add_si64(s1, s3);
1016C[7] = _m_to_int(s1);
1017s1 = _m_psrlqi(s1, 32);
1018
1019s3 = _mm_add_si64(x14, y14);
1020s4 = _mm_add_si64(x16, y16);
1021s1 = _mm_add_si64(s1, z0);
1022s3 = _mm_add_si64(s3, s4);
1023s1 = _mm_add_si64(s1, s3);
1024C[8] = _m_to_int(s1);
1025s1 = _m_psrlqi(s1, 32);
1026
1027s3 = _mm_add_si64(x18, y18);
1028s4 = _mm_add_si64(x20, y20);
1029s1 = _mm_add_si64(s1, z1);
1030s3 = _mm_add_si64(s3, s4);
1031s1 = _mm_add_si64(s1, z4);
1032s1 = _mm_add_si64(s1, s3);
1033C[9] = _m_to_int(s1);
1034s1 = _m_psrlqi(s1, 32);
1035
1036s3 = _mm_add_si64(x22, y22);
1037s4 = _mm_add_si64(x26, y26);
1038s1 = _mm_add_si64(s1, z6);
1039s3 = _mm_add_si64(s3, s4);
1040s1 = _mm_add_si64(s1, z8);
1041s1 = _mm_add_si64(s1, s3);
1042C[10] = _m_to_int(s1);
1043s1 = _m_psrlqi(s1, 32);
1044
1045s3 = _mm_add_si64(x27, y27);
1046s1 = _mm_add_si64(s1, z10);
1047s1 = _mm_add_si64(s1, z12);
1048s1 = _mm_add_si64(s1, s3);
1049C[11] = _m_to_int(s1);
1050s1 = _m_psrlqi(s1, 32);
1051
1052s3 = _mm_add_si64(z14, z16);
1053s1 = _mm_add_si64(s1, s3);
1054C[12] = _m_to_int(s1);
1055s1 = _m_psrlqi(s1, 32);
1056
1057s3 = _mm_add_si64(z18, z20);
1058s1 = _mm_add_si64(s1, s3);
1059C[13] = _m_to_int(s1);
1060s1 = _m_psrlqi(s1, 32);
1061
1062s3 = _mm_add_si64(z22, z26);
1063s1 = _mm_add_si64(s1, s3);
1064C[14] = _m_to_int(s1);
1065s1 = _m_psrlqi(s1, 32);
1066
1067C[15] = z[27] + _m_to_int(s1);
1068_mm_empty();
1069}
1070
1071void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
1072{
1073__m128i temp[21];
1074const word *w = (word *)temp;
1075const __m64 *mw = (__m64 *)w;
1076const word *x = (word *)temp+7*4;
1077const __m64 *mx = (__m64 *)x;
1078const word *y = (word *)temp+7*4*2;
1079const __m64 *my = (__m64 *)y;
1080
1081P4_Mul(temp, (__m128i *)A, (__m128i *)B);
1082
1083P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
1084
1085P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
1086
1087C[0] = w[0];
1088
1089__m64 s1, s2, s3, s4;
1090
1091__m64 w1 = _m_from_int(w[1]);
1092__m64 w4 = mw[2];
1093__m64 w6 = mw[3];
1094__m64 w8 = mw[4];
1095__m64 w10 = mw[5];
1096__m64 w12 = mw[6];
1097__m64 w14 = mw[7];
1098__m64 w16 = mw[8];
1099__m64 w18 = mw[9];
1100__m64 w20 = mw[10];
1101__m64 w22 = mw[11];
1102__m64 w26 = _m_from_int(w[26]);
1103
1104__m64 x0 = _m_from_int(x[0]);
1105__m64 x1 = _m_from_int(x[1]);
1106__m64 x4 = mx[2];
1107__m64 x6 = mx[3];
1108__m64 x8 = mx[4];
1109
1110__m64 y0 = _m_from_int(y[0]);
1111__m64 y1 = _m_from_int(y[1]);
1112__m64 y4 = my[2];
1113__m64 y6 = my[3];
1114__m64 y8 = my[4];
1115
1116s1 = _mm_add_si64(w1, w4);
1117C[1] = _m_to_int(s1);
1118s1 = _m_psrlqi(s1, 32);
1119
1120s2 = _mm_add_si64(w6, w8);
1121s1 = _mm_add_si64(s1, s2);
1122C[2] = _m_to_int(s1);
1123s1 = _m_psrlqi(s1, 32);
1124
1125s2 = _mm_add_si64(w10, w12);
1126s1 = _mm_add_si64(s1, s2);
1127C[3] = _m_to_int(s1);
1128s1 = _m_psrlqi(s1, 32);
1129
1130s3 = _mm_add_si64(x0, y0);
1131s2 = _mm_add_si64(w14, w16);
1132s1 = _mm_add_si64(s1, s3);
1133s1 = _mm_add_si64(s1, s2);
1134C[4] = _m_to_int(s1);
1135s1 = _m_psrlqi(s1, 32);
1136
1137s3 = _mm_add_si64(x1, y1);
1138s4 = _mm_add_si64(x4, y4);
1139s1 = _mm_add_si64(s1, w18);
1140s3 = _mm_add_si64(s3, s4);
1141s1 = _mm_add_si64(s1, w20);
1142s1 = _mm_add_si64(s1, s3);
1143C[5] = _m_to_int(s1);
1144s1 = _m_psrlqi(s1, 32);
1145
1146s3 = _mm_add_si64(x6, y6);
1147s4 = _mm_add_si64(x8, y8);
1148s1 = _mm_add_si64(s1, w22);
1149s3 = _mm_add_si64(s3, s4);
1150s1 = _mm_add_si64(s1, w26);
1151s1 = _mm_add_si64(s1, s3);
1152C[6] = _m_to_int(s1);
1153s1 = _m_psrlqi(s1, 32);
1154
1155C[7] = _m_to_int(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
1156_mm_empty();
1157}
1158
1159__declspec(naked) word __fastcall P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N)
1160{
1161__asm
1162{
1163subesp, 16
1164xoreax, eax
1165mov[esp], edi
1166mov[esp+4], esi
1167mov[esp+8], ebx
1168mov[esp+12], ebp
1169
1170movebx, [esp+20]// B
1171movesi, [esp+24]// N
1172
1173// now: ebx = B, ecx = C, edx = A, esi = N
1174
1175negesi
1176jzloopend// if no dwords then nothing to do
1177
1178movedi, [edx]
1179movebp, [ebx]
1180
1181loopstart:
1182addedi, eax
1183jccarry1
1184
1185xoreax, eax
1186
1187carry1continue:
1188addedi, ebp
1189movebp, 1
1190mov[ecx], edi
1191movedi, [edx+4]
1192cmovceax, ebp
1193movebp, [ebx+4]
1194leaebx, [ebx+8]
1195addedi, eax
1196jccarry2
1197
1198xoreax, eax
1199
1200carry2continue:
1201addedi, ebp
1202movebp, 1
1203cmovceax, ebp
1204mov[ecx+4], edi
1205addecx, 8
1206movedi, [edx+8]
1207addedx, 8
1208addesi, 2
1209movebp, [ebx]
1210jnzloopstart
1211
1212loopend:
1213movedi, [esp]
1214movesi, [esp+4]
1215movebx, [esp+8]
1216movebp, [esp+12]
1217addesp, 16
1218ret8
1219
1220carry1:
1221moveax, 1
1222jmpcarry1continue
1223
1224carry2:
1225moveax, 1
1226jmpcarry2continue
1227}
1228}
1229
1230__declspec(naked) word __fastcall P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
1231{
1232__asm
1233{
1234subesp, 16
1235xoreax, eax
1236mov[esp], edi
1237mov[esp+4], esi
1238mov[esp+8], ebx
1239mov[esp+12], ebp
1240
1241movebx, [esp+20]// B
1242movesi, [esp+24]// N
1243
1244// now: ebx = B, ecx = C, edx = A, esi = N
1245
1246negesi
1247jzloopend// if no dwords then nothing to do
1248
1249movedi, [edx]
1250movebp, [ebx]
1251
1252loopstart:
1253subedi, eax
1254jccarry1
1255
1256xoreax, eax
1257
1258carry1continue:
1259subedi, ebp
1260movebp, 1
1261mov[ecx], edi
1262movedi, [edx+4]
1263cmovceax, ebp
1264movebp, [ebx+4]
1265leaebx, [ebx+8]
1266subedi, eax
1267jccarry2
1268
1269xoreax, eax
1270
1271carry2continue:
1272subedi, ebp
1273movebp, 1
1274cmovceax, ebp
1275mov[ecx+4], edi
1276addecx, 8
1277movedi, [edx+8]
1278addedx, 8
1279addesi, 2
1280movebp, [ebx]
1281jnzloopstart
1282
1283loopend:
1284movedi, [esp]
1285movesi, [esp+4]
1286movebx, [esp+8]
1287movebp, [esp+12]
1288addesp, 16
1289ret8
1290
1291carry1:
1292moveax, 1
1293jmpcarry1continue
1294
1295carry2:
1296moveax, 1
1297jmpcarry2continue
1298}
1299}
1300
1301#endif// #ifdef SSE2_INTRINSICS_AVAILABLE
1302
1303#elif defined(__GNUC__) && defined(__i386__)
1304
1305class PentiumOptimized : public Portable
1306{
1307public:
1308#ifndef __pic__// -fpic uses up a register, leaving too few for the asm code
1309static word Add(word *C, const word *A, const word *B, unsigned int N);
1310static word Subtract(word *C, const word *A, const word *B, unsigned int N);
1311#endif
1312static void Square4(word *R, const word *A);
1313static void Multiply4(word *C, const word *A, const word *B);
1314static void Multiply8(word *C, const word *A, const word *B);
1315};
1316
1317typedef PentiumOptimized LowLevel;
1318
1319// Add and Subtract assembly code originally contributed by Alister Lee
1320
1321#ifndef __pic__
1322__attribute__((regparm(3))) word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N)
1323{
1324assert (N%2 == 0);
1325
1326register word carry, temp;
1327
1328__asm__ __volatile__(
1329"push %%ebp;"
1330"sub %3, %2;"
1331"xor %0, %0;"
1332"sub %4, %0;"
1333"lea (%1,%4,4), %1;"
1334"sar $1, %0;"
1335"jz 1f;"
1336
1337"0:;"
1338"mov 0(%3), %4;"
1339"mov 4(%3), %%ebp;"
1340"mov (%1,%0,8), %5;"
1341"lea 8(%3), %3;"
1342"adc %5, %4;"
1343"mov 4(%1,%0,8), %5;"
1344"adc %5, %%ebp;"
1345"inc %0;"
1346"mov %4, -8(%3, %2);"
1347"mov %%ebp, -4(%3, %2);"
1348"jnz 0b;"
1349
1350"1:;"
1351"adc $0, %0;"
1352"pop %%ebp;"
1353
1354: "=aSD" (carry), "+r" (B), "+r" (C), "+r" (A), "+r" (N), "=r" (temp)
1355: : "cc", "memory");
1356
1357return carry;
1358}
1359
1360__attribute__((regparm(3))) word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
1361{
1362assert (N%2 == 0);
1363
1364register word carry, temp;
1365
1366__asm__ __volatile__(
1367"push %%ebp;"
1368"sub %3, %2;"
1369"xor %0, %0;"
1370"sub %4, %0;"
1371"lea (%1,%4,4), %1;"
1372"sar $1, %0;"
1373"jz 1f;"
1374
1375"0:;"
1376"mov 0(%3), %4;"
1377"mov 4(%3), %%ebp;"
1378"mov (%1,%0,8), %5;"
1379"lea 8(%3), %3;"
1380"sbb %5, %4;"
1381"mov 4(%1,%0,8), %5;"
1382"sbb %5, %%ebp;"
1383"inc %0;"
1384"mov %4, -8(%3, %2);"
1385"mov %%ebp, -4(%3, %2);"
1386"jnz 0b;"
1387
1388"1:;"
1389"adc $0, %0;"
1390"pop %%ebp;"
1391
1392: "=aSD" (carry), "+r" (B), "+r" (C), "+r" (A), "+r" (N), "=r" (temp)
1393: : "cc", "memory");
1394
1395return carry;
1396}
1397#endif// __pic__
1398
1399// Comba square and multiply assembly code originally contributed by Leonard Janke
1400
1401#define SqrStartup \
1402 "push %%ebp\n\t" \
1403 "push %%esi\n\t" \
1404 "push %%ebx\n\t" \
1405 "xor %%ebp, %%ebp\n\t" \
1406 "xor %%ebx, %%ebx\n\t" \
1407 "xor %%ecx, %%ecx\n\t"
1408
1409#define SqrShiftCarry \
1410 "mov %%ebx, %%ebp\n\t" \
1411 "mov %%ecx, %%ebx\n\t" \
1412 "xor %%ecx, %%ecx\n\t"
1413
1414#define SqrAccumulate(i,j) \
1415 "mov 4*"#j"(%%esi), %%eax\n\t" \
1416 "mull 4*"#i"(%%esi)\n\t" \
1417 "add %%eax, %%ebp\n\t" \
1418 "adc %%edx, %%ebx\n\t" \
1419 "adc %%ch, %%cl\n\t" \
1420 "add %%eax, %%ebp\n\t" \
1421 "adc %%edx, %%ebx\n\t" \
1422 "adc %%ch, %%cl\n\t"
1423
1424#define SqrAccumulateCentre(i) \
1425 "mov 4*"#i"(%%esi), %%eax\n\t" \
1426 "mull 4*"#i"(%%esi)\n\t" \
1427 "add %%eax, %%ebp\n\t" \
1428 "adc %%edx, %%ebx\n\t" \
1429 "adc %%ch, %%cl\n\t"
1430
1431#define SqrStoreDigit(X) \
1432 "mov %%ebp, 4*"#X"(%%edi)\n\t" \
1433
1434#define SqrLastDiagonal(digits) \
1435 "mov 4*("#digits"-1)(%%esi), %%eax\n\t" \
1436 "mull 4*("#digits"-1)(%%esi)\n\t" \
1437 "add %%eax, %%ebp\n\t" \
1438 "adc %%edx, %%ebx\n\t" \
1439 "mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \
1440 "mov %%ebx, 4*(2*"#digits"-1)(%%edi)\n\t"
1441
1442#define SqrCleanup \
1443 "pop %%ebx\n\t" \
1444 "pop %%esi\n\t" \
1445 "pop %%ebp\n\t"
1446
1447void PentiumOptimized::Square4(word* Y, const word* X)
1448{
1449__asm__ __volatile__(
1450SqrStartup
1451
1452SqrAccumulateCentre(0)
1453SqrStoreDigit(0)
1454SqrShiftCarry
1455
1456SqrAccumulate(1,0)
1457SqrStoreDigit(1)
1458SqrShiftCarry
1459
1460SqrAccumulate(2,0)
1461SqrAccumulateCentre(1)
1462SqrStoreDigit(2)
1463SqrShiftCarry
1464
1465SqrAccumulate(3,0)
1466SqrAccumulate(2,1)
1467SqrStoreDigit(3)
1468SqrShiftCarry
1469
1470SqrAccumulate(3,1)
1471SqrAccumulateCentre(2)
1472SqrStoreDigit(4)
1473SqrShiftCarry
1474
1475SqrAccumulate(3,2)
1476SqrStoreDigit(5)
1477SqrShiftCarry
1478
1479SqrLastDiagonal(4)
1480
1481SqrCleanup
1482
1483:
1484: "D" (Y), "S" (X)
1485: "eax", "ecx", "edx", "ebp", "memory"
1486);
1487}
1488
1489#define MulStartup \
1490 "push %%ebp\n\t" \
1491 "push %%esi\n\t" \
1492 "push %%ebx\n\t" \
1493 "push %%edi\n\t" \
1494 "mov %%eax, %%ebx \n\t" \
1495 "xor %%ebp, %%ebp\n\t" \
1496 "xor %%edi, %%edi\n\t" \
1497 "xor %%ecx, %%ecx\n\t"
1498
1499#define MulShiftCarry \
1500 "mov %%edx, %%ebp\n\t" \
1501 "mov %%ecx, %%edi\n\t" \
1502 "xor %%ecx, %%ecx\n\t"
1503
1504#define MulAccumulate(i,j) \
1505 "mov 4*"#j"(%%ebx), %%eax\n\t" \
1506 "mull 4*"#i"(%%esi)\n\t" \
1507 "add %%eax, %%ebp\n\t" \
1508 "adc %%edx, %%edi\n\t" \
1509 "adc %%ch, %%cl\n\t"
1510
1511#define MulStoreDigit(X) \
1512 "mov %%edi, %%edx \n\t" \
1513 "mov (%%esp), %%edi \n\t" \
1514 "mov %%ebp, 4*"#X"(%%edi)\n\t" \
1515 "mov %%edi, (%%esp)\n\t"
1516
1517#define MulLastDiagonal(digits) \
1518 "mov 4*("#digits"-1)(%%ebx), %%eax\n\t" \
1519 "mull 4*("#digits"-1)(%%esi)\n\t" \
1520 "add %%eax, %%ebp\n\t" \
1521 "adc %%edi, %%edx\n\t" \
1522 "mov (%%esp), %%edi\n\t" \
1523 "mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \
1524 "mov %%edx, 4*(2*"#digits"-1)(%%edi)\n\t"
1525
1526#define MulCleanup \
1527 "pop %%edi\n\t" \
1528 "pop %%ebx\n\t" \
1529 "pop %%esi\n\t" \
1530 "pop %%ebp\n\t"
1531
1532void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
1533{
1534__asm__ __volatile__(
1535MulStartup
1536MulAccumulate(0,0)
1537MulStoreDigit(0)
1538MulShiftCarry
1539
1540MulAccumulate(1,0)
1541MulAccumulate(0,1)
1542MulStoreDigit(1)
1543MulShiftCarry
1544
1545MulAccumulate(2,0)
1546MulAccumulate(1,1)
1547MulAccumulate(0,2)
1548MulStoreDigit(2)
1549MulShiftCarry
1550
1551MulAccumulate(3,0)
1552MulAccumulate(2,1)
1553MulAccumulate(1,2)
1554MulAccumulate(0,3)
1555MulStoreDigit(3)
1556MulShiftCarry
1557
1558MulAccumulate(3,1)
1559MulAccumulate(2,2)
1560MulAccumulate(1,3)
1561MulStoreDigit(4)
1562MulShiftCarry
1563
1564MulAccumulate(3,2)
1565MulAccumulate(2,3)
1566MulStoreDigit(5)
1567MulShiftCarry
1568
1569MulLastDiagonal(4)
1570
1571MulCleanup
1572
1573:
1574: "D" (Z), "S" (X), "a" (Y)
1575: "%ecx", "%edx", "memory"
1576);
1577}
1578
1579void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
1580{
1581__asm__ __volatile__(
1582MulStartup
1583MulAccumulate(0,0)
1584MulStoreDigit(0)
1585MulShiftCarry
1586
1587MulAccumulate(1,0)
1588MulAccumulate(0,1)
1589MulStoreDigit(1)
1590MulShiftCarry
1591
1592MulAccumulate(2,0)
1593MulAccumulate(1,1)
1594MulAccumulate(0,2)
1595MulStoreDigit(2)
1596MulShiftCarry
1597
1598MulAccumulate(3,0)
1599MulAccumulate(2,1)
1600MulAccumulate(1,2)
1601MulAccumulate(0,3)
1602MulStoreDigit(3)
1603MulShiftCarry
1604
1605MulAccumulate(4,0)
1606MulAccumulate(3,1)
1607MulAccumulate(2,2)
1608MulAccumulate(1,3)
1609MulAccumulate(0,4)
1610MulStoreDigit(4)
1611MulShiftCarry
1612
1613MulAccumulate(5,0)
1614MulAccumulate(4,1)
1615MulAccumulate(3,2)
1616MulAccumulate(2,3)
1617MulAccumulate(1,4)
1618MulAccumulate(0,5)
1619MulStoreDigit(5)
1620MulShiftCarry
1621
1622MulAccumulate(6,0)
1623MulAccumulate(5,1)
1624MulAccumulate(4,2)
1625MulAccumulate(3,3)
1626MulAccumulate(2,4)
1627MulAccumulate(1,5)
1628MulAccumulate(0,6)
1629MulStoreDigit(6)
1630MulShiftCarry
1631
1632MulAccumulate(7,0)
1633MulAccumulate(6,1)
1634MulAccumulate(5,2)
1635MulAccumulate(4,3)
1636MulAccumulate(3,4)
1637MulAccumulate(2,5)
1638MulAccumulate(1,6)
1639MulAccumulate(0,7)
1640MulStoreDigit(7)
1641MulShiftCarry
1642
1643MulAccumulate(7,1)
1644MulAccumulate(6,2)
1645MulAccumulate(5,3)
1646MulAccumulate(4,4)
1647MulAccumulate(3,5)
1648MulAccumulate(2,6)
1649MulAccumulate(1,7)
1650MulStoreDigit(8)
1651MulShiftCarry
1652
1653MulAccumulate(7,2)
1654MulAccumulate(6,3)
1655MulAccumulate(5,4)
1656MulAccumulate(4,5)
1657MulAccumulate(3,6)
1658MulAccumulate(2,7)
1659MulStoreDigit(9)
1660MulShiftCarry
1661
1662MulAccumulate(7,3)
1663MulAccumulate(6,4)
1664MulAccumulate(5,5)
1665MulAccumulate(4,6)
1666MulAccumulate(3,7)
1667MulStoreDigit(10)
1668MulShiftCarry
1669
1670MulAccumulate(7,4)
1671MulAccumulate(6,5)
1672MulAccumulate(5,6)
1673MulAccumulate(4,7)
1674MulStoreDigit(11)
1675MulShiftCarry
1676
1677MulAccumulate(7,5)
1678MulAccumulate(6,6)
1679MulAccumulate(5,7)
1680MulStoreDigit(12)
1681MulShiftCarry
1682
1683MulAccumulate(7,6)
1684MulAccumulate(6,7)
1685MulStoreDigit(13)
1686MulShiftCarry
1687
1688MulLastDiagonal(8)
1689
1690MulCleanup
1691
1692:
1693: "D" (Z), "S" (X), "a" (Y)
1694: "%ecx", "%edx", "memory"
1695);
1696}
1697
1698#elif defined(__GNUC__) && defined(__alpha__)
1699
1700class AlphaOptimized : public Portable
1701{
1702public:
1703static inline void Multiply2(word *C, const word *A, const word *B);
1704static inline word Multiply2Add(word *C, const word *A, const word *B);
1705static inline void Multiply4(word *C, const word *A, const word *B);
1706static inline unsigned int MultiplyRecursionLimit() {return 4;}
1707
1708static inline void Multiply4Bottom(word *C, const word *A, const word *B);
1709static inline unsigned int MultiplyBottomRecursionLimit() {return 4;}
1710
1711static inline void Square4(word *R, const word *A)
1712{
1713Multiply4(R, A, A);
1714}
1715};
1716
1717typedef AlphaOptimized LowLevel;
1718
1719inline void AlphaOptimized::Multiply2(word *C, const word *A, const word *B)
1720{
1721register dword c, a = *(const dword *)A, b = *(const dword *)B;
1722((dword *)C)[0] = a*b;
1723__asm__("umulh %1,%2,%0" : "=r" (c) : "r" (a), "r" (b));
1724((dword *)C)[1] = c;
1725}
1726
1727inline word AlphaOptimized::Multiply2Add(word *C, const word *A, const word *B)
1728{
1729register dword c, d, e, a = *(const dword *)A, b = *(const dword *)B;
1730c = ((dword *)C)[0];
1731d = a*b + c;
1732__asm__("umulh %1,%2,%0" : "=r" (e) : "r" (a), "r" (b));
1733((dword *)C)[0] = d;
1734d = (d < c);
1735c = ((dword *)C)[1] + d;
1736d = (c < d);
1737c += e;
1738((dword *)C)[1] = c;
1739d |= (c < e);
1740return d;
1741}
1742
1743inline void AlphaOptimized::Multiply4(word *R, const word *A, const word *B)
1744{
1745Multiply2(R, A, B);
1746Multiply2(R+4, A+2, B+2);
1747word carry = Multiply2Add(R+2, A+0, B+2);
1748carry += Multiply2Add(R+2, A+2, B+0);
1749Increment(R+6, 2, carry);
1750}
1751
1752static inline void Multiply2BottomAdd(word *C, const word *A, const word *B)
1753{
1754register dword a = *(const dword *)A, b = *(const dword *)B;
1755((dword *)C)[0] = a*b + ((dword *)C)[0];
1756}
1757
1758inline void AlphaOptimized::Multiply4Bottom(word *R, const word *A, const word *B)
1759{
1760Multiply2(R, A, B);
1761Multiply2BottomAdd(R+2, A+0, B+2);
1762Multiply2BottomAdd(R+2, A+2, B+0);
1763}
1764
1765#else// no processor specific code available
1766
1767typedef Portable LowLevel;
1768
1769#endif
1770
1771// ********************************************************
1772
1773#define A0A
1774#define A1(A+N2)
1775#define B0B
1776#define B1(B+N2)
1777
1778#define T0T
1779#define T1(T+N2)
1780#define T2(T+N)
1781#define T3(T+N+N2)
1782
1783#define R0R
1784#define R1(R+N2)
1785#define R2(R+N)
1786#define R3(R+N+N2)
1787
1788//VC60 workaround: compiler bug triggered without the extra dummy parameters
1789
1790// R[2*N] - result = A*B
1791// T[2*N] - temporary work space
1792// A[N] --- multiplier
1793// B[N] --- multiplicant
1794
1795template <class P>
1796void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL);
1797
1798template <class P>
1799inline void RecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL)
1800{
1801assert(N>=2 && N%2==0);
1802
1803if (P::MultiplyRecursionLimit() >= 8 && N==8)
1804P::Multiply8(R, A, B);
1805else if (P::MultiplyRecursionLimit() >= 4 && N==4)
1806P::Multiply4(R, A, B);
1807else if (N==2)
1808P::Multiply2(R, A, B);
1809else
1810DoRecursiveMultiply<P>(R, T, A, B, N, NULL);// VC60 workaround: needs this NULL
1811}
1812
1813template <class P>
1814void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy)
1815{
1816const unsigned int N2 = N/2;
1817int carry;
1818
1819int aComp = Compare(A0, A1, N2);
1820int bComp = Compare(B0, B1, N2);
1821
1822switch (2*aComp + aComp + bComp)
1823{
1824case -4:
1825P::Subtract(R0, A1, A0, N2);
1826P::Subtract(R1, B0, B1, N2);
1827RecursiveMultiply<P>(T0, T2, R0, R1, N2);
1828P::Subtract(T1, T1, R0, N2);
1829carry = -1;
1830break;
1831case -2:
1832P::Subtract(R0, A1, A0, N2);
1833P::Subtract(R1, B0, B1, N2);
1834RecursiveMultiply<P>(T0, T2, R0, R1, N2);
1835carry = 0;
1836break;
1837case 2:
1838P::Subtract(R0, A0, A1, N2);
1839P::Subtract(R1, B1, B0, N2);
1840RecursiveMultiply<P>(T0, T2, R0, R1, N2);
1841carry = 0;
1842break;
1843case 4:
1844P::Subtract(R0, A1, A0, N2);
1845P::Subtract(R1, B0, B1, N2);
1846RecursiveMultiply<P>(T0, T2, R0, R1, N2);
1847P::Subtract(T1, T1, R1, N2);
1848carry = -1;
1849break;
1850default:
1851SetWords(T0, 0, N);
1852carry = 0;
1853}
1854
1855RecursiveMultiply<P>(R0, T2, A0, B0, N2);
1856RecursiveMultiply<P>(R2, T2, A1, B1, N2);
1857
1858// now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1
1859
1860carry += P::Add(T0, T0, R0, N);
1861carry += P::Add(T0, T0, R2, N);
1862carry += P::Add(R1, R1, T0, N);
1863
1864assert (carry >= 0 && carry <= 2);
1865Increment(R3, N2, carry);
1866}
1867
1868// R[2*N] - result = A*A
1869// T[2*N] - temporary work space
1870// A[N] --- number to be squared
1871
1872template <class P>
1873void DoRecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy=NULL);
1874
1875template <class P>
1876inline void RecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy=NULL)
1877{
1878assert(N && N%2==0);
1879if (P::SquareRecursionLimit() >= 8 && N==8)
1880P::Square8(R, A);
1881if (P::SquareRecursionLimit() >= 4 && N==4)
1882P::Square4(R, A);
1883else if (N==2)
1884P::Square2(R, A);
1885else
1886DoRecursiveSquare<P>(R, T, A, N, NULL);// VC60 workaround: needs this NULL
1887}
1888
1889template <class P>
1890void DoRecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy)
1891{
1892const unsigned int N2 = N/2;
1893
1894RecursiveSquare<P>(R0, T2, A0, N2);
1895RecursiveSquare<P>(R2, T2, A1, N2);
1896RecursiveMultiply<P>(T0, T2, A0, A1, N2);
1897
1898word carry = P::Add(R1, R1, T0, N);
1899carry += P::Add(R1, R1, T0, N);
1900Increment(R3, N2, carry);
1901}
1902
1903// R[N] - bottom half of A*B
1904// T[N] - temporary work space
1905// A[N] - multiplier
1906// B[N] - multiplicant
1907
1908template <class P>
1909void DoRecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL);
1910
1911template <class P>
1912inline void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL)
1913{
1914assert(N>=2 && N%2==0);
1915if (P::MultiplyBottomRecursionLimit() >= 8 && N==8)
1916P::Multiply8Bottom(R, A, B);
1917else if (P::MultiplyBottomRecursionLimit() >= 4 && N==4)
1918P::Multiply4Bottom(R, A, B);
1919else if (N==2)
1920P::Multiply2Bottom(R, A, B);
1921else
1922DoRecursiveMultiplyBottom<P>(R, T, A, B, N, NULL);
1923}
1924
1925template <class P>
1926void DoRecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy)
1927{
1928const unsigned int N2 = N/2;
1929
1930RecursiveMultiply<P>(R, T, A0, B0, N2);
1931RecursiveMultiplyBottom<P>(T0, T1, A1, B0, N2);
1932P::Add(R1, R1, T0, N2);
1933RecursiveMultiplyBottom<P>(T0, T1, A0, B1, N2);
1934P::Add(R1, R1, T0, N2);
1935}
1936
1937// R[N] --- upper half of A*B
1938// T[2*N] - temporary work space
1939// L[N] --- lower half of A*B
1940// A[N] --- multiplier
1941// B[N] --- multiplicant
1942
1943template <class P>
1944void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N, const P *dummy=NULL)
1945{
1946assert(N>=2 && N%2==0);
1947
1948if (N==4)
1949{
1950P::Multiply4(T, A, B);
1951((dword *)R)[0] = ((dword *)T)[2];
1952((dword *)R)[1] = ((dword *)T)[3];
1953}
1954else if (N==2)
1955{
1956P::Multiply2(T, A, B);
1957((dword *)R)[0] = ((dword *)T)[1];
1958}
1959else
1960{
1961const unsigned int N2 = N/2;
1962int carry;
1963
1964int aComp = Compare(A0, A1, N2);
1965int bComp = Compare(B0, B1, N2);
1966
1967switch (2*aComp + aComp + bComp)
1968{
1969case -4:
1970P::Subtract(R0, A1, A0, N2);
1971P::Subtract(R1, B0, B1, N2);
1972RecursiveMultiply<P>(T0, T2, R0, R1, N2);
1973P::Subtract(T1, T1, R0, N2);
1974carry = -1;
1975break;
1976case -2:
1977P::Subtract(R0, A1, A0, N2);
1978P::Subtract(R1, B0, B1, N2);
1979RecursiveMultiply<P>(T0, T2, R0, R1, N2);
1980carry = 0;
1981break;
1982case 2:
1983P::Subtract(R0, A0, A1, N2);
1984P::Subtract(R1, B1, B0, N2);
1985RecursiveMultiply<P>(T0, T2, R0, R1, N2);
1986carry = 0;
1987break;
1988case 4:
1989P::Subtract(R0, A1, A0, N2);
1990P::Subtract(R1, B0, B1, N2);
1991RecursiveMultiply<P>(T0, T2, R0, R1, N2);
1992P::Subtract(T1, T1, R1, N2);
1993carry = -1;
1994break;
1995default:
1996SetWords(T0, 0, N);
1997carry = 0;
1998}
1999
2000RecursiveMultiply<P>(T2, R0, A1, B1, N2);
2001
2002// now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1
2003
2004word c2 = P::Subtract(R0, L+N2, L, N2);
2005c2 += P::Subtract(R0, R0, T0, N2);
2006word t = (Compare(R0, T2, N2) == -1);
2007
2008carry += t;
2009carry += Increment(R0, N2, c2+t);
2010carry += P::Add(R0, R0, T1, N2);
2011carry += P::Add(R0, R0, T3, N2);
2012assert (carry >= 0 && carry <= 2);
2013
2014CopyWords(R1, T3, N2);
2015Increment(R1, N2, carry);
2016}
2017}
2018
2019inline word Add(word *C, const word *A, const word *B, unsigned int N)
2020{
2021return LowLevel::Add(C, A, B, N);
2022}
2023
2024inline word Subtract(word *C, const word *A, const word *B, unsigned int N)
2025{
2026return LowLevel::Subtract(C, A, B, N);
2027}
2028
2029inline void Multiply(word *R, word *T, const word *A, const word *B, unsigned int N)
2030{
2031#ifdef SSE2_INTRINSICS_AVAILABLE
2032if (HasSSE2())
2033RecursiveMultiply<P4Optimized>(R, T, A, B, N);
2034else
2035#endif
2036RecursiveMultiply<LowLevel>(R, T, A, B, N);
2037}
2038
2039inline void Square(word *R, word *T, const word *A, unsigned int N)
2040{
2041#ifdef SSE2_INTRINSICS_AVAILABLE
2042if (HasSSE2())
2043RecursiveSquare<P4Optimized>(R, T, A, N);
2044else
2045#endif
2046RecursiveSquare<LowLevel>(R, T, A, N);
2047}
2048
2049inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N)
2050{
2051#ifdef SSE2_INTRINSICS_AVAILABLE
2052if (HasSSE2())
2053RecursiveMultiplyBottom<P4Optimized>(R, T, A, B, N);
2054else
2055#endif
2056RecursiveMultiplyBottom<LowLevel>(R, T, A, B, N);
2057}
2058
2059inline void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N)
2060{
2061#ifdef SSE2_INTRINSICS_AVAILABLE
2062if (HasSSE2())
2063RecursiveMultiplyTop<P4Optimized>(R, T, L, A, B, N);
2064else
2065#endif
2066RecursiveMultiplyTop<LowLevel>(R, T, L, A, B, N);
2067}
2068
2069// R[NA+NB] - result = A*B
2070// T[NA+NB] - temporary work space
2071// A[NA] ---- multiplier
2072// B[NB] ---- multiplicant
2073
2074void AsymmetricMultiply(word *R, word *T, const word *A, unsigned int NA, const word *B, unsigned int NB)
2075{
2076if (NA == NB)
2077{
2078if (A == B)
2079Square(R, T, A, NA);
2080else
2081Multiply(R, T, A, B, NA);
2082
2083return;
2084}
2085
2086if (NA > NB)
2087{
2088std::swap(A, B);
2089std::swap(NA, NB);
2090}
2091
2092assert(NB % NA == 0);
2093assert((NB/NA)%2 == 0); // NB is an even multiple of NA
2094
2095if (NA==2 && !A[1])
2096{
2097switch (A[0])
2098{
2099case 0:
2100SetWords(R, 0, NB+2);
2101return;
2102case 1:
2103CopyWords(R, B, NB);
2104R[NB] = R[NB+1] = 0;
2105return;
2106default:
2107R[NB] = LinearMultiply(R, B, A[0], NB);
2108R[NB+1] = 0;
2109return;
2110}
2111}
2112
2113Multiply(R, T, A, B, NA);
2114CopyWords(T+2*NA, R+NA, NA);
2115
2116unsigned i;
2117
2118for (i=2*NA; i<NB; i+=2*NA)
2119Multiply(T+NA+i, T, A, B+i, NA);
2120for (i=NA; i<NB; i+=2*NA)
2121Multiply(R+i, T, A, B+i, NA);
2122
2123if (Add(R+NA, R+NA, T+2*NA, NB-NA))
2124Increment(R+NB, NA);
2125}
2126
2127// R[N] ----- result = A inverse mod 2**(WORD_BITS*N)
2128// T[3*N/2] - temporary work space
2129// A[N] ----- an odd number as input
2130
2131void RecursiveInverseModPower2(word *R, word *T, const word *A, unsigned int N)
2132{
2133if (N==2)
2134AtomicInverseModPower2(R, A[0], A[1]);
2135else
2136{
2137const unsigned int N2 = N/2;
2138RecursiveInverseModPower2(R0, T0, A0, N2);
2139T0[0] = 1;
2140SetWords(T0+1, 0, N2-1);
2141MultiplyTop(R1, T1, T0, R0, A0, N2);
2142MultiplyBottom(T0, T1, R0, A1, N2);
2143Add(T0, R1, T0, N2);
2144TwosComplement(T0, N2);
2145MultiplyBottom(R1, T1, R0, T0, N2);
2146}
2147}
2148
2149// R[N] --- result = X/(2**(WORD_BITS*N)) mod M
2150// T[3*N] - temporary work space
2151// X[2*N] - number to be reduced
2152// M[N] --- modulus
2153// U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)
2154
2155void MontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, unsigned int N)
2156{
2157MultiplyBottom(R, T, X, U, N);
2158MultiplyTop(T, T+N, X, R, M, N);
2159word borrow = Subtract(T, X+N, T, N);
2160// defend against timing attack by doing this Add even when not needed
2161word carry = Add(T+N, T, M, N);
2162assert(carry || !borrow);
2163CopyWords(R, T + (borrow ? N : 0), N);
2164}
2165
2166// R[N] --- result = X/(2**(WORD_BITS*N/2)) mod M
2167// T[2*N] - temporary work space
2168// X[2*N] - number to be reduced
2169// M[N] --- modulus
2170// U[N/2] - multiplicative inverse of M mod 2**(WORD_BITS*N/2)
2171// V[N] --- 2**(WORD_BITS*3*N/2) mod M
2172
2173void HalfMontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, const word *V, unsigned int N)
2174{
2175assert(N%2==0 && N>=4);
2176
2177#define M0M
2178#define M1(M+N2)
2179#define V0V
2180#define V1(V+N2)
2181
2182#define X0X
2183#define X1(X+N2)
2184#define X2(X+N)
2185#define X3(X+N+N2)
2186
2187const unsigned int N2 = N/2;
2188Multiply(T0, T2, V0, X3, N2);
2189int c2 = Add(T0, T0, X0, N);
2190MultiplyBottom(T3, T2, T0, U, N2);
2191MultiplyTop(T2, R, T0, T3, M0, N2);
2192c2 -= Subtract(T2, T1, T2, N2);
2193Multiply(T0, R, T3, M1, N2);
2194c2 -= Subtract(T0, T2, T0, N2);
2195int c3 = -(int)Subtract(T1, X2, T1, N2);
2196Multiply(R0, T2, V1, X3, N2);
2197c3 += Add(R, R, T, N);
2198
2199if (c2>0)
2200c3 += Increment(R1, N2);
2201else if (c2<0)
2202c3 -= Decrement(R1, N2, -c2);
2203
2204assert(c3>=-1 && c3<=1);
2205if (c3>0)
2206Subtract(R, R, M, N);
2207else if (c3<0)
2208Add(R, R, M, N);
2209
2210#undef M0
2211#undef M1
2212#undef V0
2213#undef V1
2214
2215#undef X0
2216#undef X1
2217#undef X2
2218#undef X3
2219}
2220
2221#undef A0
2222#undef A1
2223#undef B0
2224#undef B1
2225
2226#undef T0
2227#undef T1
2228#undef T2
2229#undef T3
2230
2231#undef R0
2232#undef R1
2233#undef R2
2234#undef R3
2235
2236// do a 3 word by 2 word divide, returns quotient and leaves remainder in A
2237static word SubatomicDivide(word *A, word B0, word B1)
2238{
2239// assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a word
2240assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));
2241
2242dword p, u;
2243word Q;
2244
2245// estimate the quotient: do a 2 word by 1 word divide
2246if (B1+1 == 0)
2247Q = A[2];
2248else
2249Q = word(MAKE_DWORD(A[1], A[2]) / (B1+1));
2250
2251// now subtract Q*B from A
2252p = (dword) B0*Q;
2253u = (dword) A[0] - LOW_WORD(p);
2254A[0] = LOW_WORD(u);
2255u = (dword) A[1] - HIGH_WORD(p) - (word)(0-HIGH_WORD(u)) - (dword)B1*Q;
2256A[1] = LOW_WORD(u);
2257A[2] += HIGH_WORD(u);
2258
2259// Q <= actual quotient, so fix it
2260while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
2261{
2262u = (dword) A[0] - B0;
2263A[0] = LOW_WORD(u);
2264u = (dword) A[1] - B1 - (word)(0-HIGH_WORD(u));
2265A[1] = LOW_WORD(u);
2266A[2] += HIGH_WORD(u);
2267Q++;
2268assert(Q);// shouldn't overflow
2269}
2270
2271return Q;
2272}
2273
2274// do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
2275static inline void AtomicDivide(word *Q, const word *A, const word *B)
2276{
2277if (!B[0] && !B[1]) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
2278{
2279Q[0] = A[2];
2280Q[1] = A[3];
2281}
2282else
2283{
2284word T[4];
2285T[0] = A[0]; T[1] = A[1]; T[2] = A[2]; T[3] = A[3];
2286Q[1] = SubatomicDivide(T+1, B[0], B[1]);
2287Q[0] = SubatomicDivide(T, B[0], B[1]);
2288
2289#ifndef NDEBUG
2290// multiply quotient and divisor and add remainder, make sure it equals dividend
2291assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
2292word P[4];
2293LowLevel::Multiply2(P, Q, B);
2294Add(P, P, T, 4);
2295assert(memcmp(P, A, 4*WORD_SIZE)==0);
2296#endif
2297}
2298}
2299
2300// for use by Divide(), corrects the underestimated quotient {Q1,Q0}
2301static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, unsigned int N)
2302{
2303assert(N && N%2==0);
2304
2305if (Q[1])
2306{
2307T[N] = T[N+1] = 0;
2308unsigned i;
2309for (i=0; i<N; i+=4)
2310LowLevel::Multiply2(T+i, Q, B+i);
2311for (i=2; i<N; i+=4)
2312if (LowLevel::Multiply2Add(T+i, Q, B+i))
2313T[i+5] += (++T[i+4]==0);
2314}
2315else
2316{
2317T[N] = LinearMultiply(T, B, Q[0], N);
2318T[N+1] = 0;
2319}
2320
2321word borrow = Subtract(R, R, T, N+2);
2322assert(!borrow && !R[N+1]);
2323
2324while (R[N] || Compare(R, B, N) >= 0)
2325{
2326R[N] -= Subtract(R, R, B, N);
2327Q[1] += (++Q[0]==0);
2328assert(Q[0] || Q[1]); // no overflow
2329}
2330}
2331
2332// R[NB] -------- remainder = A%B
2333// Q[NA-NB+2] --- quotient= A/B
2334// T[NA+2*NB+4] - temp work space
2335// A[NA] -------- dividend
2336// B[NB] -------- divisor
2337
2338void Divide(word *R, word *Q, word *T, const word *A, unsigned int NA, const word *B, unsigned int NB)
2339{
2340assert(NA && NB && NA%2==0 && NB%2==0);
2341assert(B[NB-1] || B[NB-2]);
2342assert(NB <= NA);
2343
2344// set up temporary work space
2345word *const TA=T;
2346word *const TB=T+NA+2;
2347word *const TP=T+NA+2+NB;
2348
2349// copy B into TB and normalize it so that TB has highest bit set to 1
2350unsigned shiftWords = (B[NB-1]==0);
2351TB[0] = TB[NB-1] = 0;
2352CopyWords(TB+shiftWords, B, NB-shiftWords);
2353unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]);
2354assert(shiftBits < WORD_BITS);
2355ShiftWordsLeftByBits(TB, NB, shiftBits);
2356
2357// copy A into TA and normalize it
2358TA[0] = TA[NA] = TA[NA+1] = 0;
2359CopyWords(TA+shiftWords, A, NA);
2360ShiftWordsLeftByBits(TA, NA+2, shiftBits);
2361
2362if (TA[NA+1]==0 && TA[NA] <= 1)
2363{
2364Q[NA-NB+1] = Q[NA-NB] = 0;
2365while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0)
2366{
2367TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB);
2368++Q[NA-NB];
2369}
2370}
2371else
2372{
2373NA+=2;
2374assert(Compare(TA+NA-NB, TB, NB) < 0);
2375}
2376
2377word BT[2];
2378BT[0] = TB[NB-2] + 1;
2379BT[1] = TB[NB-1] + (BT[0]==0);
2380
2381// start reducing TA mod TB, 2 words at a time
2382for (unsigned i=NA-2; i>=NB; i-=2)
2383{
2384AtomicDivide(Q+i-NB, TA+i-2, BT);
2385CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);
2386}
2387
2388// copy TA into R, and denormalize it
2389CopyWords(R, TA+shiftWords, NB);
2390ShiftWordsRightByBits(R, NB, shiftBits);
2391}
2392
2393static inline unsigned int EvenWordCount(const word *X, unsigned int N)
2394{
2395while (N && X[N-2]==0 && X[N-1]==0)
2396N-=2;
2397return N;
2398}
2399
2400// return k
2401// R[N] --- result = A^(-1) * 2^k mod M
2402// T[4*N] - temporary work space
2403// A[NA] -- number to take inverse of
2404// M[N] --- modulus
2405
2406unsigned int AlmostInverse(word *R, word *T, const word *A, unsigned int NA, const word *M, unsigned int N)
2407{
2408assert(NA<=N && N && N%2==0);
2409
2410word *b = T;
2411word *c = T+N;
2412word *f = T+2*N;
2413word *g = T+3*N;
2414unsigned int bcLen=2, fgLen=EvenWordCount(M, N);
2415unsigned int k=0, s=0;
2416
2417SetWords(T, 0, 3*N);
2418b[0]=1;
2419CopyWords(f, A, NA);
2420CopyWords(g, M, N);
2421
2422while (1)
2423{
2424word t=f[0];
2425while (!t)
2426{
2427if (EvenWordCount(f, fgLen)==0)
2428{
2429SetWords(R, 0, N);
2430return 0;
2431}
2432
2433ShiftWordsRightByWords(f, fgLen, 1);
2434if (c[bcLen-1]) bcLen+=2;
2435assert(bcLen <= N);
2436ShiftWordsLeftByWords(c, bcLen, 1);
2437k+=WORD_BITS;
2438t=f[0];
2439}
2440
2441unsigned int i=0;
2442while (t%2 == 0)
2443{
2444t>>=1;
2445i++;
2446}
2447k+=i;
2448
2449if (t==1 && f[1]==0 && EvenWordCount(f, fgLen)==2)
2450{
2451if (s%2==0)
2452CopyWords(R, b, N);
2453else
2454Subtract(R, M, b, N);
2455return k;
2456}
2457
2458ShiftWordsRightByBits(f, fgLen, i);
2459t=ShiftWordsLeftByBits(c, bcLen, i);
2460if (t)
2461{
2462c[bcLen] = t;
2463bcLen+=2;
2464assert(bcLen <= N);
2465}
2466
2467if (f[fgLen-2]==0 && g[fgLen-2]==0 && f[fgLen-1]==0 && g[fgLen-1]==0)
2468fgLen-=2;
2469
2470if (Compare(f, g, fgLen)==-1)
2471{
2472std::swap(f, g);
2473std::swap(b, c);
2474s++;
2475}
2476
2477Subtract(f, f, g, fgLen);
2478
2479if (Add(b, b, c, bcLen))
2480{
2481b[bcLen] = 1;
2482bcLen+=2;
2483assert(bcLen <= N);
2484}
2485}
2486}
2487
2488// R[N] - result = A/(2^k) mod M
2489// A[N] - input
2490// M[N] - modulus
2491
2492void DivideByPower2Mod(word *R, const word *A, unsigned int k, const word *M, unsigned int N)
2493{
2494CopyWords(R, A, N);
2495
2496while (k--)
2497{
2498if (R[0]%2==0)
2499ShiftWordsRightByBits(R, N, 1);
2500else
2501{
2502word carry = Add(R, R, M, N);
2503ShiftWordsRightByBits(R, N, 1);
2504R[N-1] += carry<<(WORD_BITS-1);
2505}
2506}
2507}
2508
2509// R[N] - result = A*(2^k) mod M
2510// A[N] - input
2511// M[N] - modulus
2512
2513void MultiplyByPower2Mod(word *R, const word *A, unsigned int k, const word *M, unsigned int N)
2514{
2515CopyWords(R, A, N);
2516
2517while (k--)
2518if (ShiftWordsLeftByBits(R, N, 1) || Compare(R, M, N)>=0)
2519Subtract(R, R, M, N);
2520}
2521
2522// ******************************************************************
2523
2524static const unsigned int RoundupSizeTable[] = {2, 2, 2, 4, 4, 8, 8, 8, 8};
2525
2526static inline unsigned int RoundupSize(unsigned int n)
2527{
2528if (n<=8)
2529return RoundupSizeTable[n];
2530else if (n<=16)
2531return 16;
2532else if (n<=32)
2533return 32;
2534else if (n<=64)
2535return 64;
2536else return 1U << BitPrecision(n-1);
2537}
2538
2539Integer::Integer()
2540: reg(2), sign(POSITIVE)
2541{
2542reg[0] = reg[1] = 0;
2543}
2544
2545Integer::Integer(const Integer& t)
2546: reg(RoundupSize(t.WordCount())), sign(t.sign)
2547{
2548CopyWords(reg, t.reg, reg.size());
2549}
2550
2551Integer::Integer(signed long value)
2552: reg(2)
2553{
2554if (value >= 0)
2555sign = POSITIVE;
2556else
2557{
2558sign = NEGATIVE;
2559value = -value;
2560}
2561reg[0] = word(value);
2562reg[1] = word(SafeRightShift<WORD_BITS, unsigned long>(value));
2563}
2564
2565Integer::Integer(Sign s, word high, word low)
2566: reg(2), sign(s)
2567{
2568reg[0] = low;
2569reg[1] = high;
2570}
2571
2572bool Integer::IsConvertableToLong() const
2573{
2574if (ByteCount() > sizeof(long))
2575return false;
2576
2577unsigned long value = reg[0];
2578value += SafeLeftShift<WORD_BITS, unsigned long>(reg[1]);
2579
2580if (sign==POSITIVE)
2581return (signed long)value >= 0;
2582else
2583return -(signed long)value < 0;
2584}
2585
2586signed long Integer::ConvertToLong() const
2587{
2588assert(IsConvertableToLong());
2589
2590unsigned long value = reg[0];
2591value += SafeLeftShift<WORD_BITS, unsigned long>(reg[1]);
2592return sign==POSITIVE ? value : -(signed long)value;
2593}
2594
2595Integer::Integer(BufferedTransformation &encodedInteger, unsigned int byteCount, Signedness s)
2596{
2597Decode(encodedInteger, byteCount, s);
2598}
2599
2600Integer::Integer(const byte *encodedInteger, unsigned int byteCount, Signedness s)
2601{
2602Decode(encodedInteger, byteCount, s);
2603}
2604
2605Integer::Integer(BufferedTransformation &bt)
2606{
2607BERDecode(bt);
2608}
2609
2610Integer::Integer(RandomNumberGenerator &rng, unsigned int bitcount)
2611{
2612Randomize(rng, bitcount);
2613}
2614
2615Integer::Integer(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod)
2616{
2617if (!Randomize(rng, min, max, rnType, equiv, mod))
2618throw Integer::RandomNumberNotFound();
2619}
2620
2621Integer Integer::Power2(unsigned int e)
2622{
2623Integer r((word)0, BitsToWords(e+1));
2624r.SetBit(e);
2625return r;
2626}
2627
2628const Integer &Integer::Zero()
2629{
2630static const Integer zero;
2631return zero;
2632}
2633
2634const Integer &Integer::One()
2635{
2636static const Integer one(1,2);
2637return one;
2638}
2639
2640const Integer &Integer::Two()
2641{
2642static const Integer two(2,2);
2643return two;
2644}
2645
2646bool Integer::operator!() const
2647{
2648return IsNegative() ? false : (reg[0]==0 && WordCount()==0);
2649}
2650
2651Integer& Integer::operator=(const Integer& t)
2652{
2653if (this != &t)
2654{
2655reg.New(RoundupSize(t.WordCount()));
2656CopyWords(reg, t.reg, reg.size());
2657sign = t.sign;
2658}
2659return *this;
2660}
2661
2662bool Integer::GetBit(unsigned int n) const
2663{
2664if (n/WORD_BITS >= reg.size())
2665return 0;
2666else
2667return bool((reg[n/WORD_BITS] >> (n % WORD_BITS)) & 1);
2668}
2669
2670void Integer::SetBit(unsigned int n, bool value)
2671{
2672if (value)
2673{
2674reg.CleanGrow(RoundupSize(BitsToWords(n+1)));
2675reg[n/WORD_BITS] |= (word(1) << (n%WORD_BITS));
2676}
2677else
2678{
2679if (n/WORD_BITS < reg.size())
2680reg[n/WORD_BITS] &= ~(word(1) << (n%WORD_BITS));
2681}
2682}
2683
2684byte Integer::GetByte(unsigned int n) const
2685{
2686if (n/WORD_SIZE >= reg.size())
2687return 0;
2688else
2689return byte(reg[n/WORD_SIZE] >> ((n%WORD_SIZE)*8));
2690}
2691
2692void Integer::SetByte(unsigned int n, byte value)
2693{
2694reg.CleanGrow(RoundupSize(BytesToWords(n+1)));
2695reg[n/WORD_SIZE] &= ~(word(0xff) << 8*(n%WORD_SIZE));
2696reg[n/WORD_SIZE] |= (word(value) << 8*(n%WORD_SIZE));
2697}
2698
2699unsigned long Integer::GetBits(unsigned int i, unsigned int n) const
2700{
2701assert(n <= sizeof(unsigned long)*8);
2702unsigned long v = 0;
2703for (unsigned int j=0; j<n; j++)
2704v |= GetBit(i+j) << j;
2705return v;
2706}
2707
2708Integer Integer::operator-() const
2709{
2710Integer result(*this);
2711result.Negate();
2712return result;
2713}
2714
2715Integer Integer::AbsoluteValue() const
2716{
2717Integer result(*this);
2718result.sign = POSITIVE;
2719return result;
2720}
2721
2722void Integer::swap(Integer &a)
2723{
2724reg.swap(a.reg);
2725std::swap(sign, a.sign);
2726}
2727
2728Integer::Integer(word value, unsigned int length)
2729: reg(RoundupSize(length)), sign(POSITIVE)
2730{
2731reg[0] = value;
2732SetWords(reg+1, 0, reg.size()-1);
2733}
2734
2735template <class T>
2736static Integer StringToInteger(const T *str)
2737{
2738word radix;
2739#if (defined(__GNUC__) && __GNUC__ <= 3)// GCC workaround
2740// std::char_traits doesn't exist in GCC 2.x
2741// std::char_traits<wchar_t>::length() not defined in GCC 3.2
2742unsigned int length;
2743for (length = 0; str[length] != 0; length++) {}
2744#else
2745unsigned int length = std::char_traits<T>::length(str);
2746#endif
2747
2748Integer v;
2749
2750if (length == 0)
2751return v;
2752
2753switch (str[length-1])
2754{
2755case 'h':
2756case 'H':
2757radix=16;
2758break;
2759case 'o':
2760case 'O':
2761radix=8;
2762break;
2763case 'b':
2764case 'B':
2765radix=2;
2766break;
2767default:
2768radix=10;
2769}
2770
2771if (length > 2 && str[0] == '0' && str[1] == 'x')
2772radix = 16;
2773
2774for (unsigned i=0; i<length; i++)
2775{
2776word digit;
2777
2778if (str[i] >= '0' && str[i] <= '9')
2779digit = str[i] - '0';
2780else if (str[i] >= 'A' && str[i] <= 'F')
2781digit = str[i] - 'A' + 10;
2782else if (str[i] >= 'a' && str[i] <= 'f')
2783digit = str[i] - 'a' + 10;
2784else
2785digit = radix;
2786
2787if (digit < radix)
2788{
2789v *= radix;
2790v += digit;
2791}
2792}
2793
2794if (str[0] == '-')
2795v.Negate();
2796
2797return v;
2798}
2799
2800Integer::Integer(const char *str)
2801: reg(2), sign(POSITIVE)
2802{
2803*this = StringToInteger(str);
2804}
2805
2806Integer::Integer(const wchar_t *str)
2807: reg(2), sign(POSITIVE)
2808{
2809*this = StringToInteger(str);
2810}
2811
2812unsigned int Integer::WordCount() const
2813{
2814return CountWords(reg, reg.size());
2815}
2816
2817unsigned int Integer::ByteCount() const
2818{
2819unsigned wordCount = WordCount();
2820if (wordCount)
2821return (wordCount-1)*WORD_SIZE + BytePrecision(reg[wordCount-1]);
2822else
2823return 0;
2824}
2825
2826unsigned int Integer::BitCount() const
2827{
2828unsigned wordCount = WordCount();
2829if (wordCount)
2830return (wordCount-1)*WORD_BITS + BitPrecision(reg[wordCount-1]);
2831else
2832return 0;
2833}
2834
2835void Integer::Decode(const byte *input, unsigned int inputLen, Signedness s)
2836{
2837StringStore store(input, inputLen);
2838Decode(store, inputLen, s);
2839}
2840
2841void Integer::Decode(BufferedTransformation &bt, unsigned int inputLen, Signedness s)
2842{
2843assert(bt.MaxRetrievable() >= inputLen);
2844
2845byte b;
2846bt.Peek(b);
2847sign = ((s==SIGNED) && (b & 0x80)) ? NEGATIVE : POSITIVE;
2848
2849while (inputLen>0 && (sign==POSITIVE ? b==0 : b==0xff))
2850{
2851bt.Skip(1);
2852inputLen--;
2853bt.Peek(b);
2854}
2855
2856reg.CleanNew(RoundupSize(BytesToWords(inputLen)));
2857
2858for (unsigned int i=inputLen; i > 0; i--)
2859{
2860bt.Get(b);
2861reg[(i-1)/WORD_SIZE] |= b << ((i-1)%WORD_SIZE)*8;
2862}
2863
2864if (sign == NEGATIVE)
2865{
2866for (unsigned i=inputLen; i<reg.size()*WORD_SIZE; i++)
2867reg[i/WORD_SIZE] |= 0xff << (i%WORD_SIZE)*8;
2868TwosComplement(reg, reg.size());
2869}
2870}
2871
2872unsigned int Integer::MinEncodedSize(Signedness signedness) const
2873{
2874unsigned int outputLen = STDMAX(1U, ByteCount());
2875if (signedness == UNSIGNED)
2876return outputLen;
2877if (NotNegative() && (GetByte(outputLen-1) & 0x80))
2878outputLen++;
2879if (IsNegative() && *this < -Power2(outputLen*8-1))
2880outputLen++;
2881return outputLen;
2882}
2883
2884unsigned int Integer::Encode(byte *output, unsigned int outputLen, Signedness signedness) const
2885{
2886ArraySink sink(output, outputLen);
2887return Encode(sink, outputLen, signedness);
2888}
2889
2890unsigned int Integer::Encode(BufferedTransformation &bt, unsigned int outputLen, Signedness signedness) const
2891{
2892if (signedness == UNSIGNED || NotNegative())
2893{
2894for (unsigned int i=outputLen; i > 0; i--)
2895bt.Put(GetByte(i-1));
2896}
2897else
2898{
2899// take two's complement of *this
2900Integer temp = Integer::Power2(8*STDMAX(ByteCount(), outputLen)) + *this;
2901for (unsigned i=0; i<outputLen; i++)
2902bt.Put(temp.GetByte(outputLen-i-1));
2903}
2904return outputLen;
2905}
2906
2907void Integer::DEREncode(BufferedTransformation &bt) const
2908{
2909DERGeneralEncoder enc(bt, INTEGER);
2910Encode(enc, MinEncodedSize(SIGNED), SIGNED);
2911enc.MessageEnd();
2912}
2913
2914void Integer::BERDecode(const byte *input, unsigned int len)
2915{
2916StringStore store(input, len);
2917BERDecode(store);
2918}
2919
2920void Integer::BERDecode(BufferedTransformation &bt)
2921{
2922BERGeneralDecoder dec(bt, INTEGER);
2923if (!dec.IsDefiniteLength() || dec.MaxRetrievable() < dec.RemainingLength())
2924BERDecodeError();
2925Decode(dec, dec.RemainingLength(), SIGNED);
2926dec.MessageEnd();
2927}
2928
2929void Integer::DEREncodeAsOctetString(BufferedTransformation &bt, unsigned int length) const
2930{
2931DERGeneralEncoder enc(bt, OCTET_STRING);
2932Encode(enc, length);
2933enc.MessageEnd();
2934}
2935
2936void Integer::BERDecodeAsOctetString(BufferedTransformation &bt, unsigned int length)
2937{
2938BERGeneralDecoder dec(bt, OCTET_STRING);
2939if (!dec.IsDefiniteLength() || dec.RemainingLength() != length)
2940BERDecodeError();
2941Decode(dec, length);
2942dec.MessageEnd();
2943}
2944
2945unsigned int Integer::OpenPGPEncode(byte *output, unsigned int len) const
2946{
2947ArraySink sink(output, len);
2948return OpenPGPEncode(sink);
2949}
2950
2951unsigned int Integer::OpenPGPEncode(BufferedTransformation &bt) const
2952{
2953word16 bitCount = BitCount();
2954bt.PutWord16(bitCount);
2955return 2 + Encode(bt, BitsToBytes(bitCount));
2956}
2957
2958void Integer::OpenPGPDecode(const byte *input, unsigned int len)
2959{
2960StringStore store(input, len);
2961OpenPGPDecode(store);
2962}
2963
2964void Integer::OpenPGPDecode(BufferedTransformation &bt)
2965{
2966word16 bitCount;
2967if (bt.GetWord16(bitCount) != 2 || bt.MaxRetrievable() < BitsToBytes(bitCount))
2968throw OpenPGPDecodeErr();
2969Decode(bt, BitsToBytes(bitCount));
2970}
2971
2972void Integer::Randomize(RandomNumberGenerator &rng, unsigned int nbits)
2973{
2974const unsigned int nbytes = nbits/8 + 1;
2975SecByteBlock buf(nbytes);
2976rng.GenerateBlock(buf, nbytes);
2977if (nbytes)
2978buf[0] = (byte)Crop(buf[0], nbits % 8);
2979Decode(buf, nbytes, UNSIGNED);
2980}
2981
2982void Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max)
2983{
2984if (min > max)
2985throw InvalidArgument("Integer: Min must be no greater than Max");
2986
2987Integer range = max - min;
2988const unsigned int nbits = range.BitCount();
2989
2990do
2991{
2992Randomize(rng, nbits);
2993}
2994while (*this > range);
2995
2996*this += min;
2997}
2998
2999bool Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod)
3000{
3001return GenerateRandomNoThrow(rng, MakeParameters("Min", min)("Max", max)("RandomNumberType", rnType)("EquivalentTo", equiv)("Mod", mod));
3002}
3003
3004class KDF2_RNG : public RandomNumberGenerator
3005{
3006public:
3007KDF2_RNG(const byte *seed, unsigned int seedSize)
3008: m_counter(0), m_counterAndSeed(seedSize + 4)
3009{
3010memcpy(m_counterAndSeed + 4, seed, seedSize);
3011}
3012
3013byte GenerateByte()
3014{
3015byte b;
3016GenerateBlock(&b, 1);
3017return b;
3018}
3019
3020void GenerateBlock(byte *output, unsigned int size)
3021{
3022UnalignedPutWord(BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter);
3023++m_counter;
3024P1363_KDF2<SHA1>::DeriveKey(output, size, m_counterAndSeed, m_counterAndSeed.size());
3025}
3026
3027private:
3028word32 m_counter;
3029SecByteBlock m_counterAndSeed;
3030};
3031
3032bool Integer::GenerateRandomNoThrow(RandomNumberGenerator &i_rng, const NameValuePairs &params)
3033{
3034Integer min = params.GetValueWithDefault("Min", Integer::Zero());
3035Integer max;
3036if (!params.GetValue("Max", max))
3037{
3038int bitLength;
3039if (params.GetIntValue("BitLength", bitLength))
3040max = Integer::Power2(bitLength);
3041else
3042throw InvalidArgument("Integer: missing Max argument");
3043}
3044if (min > max)
3045throw InvalidArgument("Integer: Min must be no greater than Max");
3046
3047Integer equiv = params.GetValueWithDefault("EquivalentTo", Integer::Zero());
3048Integer mod = params.GetValueWithDefault("Mod", Integer::One());
3049
3050if (equiv.IsNegative() || equiv >= mod)
3051throw InvalidArgument("Integer: invalid EquivalentTo and/or Mod argument");
3052
3053Integer::RandomNumberType rnType = params.GetValueWithDefault("RandomNumberType", Integer::ANY);
3054
3055member_ptr<KDF2_RNG> kdf2Rng;
3056ConstByteArrayParameter seed;
3057if (params.GetValue("Seed", seed))
3058{
3059ByteQueue bq;
3060DERSequenceEncoder seq(bq);
3061min.DEREncode(seq);
3062max.DEREncode(seq);
3063equiv.DEREncode(seq);
3064mod.DEREncode(seq);
3065DEREncodeUnsigned(seq, rnType);
3066DEREncodeOctetString(seq, seed.begin(), seed.size());
3067seq.MessageEnd();
3068
3069SecByteBlock finalSeed(bq.MaxRetrievable());
3070bq.Get(finalSeed, finalSeed.size());
3071kdf2Rng.reset(new KDF2_RNG(finalSeed.begin(), finalSeed.size()));
3072}
3073RandomNumberGenerator &rng = kdf2Rng.get() ? (RandomNumberGenerator &)*kdf2Rng : i_rng;
3074
3075switch (rnType)
3076{
3077case ANY:
3078if (mod == One())
3079Randomize(rng, min, max);
3080else
3081{
3082Integer min1 = min + (equiv-min)%mod;
3083if (max < min1)
3084return false;
3085Randomize(rng, Zero(), (max - min1) / mod);
3086*this *= mod;
3087*this += min1;
3088}
3089return true;
3090
3091case PRIME:
3092{
3093const PrimeSelector *pSelector = params.GetValueWithDefault("PointerToPrimeSelector", (const PrimeSelector *)NULL);
3094
3095int i;
3096i = 0;
3097while (1)
3098{
3099if (++i==16)
3100{
3101// check if there are any suitable primes in [min, max]
3102Integer first = min;
3103if (FirstPrime(first, max, equiv, mod, pSelector))
3104{
3105// if there is only one suitable prime, we're done
3106*this = first;
3107if (!FirstPrime(first, max, equiv, mod, pSelector))
3108return true;
3109}
3110else
3111return false;
3112}
3113
3114Randomize(rng, min, max);
3115if (FirstPrime(*this, STDMIN(*this+mod*PrimeSearchInterval(max), max), equiv, mod, pSelector))
3116return true;
3117}
3118}
3119
3120default:
3121throw InvalidArgument("Integer: invalid RandomNumberType argument");
3122}
3123}
3124
3125std::istream& operator>>(std::istream& in, Integer &a)
3126{
3127char c;
3128unsigned int length = 0;
3129SecBlock<char> str(length + 16);
3130
3131std::ws(in);
3132
3133do
3134{
3135in.read(&c, 1);
3136str[length++] = c;
3137if (length >= str.size())
3138str.Grow(length + 16);
3139}
3140while (in && (c=='-' || c=='x' || (c>='0' && c<='9') || (c>='a' && c<='f') || (c>='A' && c<='F') || c=='h' || c=='H' || c=='o' || c=='O' || c==',' || c=='.'));
3141
3142if (in.gcount())
3143in.putback(c);
3144str[length-1] = '\0';
3145a = Integer(str);
3146
3147return in;
3148}
3149
3150std::ostream& operator<<(std::ostream& out, const Integer &a)
3151{
3152// Get relevant conversion specifications from ostream.
3153long f = out.flags() & std::ios::basefield; // Get base digits.
3154int base, block;
3155char suffix;
3156switch(f)
3157{
3158case std::ios::oct :
3159base = 8;
3160block = 8;
3161suffix = 'o';
3162break;
3163case std::ios::hex :
3164base = 16;
3165block = 4;
3166suffix = 'h';
3167break;
3168default :
3169base = 10;
3170block = 3;
3171suffix = '.';
3172}
3173
3174SecBlock<char> s(a.BitCount() / (BitPrecision(base)-1) + 1);
3175Integer temp1=a, temp2;
3176unsigned i=0;
3177const char vec[]="0123456789ABCDEF";
3178
3179if (a.IsNegative())
3180{
3181out << '-';
3182temp1.Negate();
3183}
3184
3185if (!a)
3186out << '0';
3187
3188while (!!temp1)
3189{
3190word digit;
3191Integer::Divide(digit, temp2, temp1, base);
3192s[i++]=vec[digit];
3193temp1=temp2;
3194}
3195
3196while (i--)
3197{
3198out << s[i];
3199//if (i && !(i%block))
3200//out << ",";
3201}
3202return out << suffix;
3203}
3204
3205Integer& Integer::operator++()
3206{
3207if (NotNegative())
3208{
3209if (Increment(reg, reg.size()))
3210{
3211reg.CleanGrow(2*reg.size());
3212reg[reg.size()/2]=1;
3213}
3214}
3215else
3216{
3217word borrow = Decrement(reg, reg.size());
3218assert(!borrow);
3219if (WordCount()==0)
3220*this = Zero();
3221}
3222return *this;
3223}
3224
3225Integer& Integer::operator--()
3226{
3227if (IsNegative())
3228{
3229if (Increment(reg, reg.size()))
3230{
3231reg.CleanGrow(2*reg.size());
3232reg[reg.size()/2]=1;
3233}
3234}
3235else
3236{
3237if (Decrement(reg, reg.size()))
3238*this = -One();
3239}
3240return *this;
3241}
3242
3243void PositiveAdd(Integer &sum, const Integer &a, const Integer& b)
3244{
3245word carry;
3246if (a.reg.size() == b.reg.size())
3247carry = Add(sum.reg, a.reg, b.reg, a.reg.size());
3248else if (a.reg.size() > b.reg.size())
3249{
3250carry = Add(sum.reg, a.reg, b.reg, b.reg.size());
3251CopyWords(sum.reg+b.reg.size(), a.reg+b.reg.size(), a.reg.size()-b.reg.size());
3252carry = Increment(sum.reg+b.reg.size(), a.reg.size()-b.reg.size(), carry);
3253}
3254else
3255{
3256carry = Add(sum.reg, a.reg, b.reg, a.reg.size());
3257CopyWords(sum.reg+a.reg.size(), b.reg+a.reg.size(), b.reg.size()-a.reg.size());
3258carry = Increment(sum.reg+a.reg.size(), b.reg.size()-a.reg.size(), carry);
3259}
3260
3261if (carry)
3262{
3263sum.reg.CleanGrow(2*sum.reg.size());
3264sum.reg[sum.reg.size()/2] = 1;
3265}
3266sum.sign = Integer::POSITIVE;
3267}
3268
3269void PositiveSubtract(Integer &diff, const Integer &a, const Integer& b)
3270{
3271unsigned aSize = a.WordCount();
3272aSize += aSize%2;
3273unsigned bSize = b.WordCount();
3274bSize += bSize%2;
3275
3276if (aSize == bSize)
3277{
3278if (Compare(a.reg, b.reg, aSize) >= 0)
3279{
3280Subtract(diff.reg, a.reg, b.reg, aSize);
3281diff.sign = Integer::POSITIVE;
3282}
3283else
3284{
3285Subtract(diff.reg, b.reg, a.reg, aSize);
3286diff.sign = Integer::NEGATIVE;
3287}
3288}
3289else if (aSize > bSize)
3290{
3291word borrow = Subtract(diff.reg, a.reg, b.reg, bSize);
3292CopyWords(diff.reg+bSize, a.reg+bSize, aSize-bSize);
3293borrow = Decrement(diff.reg+bSize, aSize-bSize, borrow);
3294assert(!borrow);
3295diff.sign = Integer::POSITIVE;
3296}
3297else
3298{
3299word borrow = Subtract(diff.reg, b.reg, a.reg, aSize);
3300CopyWords(diff.reg+aSize, b.reg+aSize, bSize-aSize);
3301borrow = Decrement(diff.reg+aSize, bSize-aSize, borrow);
3302assert(!borrow);
3303diff.sign = Integer::NEGATIVE;
3304}
3305}
3306
3307Integer Integer::Plus(const Integer& b) const
3308{
3309Integer sum((word)0, STDMAX(reg.size(), b.reg.size()));
3310if (NotNegative())
3311{
3312if (b.NotNegative())
3313PositiveAdd(sum, *this, b);
3314else
3315PositiveSubtract(sum, *this, b);
3316}
3317else
3318{
3319if (b.NotNegative())
3320PositiveSubtract(sum, b, *this);
3321else
3322{
3323PositiveAdd(sum, *this, b);
3324sum.sign = Integer::NEGATIVE;
3325}
3326}
3327return sum;
3328}
3329
3330Integer& Integer::operator+=(const Integer& t)
3331{
3332reg.CleanGrow(t.reg.size());
3333if (NotNegative())
3334{
3335if (t.NotNegative())
3336PositiveAdd(*this, *this, t);
3337else
3338PositiveSubtract(*this, *this, t);
3339}
3340else
3341{
3342if (t.NotNegative())
3343PositiveSubtract(*this, t, *this);
3344else
3345{
3346PositiveAdd(*this, *this, t);
3347sign = Integer::NEGATIVE;
3348}
3349}
3350return *this;
3351}
3352
3353Integer Integer::Minus(const Integer& b) const
3354{
3355Integer diff((word)0, STDMAX(reg.size(), b.reg.size()));
3356if (NotNegative())
3357{
3358if (b.NotNegative())
3359PositiveSubtract(diff, *this, b);
3360else
3361PositiveAdd(diff, *this, b);
3362}
3363else
3364{
3365if (b.NotNegative())
3366{
3367PositiveAdd(diff, *this, b);
3368diff.sign = Integer::NEGATIVE;
3369}
3370else
3371PositiveSubtract(diff, b, *this);
3372}
3373return diff;
3374}
3375
3376Integer& Integer::operator-=(const Integer& t)
3377{
3378reg.CleanGrow(t.reg.size());
3379if (NotNegative())
3380{
3381if (t.NotNegative())
3382PositiveSubtract(*this, *this, t);
3383else
3384PositiveAdd(*this, *this, t);
3385}
3386else
3387{
3388if (t.NotNegative())
3389{
3390PositiveAdd(*this, *this, t);
3391sign = Integer::NEGATIVE;
3392}
3393else
3394PositiveSubtract(*this, t, *this);
3395}
3396return *this;
3397}
3398
3399Integer& Integer::operator<<=(unsigned int n)
3400{
3401const unsigned int wordCount = WordCount();
3402const unsigned int shiftWords = n / WORD_BITS;
3403const unsigned int shiftBits = n % WORD_BITS;
3404
3405reg.CleanGrow(RoundupSize(wordCount+BitsToWords(n)));
3406ShiftWordsLeftByWords(reg, wordCount + shiftWords, shiftWords);
3407ShiftWordsLeftByBits(reg+shiftWords, wordCount+BitsToWords(shiftBits), shiftBits);
3408return *this;
3409}
3410
3411Integer& Integer::operator>>=(unsigned int n)
3412{
3413const unsigned int wordCount = WordCount();
3414const unsigned int shiftWords = n / WORD_BITS;
3415const unsigned int shiftBits = n % WORD_BITS;
3416
3417ShiftWordsRightByWords(reg, wordCount, shiftWords);
3418if (wordCount > shiftWords)
3419ShiftWordsRightByBits(reg, wordCount-shiftWords, shiftBits);
3420if (IsNegative() && WordCount()==0) // avoid -0
3421*this = Zero();
3422return *this;
3423}
3424
3425void PositiveMultiply(Integer &product, const Integer &a, const Integer &b)
3426{
3427unsigned aSize = RoundupSize(a.WordCount());
3428unsigned bSize = RoundupSize(b.WordCount());
3429
3430product.reg.CleanNew(RoundupSize(aSize+bSize));
3431product.sign = Integer::POSITIVE;
3432
3433SecAlignedWordBlock workspace(aSize + bSize);
3434AsymmetricMultiply(product.reg, workspace, a.reg, aSize, b.reg, bSize);
3435}
3436
3437void Multiply(Integer &product, const Integer &a, const Integer &b)
3438{
3439PositiveMultiply(product, a, b);
3440
3441if (a.NotNegative() != b.NotNegative())
3442product.Negate();
3443}
3444
3445Integer Integer::Times(const Integer &b) const
3446{
3447Integer product;
3448Multiply(product, *this, b);
3449return product;
3450}
3451
3452/*
3453void PositiveDivide(Integer &remainder, Integer &quotient,
3454 const Integer &dividend, const Integer &divisor)
3455{
3456remainder.reg.CleanNew(divisor.reg.size());
3457remainder.sign = Integer::POSITIVE;
3458quotient.reg.New(0);
3459quotient.sign = Integer::POSITIVE;
3460unsigned i=dividend.BitCount();
3461while (i--)
3462{
3463word overflow = ShiftWordsLeftByBits(remainder.reg, remainder.reg.size(), 1);
3464remainder.reg[0] |= dividend[i];
3465if (overflow || remainder >= divisor)
3466{
3467Subtract(remainder.reg, remainder.reg, divisor.reg, remainder.reg.size());
3468quotient.SetBit(i);
3469}
3470}
3471}
3472*/
3473
3474void PositiveDivide(Integer &remainder, Integer &quotient,
3475 const Integer &a, const Integer &b)
3476{
3477unsigned aSize = a.WordCount();
3478unsigned bSize = b.WordCount();
3479
3480if (!bSize)
3481throw Integer::DivideByZero();
3482
3483if (a.PositiveCompare(b) == -1)
3484{
3485remainder = a;
3486remainder.sign = Integer::POSITIVE;
3487quotient = Integer::Zero();
3488return;
3489}
3490
3491aSize += aSize%2;// round up to next even number
3492bSize += bSize%2;
3493
3494remainder.reg.CleanNew(RoundupSize(bSize));
3495remainder.sign = Integer::POSITIVE;
3496quotient.reg.CleanNew(RoundupSize(aSize-bSize+2));
3497quotient.sign = Integer::POSITIVE;
3498
3499SecAlignedWordBlock T(aSize+2*bSize+4);
3500Divide(remainder.reg, quotient.reg, T, a.reg, aSize, b.reg, bSize);
3501}
3502
3503void Integer::Divide(Integer &remainder, Integer &quotient, const Integer &dividend, const Integer &divisor)
3504{
3505PositiveDivide(remainder, quotient, dividend, divisor);
3506
3507if (dividend.IsNegative())
3508{
3509quotient.Negate();
3510if (remainder.NotZero())
3511{
3512--quotient;
3513remainder = divisor.AbsoluteValue() - remainder;
3514}
3515}
3516
3517if (divisor.IsNegative())
3518quotient.Negate();
3519}
3520
3521void Integer::DivideByPowerOf2(Integer &r, Integer &q, const Integer &a, unsigned int n)
3522{
3523q = a;
3524q >>= n;
3525
3526const unsigned int wordCount = BitsToWords(n);
3527if (wordCount <= a.WordCount())
3528{
3529r.reg.resize(RoundupSize(wordCount));
3530CopyWords(r.reg, a.reg, wordCount);
3531SetWords(r.reg+wordCount, 0, r.reg.size()-wordCount);
3532if (n % WORD_BITS != 0)
3533r.reg[wordCount-1] %= (1 << (n % WORD_BITS));
3534}
3535else
3536{
3537r.reg.resize(RoundupSize(a.WordCount()));
3538CopyWords(r.reg, a.reg, r.reg.size());
3539}
3540r.sign = POSITIVE;
3541
3542if (a.IsNegative() && r.NotZero())
3543{
3544--q;
3545r = Power2(n) - r;
3546}
3547}
3548
3549Integer Integer::DividedBy(const Integer &b) const
3550{
3551Integer remainder, quotient;
3552Integer::Divide(remainder, quotient, *this, b);
3553return quotient;
3554}
3555
3556Integer Integer::Modulo(const Integer &b) const
3557{
3558Integer remainder, quotient;
3559Integer::Divide(remainder, quotient, *this, b);
3560return remainder;
3561}
3562
3563void Integer::Divide(word &remainder, Integer &quotient, const Integer &dividend, word divisor)
3564{
3565if (!divisor)
3566throw Integer::DivideByZero();
3567
3568assert(divisor);
3569
3570if ((divisor & (divisor-1)) == 0)// divisor is a power of 2
3571{
3572quotient = dividend >> (BitPrecision(divisor)-1);
3573remainder = dividend.reg[0] & (divisor-1);
3574return;
3575}
3576
3577unsigned int i = dividend.WordCount();
3578quotient.reg.CleanNew(RoundupSize(i));
3579remainder = 0;
3580while (i--)
3581{
3582quotient.reg[i] = word(MAKE_DWORD(dividend.reg[i], remainder) / divisor);
3583remainder = word(MAKE_DWORD(dividend.reg[i], remainder) % divisor);
3584}
3585
3586if (dividend.NotNegative())
3587quotient.sign = POSITIVE;
3588else
3589{
3590quotient.sign = NEGATIVE;
3591if (remainder)
3592{
3593--quotient;
3594remainder = divisor - remainder;
3595}
3596}
3597}
3598
3599Integer Integer::DividedBy(word b) const
3600{
3601word remainder;
3602Integer quotient;
3603Integer::Divide(remainder, quotient, *this, b);
3604return quotient;
3605}
3606
3607word Integer::Modulo(word divisor) const
3608{
3609if (!divisor)
3610throw Integer::DivideByZero();
3611
3612assert(divisor);
3613
3614word remainder;
3615
3616if ((divisor & (divisor-1)) == 0)// divisor is a power of 2
3617remainder = reg[0] & (divisor-1);
3618else
3619{
3620unsigned int i = WordCount();
3621
3622if (divisor <= 5)
3623{
3624dword sum=0;
3625while (i--)
3626sum += reg[i];
3627remainder = word(sum%divisor);
3628}
3629else
3630{
3631remainder = 0;
3632while (i--)
3633remainder = word(MAKE_DWORD(reg[i], remainder) % divisor);
3634}
3635}
3636
3637if (IsNegative() && remainder)
3638remainder = divisor - remainder;
3639
3640return remainder;
3641}
3642
3643void Integer::Negate()
3644{
3645if (!!(*this))// don't flip sign if *this==0
3646sign = Sign(1-sign);
3647}
3648
3649int Integer::PositiveCompare(const Integer& t) const
3650{
3651unsigned size = WordCount(), tSize = t.WordCount();
3652
3653if (size == tSize)
3654return CryptoPP::Compare(reg, t.reg, size);
3655else
3656return size > tSize ? 1 : -1;
3657}
3658
3659int Integer::Compare(const Integer& t) const
3660{
3661if (NotNegative())
3662{
3663if (t.NotNegative())
3664return PositiveCompare(t);
3665else
3666return 1;
3667}
3668else
3669{
3670if (t.NotNegative())
3671return -1;
3672else
3673return -PositiveCompare(t);
3674}
3675}
3676
3677Integer Integer::SquareRoot() const
3678{
3679if (!IsPositive())
3680return Zero();
3681
3682// overestimate square root
3683Integer x, y = Power2((BitCount()+1)/2);
3684assert(y*y >= *this);
3685
3686do
3687{
3688x = y;
3689y = (x + *this/x) >> 1;
3690} while (y<x);
3691
3692return x;
3693}
3694
3695bool Integer::IsSquare() const
3696{
3697Integer r = SquareRoot();
3698return *this == r.Squared();
3699}
3700
3701bool Integer::IsUnit() const
3702{
3703return (WordCount() == 1) && (reg[0] == 1);
3704}
3705
3706Integer Integer::MultiplicativeInverse() const
3707{
3708return IsUnit() ? *this : Zero();
3709}
3710
3711Integer a_times_b_mod_c(const Integer &x, const Integer& y, const Integer& m)
3712{
3713return x*y%m;
3714}
3715
3716Integer a_exp_b_mod_c(const Integer &x, const Integer& e, const Integer& m)
3717{
3718ModularArithmetic mr(m);
3719return mr.Exponentiate(x, e);
3720}
3721
3722Integer Integer::Gcd(const Integer &a, const Integer &b)
3723{
3724return EuclideanDomainOf<Integer>().Gcd(a, b);
3725}
3726
3727Integer Integer::InverseMod(const Integer &m) const
3728{
3729assert(m.NotNegative());
3730
3731if (IsNegative() || *this>=m)
3732return (*this%m).InverseMod(m);
3733
3734if (m.IsEven())
3735{
3736if (!m || IsEven())
3737return Zero();// no inverse
3738if (*this == One())
3739return One();
3740
3741Integer u = m.InverseMod(*this);
3742return !u ? Zero() : (m*(*this-u)+1)/(*this);
3743}
3744
3745SecBlock<word> T(m.reg.size() * 4);
3746Integer r((word)0, m.reg.size());
3747unsigned k = AlmostInverse(r.reg, T, reg, reg.size(), m.reg, m.reg.size());
3748DivideByPower2Mod(r.reg, r.reg, k, m.reg, m.reg.size());
3749return r;
3750}
3751
3752word Integer::InverseMod(const word mod) const
3753{
3754word g0 = mod, g1 = *this % mod;
3755word v0 = 0, v1 = 1;
3756word y;
3757
3758while (g1)
3759{
3760if (g1 == 1)
3761return v1;
3762y = g0 / g1;
3763g0 = g0 % g1;
3764v0 += y * v1;
3765
3766if (!g0)
3767break;
3768if (g0 == 1)
3769return mod-v0;
3770y = g1 / g0;
3771g1 = g1 % g0;
3772v1 += y * v0;
3773}
3774return 0;
3775}
3776
3777// ********************************************************
3778
3779ModularArithmetic::ModularArithmetic(BufferedTransformation &bt)
3780{
3781BERSequenceDecoder seq(bt);
3782OID oid(seq);
3783if (oid != ASN1::prime_field())
3784BERDecodeError();
3785modulus.BERDecode(seq);
3786seq.MessageEnd();
3787result.reg.resize(modulus.reg.size());
3788}
3789
3790void ModularArithmetic::DEREncode(BufferedTransformation &bt) const
3791{
3792DERSequenceEncoder seq(bt);
3793ASN1::prime_field().DEREncode(seq);
3794modulus.DEREncode(seq);
3795seq.MessageEnd();
3796}
3797
3798void ModularArithmetic::DEREncodeElement(BufferedTransformation &out, const Element &a) const
3799{
3800a.DEREncodeAsOctetString(out, MaxElementByteLength());
3801}
3802
3803void ModularArithmetic::BERDecodeElement(BufferedTransformation &in, Element &a) const
3804{
3805a.BERDecodeAsOctetString(in, MaxElementByteLength());
3806}
3807
3808const Integer& ModularArithmetic::Half(const Integer &a) const
3809{
3810if (a.reg.size()==modulus.reg.size())
3811{
3812CryptoPP::DivideByPower2Mod(result.reg.begin(), a.reg, 1, modulus.reg, a.reg.size());
3813return result;
3814}
3815else
3816return result1 = (a.IsEven() ? (a >> 1) : ((a+modulus) >> 1));
3817}
3818
3819const Integer& ModularArithmetic::Add(const Integer &a, const Integer &b) const
3820{
3821if (a.reg.size()==modulus.reg.size() && b.reg.size()==modulus.reg.size())
3822{
3823if (CryptoPP::Add(result.reg.begin(), a.reg, b.reg, a.reg.size())
3824|| Compare(result.reg, modulus.reg, a.reg.size()) >= 0)
3825{
3826CryptoPP::Subtract(result.reg.begin(), result.reg, modulus.reg, a.reg.size());
3827}
3828return result;
3829}
3830else
3831{
3832result1 = a+b;
3833if (result1 >= modulus)
3834result1 -= modulus;
3835return result1;
3836}
3837}
3838
3839Integer& ModularArithmetic::Accumulate(Integer &a, const Integer &b) const
3840{
3841if (a.reg.size()==modulus.reg.size() && b.reg.size()==modulus.reg.size())
3842{
3843if (CryptoPP::Add(a.reg, a.reg, b.reg, a.reg.size())
3844|| Compare(a.reg, modulus.reg, a.reg.size()) >= 0)
3845{
3846CryptoPP::Subtract(a.reg, a.reg, modulus.reg, a.reg.size());
3847}
3848}
3849else
3850{
3851a+=b;
3852if (a>=modulus)
3853a-=modulus;
3854}
3855
3856return a;
3857}
3858
3859const Integer& ModularArithmetic::Subtract(const Integer &a, const Integer &b) const
3860{
3861if (a.reg.size()==modulus.reg.size() && b.reg.size()==modulus.reg.size())
3862{
3863if (CryptoPP::Subtract(result.reg.begin(), a.reg, b.reg, a.reg.size()))
3864CryptoPP::Add(result.reg.begin(), result.reg, modulus.reg, a.reg.size());
3865return result;
3866}
3867else
3868{
3869result1 = a-b;
3870if (result1.IsNegative())
3871result1 += modulus;
3872return result1;
3873}
3874}
3875
3876Integer& ModularArithmetic::Reduce(Integer &a, const Integer &b) const
3877{
3878if (a.reg.size()==modulus.reg.size() && b.reg.size()==modulus.reg.size())
3879{
3880if (CryptoPP::Subtract(a.reg, a.reg, b.reg, a.reg.size()))
3881CryptoPP::Add(a.reg, a.reg, modulus.reg, a.reg.size());
3882}
3883else
3884{
3885a-=b;
3886if (a.IsNegative())
3887a+=modulus;
3888}
3889
3890return a;
3891}
3892
3893const Integer& ModularArithmetic::Inverse(const Integer &a) const
3894{
3895if (!a)
3896return a;
3897
3898CopyWords(result.reg.begin(), modulus.reg, modulus.reg.size());
3899if (CryptoPP::Subtract(result.reg.begin(), result.reg, a.reg, a.reg.size()))
3900Decrement(result.reg.begin()+a.reg.size(), 1, modulus.reg.size()-a.reg.size());
3901
3902return result;
3903}
3904
3905Integer ModularArithmetic::CascadeExponentiate(const Integer &x, const Integer &e1, const Integer &y, const Integer &e2) const
3906{
3907if (modulus.IsOdd())
3908{
3909MontgomeryRepresentation dr(modulus);
3910return dr.ConvertOut(dr.CascadeExponentiate(dr.ConvertIn(x), e1, dr.ConvertIn(y), e2));
3911}
3912else
3913return AbstractRing<Integer>::CascadeExponentiate(x, e1, y, e2);
3914}
3915
3916void ModularArithmetic::SimultaneousExponentiate(Integer *results, const Integer &base, const Integer *exponents, unsigned int exponentsCount) const
3917{
3918if (modulus.IsOdd())
3919{
3920MontgomeryRepresentation dr(modulus);
3921dr.SimultaneousExponentiate(results, dr.ConvertIn(base), exponents, exponentsCount);
3922for (unsigned int i=0; i<exponentsCount; i++)
3923results[i] = dr.ConvertOut(results[i]);
3924}
3925else
3926AbstractRing<Integer>::SimultaneousExponentiate(results, base, exponents, exponentsCount);
3927}
3928
3929MontgomeryRepresentation::MontgomeryRepresentation(const Integer &m)// modulus must be odd
3930: ModularArithmetic(m),
3931 u((word)0, modulus.reg.size()),
3932 workspace(5*modulus.reg.size())
3933{
3934if (!modulus.IsOdd())
3935throw InvalidArgument("MontgomeryRepresentation: Montgomery representation requires an odd modulus");
3936
3937RecursiveInverseModPower2(u.reg, workspace, modulus.reg, modulus.reg.size());
3938}
3939
3940const Integer& MontgomeryRepresentation::Multiply(const Integer &a, const Integer &b) const
3941{
3942word *const T = workspace.begin();
3943word *const R = result.reg.begin();
3944const unsigned int N = modulus.reg.size();
3945assert(a.reg.size()<=N && b.reg.size()<=N);
3946
3947AsymmetricMultiply(T, T+2*N, a.reg, a.reg.size(), b.reg, b.reg.size());
3948SetWords(T+a.reg.size()+b.reg.size(), 0, 2*N-a.reg.size()-b.reg.size());
3949MontgomeryReduce(R, T+2*N, T, modulus.reg, u.reg, N);
3950return result;
3951}
3952
3953const Integer& MontgomeryRepresentation::Square(const Integer &a) const
3954{
3955word *const T = workspace.begin();
3956word *const R = result.reg.begin();
3957const unsigned int N = modulus.reg.size();
3958assert(a.reg.size()<=N);
3959
3960CryptoPP::Square(T, T+2*N, a.reg, a.reg.size());
3961SetWords(T+2*a.reg.size(), 0, 2*N-2*a.reg.size());
3962MontgomeryReduce(R, T+2*N, T, modulus.reg, u.reg, N);
3963return result;
3964}
3965
3966Integer MontgomeryRepresentation::ConvertOut(const Integer &a) const
3967{
3968word *const T = workspace.begin();
3969word *const R = result.reg.begin();
3970const unsigned int N = modulus.reg.size();
3971assert(a.reg.size()<=N);
3972
3973CopyWords(T, a.reg, a.reg.size());
3974SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
3975MontgomeryReduce(R, T+2*N, T, modulus.reg, u.reg, N);
3976return result;
3977}
3978
3979const Integer& MontgomeryRepresentation::MultiplicativeInverse(const Integer &a) const
3980{
3981// return (EuclideanMultiplicativeInverse(a, modulus)<<(2*WORD_BITS*modulus.reg.size()))%modulus;
3982word *const T = workspace.begin();
3983word *const R = result.reg.begin();
3984const unsigned int N = modulus.reg.size();
3985assert(a.reg.size()<=N);
3986
3987CopyWords(T, a.reg, a.reg.size());
3988SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
3989MontgomeryReduce(R, T+2*N, T, modulus.reg, u.reg, N);
3990unsigned k = AlmostInverse(R, T, R, N, modulus.reg, N);
3991
3992//cout << "k=" << k << " N*32=" << 32*N << endl;
3993
3994if (k>N*WORD_BITS)
3995DivideByPower2Mod(R, R, k-N*WORD_BITS, modulus.reg, N);
3996else
3997MultiplyByPower2Mod(R, R, N*WORD_BITS-k, modulus.reg, N);
3998
3999return result;
4000}
4001
4002template class AbstractRing<Integer>;
4003
4004NAMESPACE_END

Archive Download this file

Branches

Tags