--- /dev/null
+// Copyright 2017 Google Inc.\r
+//\r
+// Licensed under the Apache License, Version 2.0 (the "License");\r
+// you may not use this file except in compliance with the License.\r
+// You may obtain a copy of the License at\r
+//\r
+// http://www.apache.org/licenses/LICENSE-2.0\r
+//\r
+// Unless required by applicable law or agreed to in writing, software\r
+// distributed under the License is distributed on an "AS IS" BASIS,\r
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r
+// See the License for the specific language governing permissions and\r
+// limitations under the License.\r
+//\r
+////////////////////////////////////////////////////////////////////////////////\r
+\r
+package com.google.crypto.tink.subtle;\r
+\r
+import com.google.crypto.tink.annotations.Alpha;\r
+\r
+import java.util.Arrays;\r
+\r
+/**\r
+ * Defines field 25519 function based on <a\r
+ * href="https://github.com/agl/curve25519-donna/blob/master/curve25519-donna.c">curve25519-donna C\r
+ * implementation</a> (mostly identical).\r
+ *\r
+ * <p>Field elements are written as an array of signed, 64-bit limbs (an array of longs), least\r
+ * significant first. The value of the field element is:\r
+ *\r
+ * <pre>\r
+ * x[0] + 2^26·x[1] + 2^51·x[2] + 2^77·x[3] + 2^102·x[4] + 2^128·x[5] + 2^153·x[6] + 2^179·x[7] +\r
+ * 2^204·x[8] + 2^230·x[9],\r
+ * </pre>\r
+ *\r
+ * <p>i.e. the limbs are 26, 25, 26, 25, ... bits wide.\r
+ */\r
+@Alpha\r
+final class Field25519 {\r
+ /**\r
+ * During Field25519 computation, the mixed radix representation may be in different forms:\r
+ * <ul>\r
+ * <li> Reduced-size form: the array has size at most 10.\r
+ * <li> Non-reduced-size form: the array is not reduced modulo 2^255 - 19 and has size at most\r
+ * 19.\r
+ * </ul>\r
+ * <p>\r
+ * TODO(quannguyen):\r
+ * <ul>\r
+ * <li> Clarify ill-defined terminologies.\r
+ * <li> The reduction procedure is different from DJB's paper\r
+ * (http://cr.yp.to/ecdh/curve25519-20060209.pdf). The coefficients after reducing degree and\r
+ * reducing coefficients aren't guaranteed to be in range {-2^25, ..., 2^25}. We should check to\r
+ * see what's going on.\r
+ * <li> Consider using method mult() everywhere and making product() private.\r
+ * </ul>\r
+ */\r
+\r
+ static final int FIELD_LEN = 32;\r
+ static final int LIMB_CNT = 10;\r
+ private static final long TWO_TO_25 = 1 << 25;\r
+ private static final long TWO_TO_26 = TWO_TO_25 << 1;\r
+\r
+ private static final int[] EXPAND_START = {0, 3, 6, 9, 12, 16, 19, 22, 25, 28};\r
+ private static final int[] EXPAND_SHIFT = {0, 2, 3, 5, 6, 0, 1, 3, 4, 6};\r
+ private static final int[] MASK = {0x3ffffff, 0x1ffffff};\r
+ private static final int[] SHIFT = {26, 25};\r
+\r
+ /**\r
+ * Sums two numbers: output = in1 + in2\r
+ * <p>\r
+ * On entry: in1, in2 are in reduced-size form.\r
+ */\r
+ static void sum(long[] output, long[] in1, long[] in2) {\r
+ for (int i = 0; i < LIMB_CNT; i++) {\r
+ output[i] = in1[i] + in2[i];\r
+ }\r
+ }\r
+\r
+ /**\r
+ * Sums two numbers: output += in\r
+ * <p>\r
+ * On entry: in is in reduced-size form.\r
+ */\r
+ static void sum(long[] output, long[] in) {\r
+ sum(output, output, in);\r
+ }\r
+\r
+ /**\r
+ * Find the difference of two numbers: output = in1 - in2\r
+ * (note the order of the arguments!).\r
+ * <p>\r
+ * On entry: in1, in2 are in reduced-size form.\r
+ */\r
+ static void sub(long[] output, long[] in1, long[] in2) {\r
+ for (int i = 0; i < LIMB_CNT; i++) {\r
+ output[i] = in1[i] - in2[i];\r
+ }\r
+ }\r
+\r
+ /**\r
+ * Find the difference of two numbers: output = in - output\r
+ * (note the order of the arguments!).\r
+ * <p>\r
+ * On entry: in, output are in reduced-size form.\r
+ */\r
+ static void sub(long[] output, long[] in) {\r
+ sub(output, in, output);\r
+ }\r
+\r
+ /**\r
+ * Multiply a number by a scalar: output = in * scalar\r
+ */\r
+ static void scalarProduct(long[] output, long[] in, long scalar) {\r
+ for (int i = 0; i < LIMB_CNT; i++) {\r
+ output[i] = in[i] * scalar;\r
+ }\r
+ }\r
+\r
+ /**\r
+ * Multiply two numbers: out = in2 * in\r
+ * <p>\r
+ * output must be distinct to both inputs. The inputs are reduced coefficient form,\r
+ * the output is not.\r
+ * <p>\r
+ * out[x] <= 14 * the largest product of the input limbs.\r
+ */\r
+ static void product(long[] out, long[] in2, long[] in) {\r
+ out[0] = in2[0] * in[0];\r
+ out[1] = in2[0] * in[1]\r
+ + in2[1] * in[0];\r
+ out[2] = 2 * in2[1] * in[1]\r
+ + in2[0] * in[2]\r
+ + in2[2] * in[0];\r
+ out[3] = in2[1] * in[2]\r
+ + in2[2] * in[1]\r
+ + in2[0] * in[3]\r
+ + in2[3] * in[0];\r
+ out[4] = in2[2] * in[2]\r
+ + 2 * (in2[1] * in[3] + in2[3] * in[1])\r
+ + in2[0] * in[4]\r
+ + in2[4] * in[0];\r
+ out[5] = in2[2] * in[3]\r
+ + in2[3] * in[2]\r
+ + in2[1] * in[4]\r
+ + in2[4] * in[1]\r
+ + in2[0] * in[5]\r
+ + in2[5] * in[0];\r
+ out[6] = 2 * (in2[3] * in[3] + in2[1] * in[5] + in2[5] * in[1])\r
+ + in2[2] * in[4]\r
+ + in2[4] * in[2]\r
+ + in2[0] * in[6]\r
+ + in2[6] * in[0];\r
+ out[7] = in2[3] * in[4]\r
+ + in2[4] * in[3]\r
+ + in2[2] * in[5]\r
+ + in2[5] * in[2]\r
+ + in2[1] * in[6]\r
+ + in2[6] * in[1]\r
+ + in2[0] * in[7]\r
+ + in2[7] * in[0];\r
+ out[8] = in2[4] * in[4]\r
+ + 2 * (in2[3] * in[5] + in2[5] * in[3] + in2[1] * in[7] + in2[7] * in[1])\r
+ + in2[2] * in[6]\r
+ + in2[6] * in[2]\r
+ + in2[0] * in[8]\r
+ + in2[8] * in[0];\r
+ out[9] = in2[4] * in[5]\r
+ + in2[5] * in[4]\r
+ + in2[3] * in[6]\r
+ + in2[6] * in[3]\r
+ + in2[2] * in[7]\r
+ + in2[7] * in[2]\r
+ + in2[1] * in[8]\r
+ + in2[8] * in[1]\r
+ + in2[0] * in[9]\r
+ + in2[9] * in[0];\r
+ out[10] =\r
+ 2 * (in2[5] * in[5] + in2[3] * in[7] + in2[7] * in[3] + in2[1] * in[9] + in2[9] * in[1])\r
+ + in2[4] * in[6]\r
+ + in2[6] * in[4]\r
+ + in2[2] * in[8]\r
+ + in2[8] * in[2];\r
+ out[11] = in2[5] * in[6]\r
+ + in2[6] * in[5]\r
+ + in2[4] * in[7]\r
+ + in2[7] * in[4]\r
+ + in2[3] * in[8]\r
+ + in2[8] * in[3]\r
+ + in2[2] * in[9]\r
+ + in2[9] * in[2];\r
+ out[12] = in2[6] * in[6]\r
+ + 2 * (in2[5] * in[7] + in2[7] * in[5] + in2[3] * in[9] + in2[9] * in[3])\r
+ + in2[4] * in[8]\r
+ + in2[8] * in[4];\r
+ out[13] = in2[6] * in[7]\r
+ + in2[7] * in[6]\r
+ + in2[5] * in[8]\r
+ + in2[8] * in[5]\r
+ + in2[4] * in[9]\r
+ + in2[9] * in[4];\r
+ out[14] = 2 * (in2[7] * in[7] + in2[5] * in[9] + in2[9] * in[5])\r
+ + in2[6] * in[8]\r
+ + in2[8] * in[6];\r
+ out[15] = in2[7] * in[8]\r
+ + in2[8] * in[7]\r
+ + in2[6] * in[9]\r
+ + in2[9] * in[6];\r
+ out[16] = in2[8] * in[8]\r
+ + 2 * (in2[7] * in[9] + in2[9] * in[7]);\r
+ out[17] = in2[8] * in[9]\r
+ + in2[9] * in[8];\r
+ out[18] = 2 * in2[9] * in[9];\r
+ }\r
+\r
+ /**\r
+ * Reduce a long form to a reduced-size form by taking the input mod 2^255 - 19.\r
+ * <p>\r
+ * On entry: |output[i]| < 14*2^54\r
+ * On exit: |output[0..8]| < 280*2^54\r
+ */\r
+ static void reduceSizeByModularReduction(long[] output) {\r
+ // The coefficients x[10], x[11],..., x[18] are eliminated by reduction modulo 2^255 - 19.\r
+ // For example, the coefficient x[18] is multiplied by 19 and added to the coefficient x[8].\r
+ //\r
+ // Each of these shifts and adds ends up multiplying the value by 19.\r
+ //\r
+ // For output[0..8], the absolute entry value is < 14*2^54 and we add, at most, 19*14*2^54 thus,\r
+ // on exit, |output[0..8]| < 280*2^54.\r
+ output[8] += output[18] << 4;\r
+ output[8] += output[18] << 1;\r
+ output[8] += output[18];\r
+ output[7] += output[17] << 4;\r
+ output[7] += output[17] << 1;\r
+ output[7] += output[17];\r
+ output[6] += output[16] << 4;\r
+ output[6] += output[16] << 1;\r
+ output[6] += output[16];\r
+ output[5] += output[15] << 4;\r
+ output[5] += output[15] << 1;\r
+ output[5] += output[15];\r
+ output[4] += output[14] << 4;\r
+ output[4] += output[14] << 1;\r
+ output[4] += output[14];\r
+ output[3] += output[13] << 4;\r
+ output[3] += output[13] << 1;\r
+ output[3] += output[13];\r
+ output[2] += output[12] << 4;\r
+ output[2] += output[12] << 1;\r
+ output[2] += output[12];\r
+ output[1] += output[11] << 4;\r
+ output[1] += output[11] << 1;\r
+ output[1] += output[11];\r
+ output[0] += output[10] << 4;\r
+ output[0] += output[10] << 1;\r
+ output[0] += output[10];\r
+ }\r
+\r
+ /**\r
+ * Reduce all coefficients of the short form input so that |x| < 2^26.\r
+ * <p>\r
+ * On entry: |output[i]| < 280*2^54\r
+ */\r
+ static void reduceCoefficients(long[] output) {\r
+ output[10] = 0;\r
+\r
+ for (int i = 0; i < LIMB_CNT; i += 2) {\r
+ long over = output[i] / TWO_TO_26;\r
+ // The entry condition (that |output[i]| < 280*2^54) means that over is, at most, 280*2^28 in\r
+ // the first iteration of this loop. This is added to the next limb and we can approximate the\r
+ // resulting bound of that limb by 281*2^54.\r
+ output[i] -= over << 26;\r
+ output[i + 1] += over;\r
+\r
+ // For the first iteration, |output[i+1]| < 281*2^54, thus |over| < 281*2^29. When this is\r
+ // added to the next limb, the resulting bound can be approximated as 281*2^54.\r
+ //\r
+ // For subsequent iterations of the loop, 281*2^54 remains a conservative bound and no\r
+ // overflow occurs.\r
+ over = output[i + 1] / TWO_TO_25;\r
+ output[i + 1] -= over << 25;\r
+ output[i + 2] += over;\r
+ }\r
+ // Now |output[10]| < 281*2^29 and all other coefficients are reduced.\r
+ output[0] += output[10] << 4;\r
+ output[0] += output[10] << 1;\r
+ output[0] += output[10];\r
+\r
+ output[10] = 0;\r
+ // Now output[1..9] are reduced, and |output[0]| < 2^26 + 19*281*2^29 so |over| will be no more\r
+ // than 2^16.\r
+ long over = output[0] / TWO_TO_26;\r
+ output[0] -= over << 26;\r
+ output[1] += over;\r
+ // Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 2^16 < 2^26. The bound on\r
+ // |output[1]| is sufficient to meet our needs.\r
+ }\r
+\r
+ /**\r
+ * A helpful wrapper around {@ref Field25519#product}: output = in * in2.\r
+ * <p>\r
+ * On entry: |in[i]| < 2^27 and |in2[i]| < 2^27.\r
+ * <p>\r
+ * The output is reduced degree (indeed, one need only provide storage for 10 limbs) and\r
+ * |output[i]| < 2^26.\r
+ */\r
+ static void mult(long[] output, long[] in, long[] in2) {\r
+ long[] t = new long[19];\r
+ product(t, in, in2);\r
+ // |t[i]| < 14*2^54\r
+ reduceSizeByModularReduction(t);\r
+ reduceCoefficients(t);\r
+ // |t[i]| < 2^26\r
+ System.arraycopy(t, 0, output, 0, LIMB_CNT);\r
+ }\r
+\r
+ /**\r
+ * Square a number: out = in**2\r
+ * <p>\r
+ * output must be distinct from the input. The inputs are reduced coefficient form, the output is\r
+ * not.\r
+ * <p>\r
+ * out[x] <= 14 * the largest product of the input limbs.\r
+ */\r
+ private static void squareInner(long[] out, long[] in) {\r
+ out[0] = in[0] * in[0];\r
+ out[1] = 2 * in[0] * in[1];\r
+ out[2] = 2 * (in[1] * in[1] + in[0] * in[2]);\r
+ out[3] = 2 * (in[1] * in[2] + in[0] * in[3]);\r
+ out[4] = in[2] * in[2]\r
+ + 4 * in[1] * in[3]\r
+ + 2 * in[0] * in[4];\r
+ out[5] = 2 * (in[2] * in[3] + in[1] * in[4] + in[0] * in[5]);\r
+ out[6] = 2 * (in[3] * in[3] + in[2] * in[4] + in[0] * in[6] + 2 * in[1] * in[5]);\r
+ out[7] = 2 * (in[3] * in[4] + in[2] * in[5] + in[1] * in[6] + in[0] * in[7]);\r
+ out[8] = in[4] * in[4]\r
+ + 2 * (in[2] * in[6] + in[0] * in[8] + 2 * (in[1] * in[7] + in[3] * in[5]));\r
+ out[9] = 2 * (in[4] * in[5] + in[3] * in[6] + in[2] * in[7] + in[1] * in[8] + in[0] * in[9]);\r
+ out[10] = 2 * (in[5] * in[5]\r
+ + in[4] * in[6]\r
+ + in[2] * in[8]\r
+ + 2 * (in[3] * in[7] + in[1] * in[9]));\r
+ out[11] = 2 * (in[5] * in[6] + in[4] * in[7] + in[3] * in[8] + in[2] * in[9]);\r
+ out[12] = in[6] * in[6]\r
+ + 2 * (in[4] * in[8] + 2 * (in[5] * in[7] + in[3] * in[9]));\r
+ out[13] = 2 * (in[6] * in[7] + in[5] * in[8] + in[4] * in[9]);\r
+ out[14] = 2 * (in[7] * in[7] + in[6] * in[8] + 2 * in[5] * in[9]);\r
+ out[15] = 2 * (in[7] * in[8] + in[6] * in[9]);\r
+ out[16] = in[8] * in[8] + 4 * in[7] * in[9];\r
+ out[17] = 2 * in[8] * in[9];\r
+ out[18] = 2 * in[9] * in[9];\r
+ }\r
+\r
+ /**\r
+ * Returns in^2.\r
+ * <p>\r
+ * On entry: The |in| argument is in reduced coefficients form and |in[i]| < 2^27.\r
+ * <p>\r
+ * On exit: The |output| argument is in reduced coefficients form (indeed, one need only provide\r
+ * storage for 10 limbs) and |out[i]| < 2^26.\r
+ */\r
+ static void square(long[] output, long[] in) {\r
+ long[] t = new long[19];\r
+ squareInner(t, in);\r
+ // |t[i]| < 14*2^54 because the largest product of two limbs will be < 2^(27+27) and SquareInner\r
+ // adds together, at most, 14 of those products.\r
+ reduceSizeByModularReduction(t);\r
+ reduceCoefficients(t);\r
+ // |t[i]| < 2^26\r
+ System.arraycopy(t, 0, output, 0, LIMB_CNT);\r
+ }\r
+\r
+ /**\r
+ * Takes a little-endian, 32-byte number and expands it into mixed radix form.\r
+ */\r
+ static long[] expand(byte[] input) {\r
+ long[] output = new long[LIMB_CNT];\r
+ for (int i = 0; i < LIMB_CNT; i++) {\r
+ output[i] = ((((long) (input[EXPAND_START[i]] & 0xff))\r
+ | ((long) (input[EXPAND_START[i] + 1] & 0xff)) << 8\r
+ | ((long) (input[EXPAND_START[i] + 2] & 0xff)) << 16\r
+ | ((long) (input[EXPAND_START[i] + 3] & 0xff)) << 24) >> EXPAND_SHIFT[i]) & MASK[i & 1];\r
+ }\r
+ return output;\r
+ }\r
+\r
+ /**\r
+ * Takes a fully reduced mixed radix form number and contract it into a little-endian, 32-byte\r
+ * array.\r
+ * <p>\r
+ * On entry: |input_limbs[i]| < 2^26\r
+ */\r
+ @SuppressWarnings("NarrowingCompoundAssignment")\r
+ static byte[] contract(long[] inputLimbs) {\r
+ long[] input = Arrays.copyOf(inputLimbs, LIMB_CNT);\r
+ for (int j = 0; j < 2; j++) {\r
+ for (int i = 0; i < 9; i++) {\r
+ // This calculation is a time-invariant way to make input[i] non-negative by borrowing\r
+ // from the next-larger limb.\r
+ int carry = -(int) ((input[i] & (input[i] >> 31)) >> SHIFT[i & 1]);\r
+ input[i] = input[i] + (carry << SHIFT[i & 1]);\r
+ input[i + 1] -= carry;\r
+ }\r
+\r
+ // There's no greater limb for input[9] to borrow from, but we can multiply by 19 and borrow\r
+ // from input[0], which is valid mod 2^255-19.\r
+ {\r
+ int carry = -(int) ((input[9] & (input[9] >> 31)) >> 25);\r
+ input[9] += (carry << 25);\r
+ input[0] -= (carry * 19);\r
+ }\r
+\r
+ // After the first iteration, input[1..9] are non-negative and fit within 25 or 26 bits,\r
+ // depending on position. However, input[0] may be negative.\r
+ }\r
+\r
+ // The first borrow-propagation pass above ended with every limb except (possibly) input[0]\r
+ // non-negative.\r
+ //\r
+ // If input[0] was negative after the first pass, then it was because of a carry from input[9].\r
+ // On entry, input[9] < 2^26 so the carry was, at most, one, since (2**26-1) >> 25 = 1. Thus\r
+ // input[0] >= -19.\r
+ //\r
+ // In the second pass, each limb is decreased by at most one. Thus the second borrow-propagation\r
+ // pass could only have wrapped around to decrease input[0] again if the first pass left\r
+ // input[0] negative *and* input[1] through input[9] were all zero. In that case, input[1] is\r
+ // now 2^25 - 1, and this last borrow-propagation step will leave input[1] non-negative.\r
+ {\r
+ int carry = -(int) ((input[0] & (input[0] >> 31)) >> 26);\r
+ input[0] += (carry << 26);\r
+ input[1] -= carry;\r
+ }\r
+\r
+ // All input[i] are now non-negative. However, there might be values between 2^25 and 2^26 in a\r
+ // limb which is, nominally, 25 bits wide.\r
+ for (int j = 0; j < 2; j++) {\r
+ for (int i = 0; i < 9; i++) {\r
+ int carry = (int) (input[i] >> SHIFT[i & 1]);\r
+ input[i] &= MASK[i & 1];\r
+ input[i + 1] += carry;\r
+ }\r
+ }\r
+\r
+ {\r
+ int carry = (int) (input[9] >> 25);\r
+ input[9] &= 0x1ffffff;\r
+ input[0] += 19 * carry;\r
+ }\r
+\r
+ // If the first carry-chain pass, just above, ended up with a carry from input[9], and that\r
+ // caused input[0] to be out-of-bounds, then input[0] was < 2^26 + 2*19, because the carry was,\r
+ // at most, two.\r
+ //\r
+ // If the second pass carried from input[9] again then input[0] is < 2*19 and the input[9] ->\r
+ // input[0] carry didn't push input[0] out of bounds.\r
+\r
+ // It still remains the case that input might be between 2^255-19 and 2^255. In this case,\r
+ // input[1..9] must take their maximum value and input[0] must be >= (2^255-19) & 0x3ffffff,\r
+ // which is 0x3ffffed.\r
+ int mask = gte((int) input[0], 0x3ffffed);\r
+ for (int i = 1; i < LIMB_CNT; i++) {\r
+ mask &= eq((int) input[i], MASK[i & 1]);\r
+ }\r
+\r
+ // mask is either 0xffffffff (if input >= 2^255-19) and zero otherwise. Thus this conditionally\r
+ // subtracts 2^255-19.\r
+ input[0] -= mask & 0x3ffffed;\r
+ input[1] -= mask & 0x1ffffff;\r
+ for (int i = 2; i < LIMB_CNT; i += 2) {\r
+ input[i] -= mask & 0x3ffffff;\r
+ input[i + 1] -= mask & 0x1ffffff;\r
+ }\r
+\r
+ for (int i = 0; i < LIMB_CNT; i++) {\r
+ input[i] <<= EXPAND_SHIFT[i];\r
+ }\r
+ byte[] output = new byte[FIELD_LEN];\r
+ for (int i = 0; i < LIMB_CNT; i++) {\r
+ output[EXPAND_START[i]] |= input[i] & 0xff;\r
+ output[EXPAND_START[i] + 1] |= (input[i] >> 8) & 0xff;\r
+ output[EXPAND_START[i] + 2] |= (input[i] >> 16) & 0xff;\r
+ output[EXPAND_START[i] + 3] |= (input[i] >> 24) & 0xff;\r
+ }\r
+ return output;\r
+ }\r
+\r
+ /**\r
+ * Computes inverse of z = z(2^255 - 21)\r
+ * <p>\r
+ * Shamelessly copied from agl's code which was shamelessly copied from djb's code. Only the\r
+ * comment format and the variable namings are different from those.\r
+ */\r
+ static void inverse(long[] out, long[] z) {\r
+ long[] z2 = new long[Field25519.LIMB_CNT];\r
+ long[] z9 = new long[Field25519.LIMB_CNT];\r
+ long[] z11 = new long[Field25519.LIMB_CNT];\r
+ long[] z2To5Minus1 = new long[Field25519.LIMB_CNT];\r
+ long[] z2To10Minus1 = new long[Field25519.LIMB_CNT];\r
+ long[] z2To20Minus1 = new long[Field25519.LIMB_CNT];\r
+ long[] z2To50Minus1 = new long[Field25519.LIMB_CNT];\r
+ long[] z2To100Minus1 = new long[Field25519.LIMB_CNT];\r
+ long[] t0 = new long[Field25519.LIMB_CNT];\r
+ long[] t1 = new long[Field25519.LIMB_CNT];\r
+\r
+ square(z2, z); // 2\r
+ square(t1, z2); // 4\r
+ square(t0, t1); // 8\r
+ mult(z9, t0, z); // 9\r
+ mult(z11, z9, z2); // 11\r
+ square(t0, z11); // 22\r
+ mult(z2To5Minus1, t0, z9); // 2^5 - 2^0 = 31\r
+\r
+ square(t0, z2To5Minus1); // 2^6 - 2^1\r
+ square(t1, t0); // 2^7 - 2^2\r
+ square(t0, t1); // 2^8 - 2^3\r
+ square(t1, t0); // 2^9 - 2^4\r
+ square(t0, t1); // 2^10 - 2^5\r
+ mult(z2To10Minus1, t0, z2To5Minus1); // 2^10 - 2^0\r
+\r
+ square(t0, z2To10Minus1); // 2^11 - 2^1\r
+ square(t1, t0); // 2^12 - 2^2\r
+ for (int i = 2; i < 10; i += 2) { // 2^20 - 2^10\r
+ square(t0, t1);\r
+ square(t1, t0);\r
+ }\r
+ mult(z2To20Minus1, t1, z2To10Minus1); // 2^20 - 2^0\r
+\r
+ square(t0, z2To20Minus1); // 2^21 - 2^1\r
+ square(t1, t0); // 2^22 - 2^2\r
+ for (int i = 2; i < 20; i += 2) { // 2^40 - 2^20\r
+ square(t0, t1);\r
+ square(t1, t0);\r
+ }\r
+ mult(t0, t1, z2To20Minus1); // 2^40 - 2^0\r
+\r
+ square(t1, t0); // 2^41 - 2^1\r
+ square(t0, t1); // 2^42 - 2^2\r
+ for (int i = 2; i < 10; i += 2) { // 2^50 - 2^10\r
+ square(t1, t0);\r
+ square(t0, t1);\r
+ }\r
+ mult(z2To50Minus1, t0, z2To10Minus1); // 2^50 - 2^0\r
+\r
+ square(t0, z2To50Minus1); // 2^51 - 2^1\r
+ square(t1, t0); // 2^52 - 2^2\r
+ for (int i = 2; i < 50; i += 2) { // 2^100 - 2^50\r
+ square(t0, t1);\r
+ square(t1, t0);\r
+ }\r
+ mult(z2To100Minus1, t1, z2To50Minus1); // 2^100 - 2^0\r
+\r
+ square(t1, z2To100Minus1); // 2^101 - 2^1\r
+ square(t0, t1); // 2^102 - 2^2\r
+ for (int i = 2; i < 100; i += 2) { // 2^200 - 2^100\r
+ square(t1, t0);\r
+ square(t0, t1);\r
+ }\r
+ mult(t1, t0, z2To100Minus1); // 2^200 - 2^0\r
+\r
+ square(t0, t1); // 2^201 - 2^1\r
+ square(t1, t0); // 2^202 - 2^2\r
+ for (int i = 2; i < 50; i += 2) { // 2^250 - 2^50\r
+ square(t0, t1);\r
+ square(t1, t0);\r
+ }\r
+ mult(t0, t1, z2To50Minus1); // 2^250 - 2^0\r
+\r
+ square(t1, t0); // 2^251 - 2^1\r
+ square(t0, t1); // 2^252 - 2^2\r
+ square(t1, t0); // 2^253 - 2^3\r
+ square(t0, t1); // 2^254 - 2^4\r
+ square(t1, t0); // 2^255 - 2^5\r
+ mult(out, t1, z11); // 2^255 - 21\r
+ }\r
+\r
+\r
+ /**\r
+ * Returns 0xffffffff iff a == b and zero otherwise.\r
+ */\r
+ private static int eq(int a, int b) {\r
+ a = ~(a ^ b);\r
+ a &= a << 16;\r
+ a &= a << 8;\r
+ a &= a << 4;\r
+ a &= a << 2;\r
+ a &= a << 1;\r
+ return a >> 31;\r
+ }\r
+\r
+ /**\r
+ * returns 0xffffffff if a >= b and zero otherwise, where a and b are both non-negative.\r
+ */\r
+ private static int gte(int a, int b) {\r
+ a -= b;\r
+ // a >= 0 iff a >= b.\r
+ return ~(a >> 31);\r
+ }\r
+}\r