2015年1月21日星期三

Four modular inverse algorithms - magma implementation

Conclusion:
 Algorithm 2.20 = Algorithm 9.4.4  > Algorithm 2.22 > Algorithm 9.4.5
Here bigger means faster, less iteration rounds.
But for Mersenne number, still need some time to test.

// Algorithm 2.20,  from "prime number: a computational perspective" P465
// modular inverse for Mersenne number 2^q-1
SetSeed(2015);
q:=7;
p:=2^q-1;
x:=2^13+1; // Random()

a:=x; //bak
//initialize
u:=x;
v:=p;
x1:=1;
x2:=0;

k:=1;
//loop
while u ne 1 do
q:=v div u;
r:=v-q*u;
x:=x2-q*x1;

v:=u;
u:=r;
x2:=x1;
x1:=x;
        k:=k+1;
       printf "%o %o %o\n", x1, x2, u, v;
end while;

invp:= x1 mod p;
Verify:= invp*a mod p;

printf "interation=\n %o\n", k;
printf "inverse=\n %o\n", invp;
printf "x=\n %o\n", x;
printf "Verify=\n %o", Verify;

// Algorithm 2.22 Binary algorithm for inversion in Fp,  from "Guide to ECC" P41
// binary Extended Euclidean algorithm, modular inverse
SetSeed(2015);
q:=7;
p:=2^q-1;
x:=11; // Random()

//initialize
u:=x;
v:=p;
x1:=1;
x2:=0;

k:=1;
//loop
while (u ne 1) and (v ne 1) do
while u mod 2 eq 0 do
u:=u div 2;
if x1 mod 2 eq 0 then
x1:=x1 div 2;
else
x1:=x1+p;
x1:=x1 div 2;
end if;
end while;

while v mod 2 eq 0 do
v:=v div 2;
if x2 mod 2 eq 0 then
x2:=x2 div 2;
else
x2:=x2+p;
x2:=x2 div 2;
end if;
end while;

if (u gt v) or (u eq v) then
u:=u-v;
x1:=x1-x2;
else
v:=v-u;
x2:=x2-x1;
end if;

if u eq 1 then
invp:= x1 mod p;
else
invp:= x2 mod p;
end if;

        k:=k+1;
       printf "%o %o %o %o\n", u, v, x1, x2;
end while;

Verify:= invp*x mod p;

printf "interation=\n %o\n", k;
printf "inverse=\n %o\n", invp;
printf "x=\n %o\n", x;
printf "Verify=\n %o", Verify;

// Algorithm 9.4.4,  from "prime number: a computational perspective" P465
// modular inverse for Mersenne number 2^q-1

SetSeed(2015);
q:=7;
p:=2^q-1;
x:=11; // Random()

//initialize
a:=1;
z:=x mod p;

k:=1;
//loop
while z ne 1 do
q:=-(p div z);
z:=p+q*z;
a:=(q*a) mod p;

        k:=k+1;
       printf "%o %o %o\n", q, z, a;
end while;

invp:= a;
Verify:= invp*x mod p;

printf "interation=\n %o\n", k;
printf "inverse=\n %o\n", a;
printf "x=\n %o\n", x;
printf "Verify=\n %o", Verify;

// Algorithm 9.4.5,  from "prime number: a computational perspective" P466
// modular inverse for Mersenne number 2^q-1

SetSeed(2015);
q:=7;
p:=2^q-1;
x:=11; // Random()

//initialize
a:=1;
b:=0;
y:=x;
z:=p;

k:=1;
//Relational Reduction
//while k le 15 do
while k gt 0 do
 //find e
 e:=0;
 while (y mod 2) eq 0 do
  y:=y div 2;
  e:=e+1;
 end while;
 a:=a*2^(q-e) mod p;
 if y eq 1  then
  break;
 end if;

 tmp_a:=a;
 a:=a+b;
 b:=tmp_a;
 tmp_y:=y;
 y:=y+z;
 z:=tmp_y;

        k:=k+1;
       printf "%o %o %o %o\n", a, b, y, z;
end while;

invp:= a;
Verify:= invp*x mod p;

printf "interation=\n %o\n", k;
printf "inverse=\n %o\n", a;
printf "x=\n %o\n", x;
printf "Verify=\n %o", Verify;

2015年1月16日星期五

modelsim makefile

Ref: https://github.com/potentialventures/cocotb/blob/master/makefiles/simulators/Makefile.modelsim

###############################################################################
# Copyright (c) 2013 Potential Ventures Ltd
# Copyright (c) 2013 SolarFlare Communications Inc
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of Potential Ventures Ltd,
#       SolarFlare Communications Inc nor the
#       names of its contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL POTENTIAL VENTURES LTD BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
###############################################################################

VPI_LIB := vpi

# Modelsim is 32-bit only
ARCH:=i686

ifdef VERILOG_INCLUDE_DIRS
VLOG_ARGS += +incdir+$(VERILOG_INCLUDE_DIRS)
endif

ifeq ($(GUI),1)
SIM_CMD = vsim -gui
VSIM_ARGS += -onfinish stop
else
SIM_CMD = vsim -c
VSIM_ARGS += -onfinish exit
endif

ifeq ($(GPI_IMPL),vhpi)
VSIM_ARGS += -foreign \"cocotb_init libfli.so\" -trace_foreign 3
else
VSIM_ARGS += -pli libvpi.$(LIB_EXT)
endif

$(SIM_BUILD)/runsim.do : $(VHDL_SOURCES) $(VERILOG_SOURCES) $(CUSTOM_SIM_DEPS) | $(SIM_BUILD)
echo "vlib work" > $@
ifneq ($(VERILOG_SOURCES),)
echo "vlog -timescale 1ns/100ps -mfcu +acc=rmb -sv $(VLOG_ARGS) $(VERILOG_SOURCES)" >> $@
else
echo "vcom $(VHDL_ARGS) $(VHDL_SOURCES)" >> $@
endif
echo "vsim $(VSIM_ARGS) $(TOPLEVEL)" >> $@
ifneq ($(GUI),1)
echo "run -all" >> $@
echo "quit" >> $@
endif


ifeq ($(OS),MINGW32_NT-6.1)

ifeq ($(MODELSIM_PATH),)
$(error "Need to set MODELSIM_PATH")
endif

EXTRA_LIBS := -lmtipli
EXTRA_LIBDIRS := -L$(MODELSIM_PATH)/win32aloem
OLD_PATH := $(shell echo "$(PATH)" | sed 's/(/\\(/g' | sed 's/)/\\)/g' | sed 's/ /\\ /g')
LIB_LOAD := PATH=$(OLD_PATH):$(LIB_DIR)

else
LIB_LOAD := LD_LIBRARY_PATH=$(LIB_DIR):$(LD_LIBRARY_PATH)
endif

results.xml: $(SIM_BUILD)/runsim.do $(COCOTB_LIBS) $(COCOTB_VPI_LIB)
cd $(SIM_BUILD) && $(LIB_LOAD) MODULE=$(MODULE) TESTCASE=$(TESTCASE) TOPLEVEL=$(TOPLEVEL) \
PYTHONPATH=$(LIB_DIR):$(SIM_ROOT):$(PWD):$(PYTHONPATH) \
$(SIM_CMD) -do runsim.do 2>&1 | tee sim.log

clean::
-rm -rf $(SIM_BUILD)

2015年1月15日星期四

LPN cryptosystem

http://www.ntu.edu.sg/home/rxlu/slide/20131102chenli.pdf
这个是利用LPN的难解性构建的一个系统,需要很好的随机数发生器。

2015年1月14日星期三

modelsim xilinx library

 第一次使用modelsim SE的时候,要编译库后才能够使用,一般网上有用命令行方式编译,但是xilinx为我们提供了一个GUI界面,呵呵,相当方便。

        在终端输入compxlibgui,这个就出现了编译库的界面,相当简单,如果要全部编译,一直点next即可。哈哈(但是前提要安装了xilinx ISE)。

编译后的库文件在H:/Xilinx13/ISE_DS/ISE/verilog/mti_se/6.5/nt/中,其中xilinxcorelib_ver是IP核行为级验证模型。

--------------------仿真Sram
利用Modelsim脚本编译的时候,需要加入glbl.v
#vlog D:/SVN_code/Xilinx/RSA2048/ipcore_dir/BLK_MEM_GEN_V6_1.v
vlog D:/SVN_code/Xilinx/RSA2048/ipcore_dir/sram1.v

vlog H:/Xilinx13/ISE_DS/ISE/verilog/src/glbl.v

vsim -L H:/Xilinx13/ISE_DS/ISE/verilog/mti_se/6.5/nt/xilinxcorelib_ver -novopt work.testbench glbl


当然,假如只用SRAM一个IP的话,可以只编译BLK_MEM_GEN_V6_1.v即可。
但是注意`timescale的定义,在BLK_MEM_GEN_V6_1.v中,定义的是绝对延时,比如输出数据的latency会有FLOP_DELAY个ps,所以在设计文件中的时钟频率不能太高,否则会出现多周期的延时,一般设置为`timescale 1ns/1ps


2015年1月13日星期二

verilog diary

1, use `ifdef/`else/`endif to define parameters and compiled code for different platforms
especially for srams
2, use veriloggen.pl to generate state machine

ISE using ISIM to do simulation

step:
1, turn Design/ View pannel from implementation to simulation'
2, from Processes pannel, run or rerun Simulate Behavior Model
3, to see memory content
go to inst_memory/ inst/ \nnative../ memory



“Design Summary” Section in the Map Report
Following is an example of the “Design Summary” section in the Map Report, which contains device utilization information:
Design Summary
--------------
Number of errors:      0
Number of warnings:    0
Slice Logic Utilization:
   Number of Slice Registers:                     8 out of  28,800    1%
     Number used as Flip Flops:                   8
   Number of Slice LUTs:                          7 out of  28,800    1%
     Number used as logic:                        2 out of  28,800    1%
       Number using O6 output only:               2
     Number used as Memory:                       5 out of   7,680    1%
       Number used as Shift Register:             5
         Number using O6 output only:             5

Slice Logic Distribution:
   Number of occupied Slices:                     3 out of   7,200    1% (for area count, just use this number)
     Number of occupied SLICEMs:                  2 out of   1,920    1%
     Number of occupied SLICELs:                  1 out of   5,280    1%
   Number of LUT Flip Flop pairs used:            8 (one slice can have more than 1pair of LUT+Flip Flop, so that's why this number is bigger)
     Number with an unused Flip Flop:             0 out of       8    0%
     Number with an unused LUT:                   1 out of       8   12%
     Number of fully used LUT-FF pairs:           7 out of       8   87%
     Number of unique control sets:               1
     Number of slice register sites lost 
       to control set restrictions:               3 out of  28,800    1%

A LUT Flip Flop pair for this architecture represents one LUT paired with one Flip Flop within a slice.  A control set is a unique combination of clock, reset, set, and enable signals for a registered element. The Slice Logic Distribution report is not meaningful if the design is over-mapped for a non-slice resource or if Placement fails. OVERMAPPING of BRAM resources should be ignored if the design is   over-mapped for a non-BRAM resource or if placement fails.

IO Utilization:
   Number of bonded IOBs:                         7 out of     220    3%

Specific Feature Utilization:
   Number of BUFG/BUFGCTRLs:                      1 out of      32    3%
     Number used as BUFGs:                        1

Average Fanout of Non-Clock Nets:                2.08

2015年1月9日星期五

ecc encoder

[1] PUFKY: A Fully Functional PUF-based Cryptographic Key Generator
3.3 Syndrome Generation and Error Decoding for C REP and C BCH
Repetition code C REP . The syndrome generation of x n REP consists of pairwise
XOR-ing x 1 with each remaining bit of x n REP , or h i = x 1 ⊕ x i+1 . Error decoding
is based on a Hamming weight check of the syndrome s n REP −1 , which immediately
yields the value for the first error bit e 1 . The remaining error bits are again
obtained by a pairwise XOR of e 1 with each of the syndrome bits, but this step is
discarded in the syndrome construction. In our design, both syndrome generation
and error decoding of a repetition code are fully combinatorial.
so bch(7,1,3)
encoder:
input: PUF readout {m0, m1, ..., m6} 7-bits
output:
helper data: {m1^m0, m2^m0, ..., m6^m0} 6-bits
decoder: 位于 Pufkey/source/rep_decoder.vhd
input: PUF readout {m'0, m'1, ..., m'6} 8-bit, helper data: {m1^m0, m2^m0, ..., m6^m0} 6-bits
output:
recovered: r = {m0, m0^(m1^m'1), ..., m0^(m7^m'7)} 7-bits
if weight(r) >=4
r = 0
else
r = 1

BCH code C BCH . Since BCH codes are cyclical codes, their syndrome generation
is a finite field division by the code’s generator polynomial. This is efficiently
implemented in hardware as an LFSR evaluation of length (n BCH − k BCH ).

The error decoding step of a BCH code is more complex and requires the
largest design effort of all elements in our secure sketch. Most BCH decoders are
designed with a focus on throughput and use systolic array designs, e.g. [19, 20, 22].
Aiming for a size-optimized implementation, we propose a serialized, minimalistic
coprocessor design with a 10-bit application-specific instruction set and limited
conditional execution support. Although highly optimized towards BCH decoding,
the architecture is generic in the sense that it can decode any BCH code, including
shortened versions, requiring only a slight change of firmware and memory size.
The datapath consists of two blocks: an address and a data block. To optimize
array indexing, all addressing is done indirectly using a five element address
RAM, which is efficiently updated by a dedicated address ALU. The output of the
address RAM is directly connected to the data RAM. The data block consists of
data RAM and an ALU which is used mainly for multiply-accumulate operations
over F 2 u . To minimize the size, this ALU contains only a single register. All other
necessary operands come directly from the data RAM. A high-level overview of
the coprocessor architecture is shown in Fig. 3.



[2] Implementation of BCH Code (n, k) Encoder using lfsr
输入多项式h(x) = h0 + h1*x + h2* x^2 + … hn*x^n
输入次序为 hn,... , h2, h1, h0 从左到右

对于bch(255, 171,11), n应该为254, 即一共输入255bits