Znaci sekvencijalni quick sort liste je brzi od merge sorta ;P
Ovo je neki moj quick sort koji je drasticno brzi od C++ (gcc) sorta liste:
Code:
#ifndef VLIBQSORT_H
#define VLIBQSORT_H
#include <pthread.h>
#include <algorithm>
#include <functional>
namespace VLib{
using std::swap;
template <typename T, typename F>
inline T median_of_three(T a, T b, T c, F f)
{
if (f(*a , *b))
if (f(*b , *c))
return b;
else if (f(*a , *c))
return c;
else
return a;
else if (f(*a , *c))
return a;
else if (f(*b , *c))
return c;
else
return b;
}
template <typename T, typename F>
void insertion_sort(T begin, T end, F f)
{
if(begin == end)return;
T i = begin;
++i;
for(;i!=end;++i)
{
typeof(*i) v = *i;
T j = i;
--j;
while(f(v,*j))
{
T k = j;
++k;
*k = *j;
if(j == begin){--j;break;}
--j;
}
++j;
*j = v;
}
}
template <typename T,typename F>
void qsort(T begin, T end, unsigned size, int t, F f)
{
struct Data{
T i, j; unsigned k,t; F f;
static void* qs(void* d)
{
Data* t = (Data*)d;
VLib::qsort(t->i,t->j,t->k,t->t-1,t->f);
return 0;
}
};
class Thread{
public:
typedef void* (*f_t)(void*);
Thread(f_t f,void* p)
: f_(f),data_(p)
{
pthread_attr_init(&attr_);
pthread_attr_setstacksize(&attr_,256*1024);
}
void start()
{
pthread_create(&tid_,&attr_,f_,data_);
}
void join()
{
pthread_join(tid_,0);
}
~Thread()
{
pthread_attr_destroy(&attr_);
join();
}
private:
pthread_t tid_;
pthread_attr_t attr_;
f_t f_;
void* data_;
};
if(begin == end)return;
T high = end, low = begin;
if(!size)
{
T tmp = begin;
while(tmp!=end){ high=tmp++;++size; }
}
else --high;
if(size == 1)return;
if(size <= 16)
{
insertion_sort(begin,end,f);
return;
}
unsigned count = 0;
T it = begin;
while(++count<size/2)++it;
it = median_of_three(begin,it,high,f);
typeof(*it) pivot = *it;
unsigned counthigh = 0,countlow = 0;
do
{
while(high != low && f(pivot,*high)){ --high;++counthigh; }
while(low != high && f(*low,pivot)){ ++low;++countlow; }
if(low != high && !f(*low,pivot) && !f(pivot,*low) && !f(pivot,*high) && !f(*high,pivot))
{
while(high != low && !f(*high,*low) && !f(*low,*high)){ --high;++counthigh; }
}
swap(*low,*high);
}while(low != high);
T i = low;
while(++i != end && !f(*i,*low) && !f(*low,*i) )--counthigh;
if(t>0 && size > 1000)
{
Data d1 = {begin,low,countlow,t,f}, d2 = {i,end,counthigh,t,f};
Thread t1(Data::qs,&d1),t2(Data::qs,&d2);
t1.start();
t2.start();
}
else
{
VLib::qsort(begin,low,countlow,0,f);
VLib::qsort(i,end,counthigh,0,f);
}
}
template <typename T>
inline void qsort(T begin, T end, unsigned size = 0, int t = 2)
{
VLib::qsort(begin,end,size,t,std::less<typename std::iterator_traits<T>::value_type>());
}
}
#endif
#ifndef VLIBQSORT_H
#define VLIBQSORT_H
#include <pthread.h>
#include <algorithm>
#include <functional>
namespace VLib{
using std::swap;
template <typename T, typename F>
inline T median_of_three(T a, T b, T c, F f)
{
if (f(*a , *b))
if (f(*b , *c))
return b;
else if (f(*a , *c))
return c;
else
return a;
else if (f(*a , *c))
return a;
else if (f(*b , *c))
return c;
else
return b;
}
template <typename T, typename F>
void insertion_sort(T begin, T end, F f)
{
if(begin == end)return;
T i = begin;
++i;
for(;i!=end;++i)
{
typeof(*i) v = *i;
T j = i;
--j;
while(f(v,*j))
{
T k = j;
++k;
*k = *j;
if(j == begin){--j;break;}
--j;
}
++j;
*j = v;
}
}
template <typename T,typename F>
void qsort(T begin, T end, unsigned size, int t, F f)
{
struct Data{
T i, j; unsigned k,t; F f;
static void* qs(void* d)
{
Data* t = (Data*)d;
VLib::qsort(t->i,t->j,t->k,t->t-1,t->f);
return 0;
}
};
class Thread{
public:
typedef void* (*f_t)(void*);
Thread(f_t f,void* p)
: f_(f),data_(p)
{
pthread_attr_init(&attr_);
pthread_attr_setstacksize(&attr_,256*1024);
}
void start()
{
pthread_create(&tid_,&attr_,f_,data_);
}
void join()
{
pthread_join(tid_,0);
}
~Thread()
{
pthread_attr_destroy(&attr_);
join();
}
private:
pthread_t tid_;
pthread_attr_t attr_;
f_t f_;
void* data_;
};
if(begin == end)return;
T high = end, low = begin;
if(!size)
{
T tmp = begin;
while(tmp!=end){ high=tmp++;++size; }
}
else --high;
if(size == 1)return;
if(size <= 16)
{
insertion_sort(begin,end,f);
return;
}
unsigned count = 0;
T it = begin;
while(++count<size/2)++it;
it = median_of_three(begin,it,high,f);
typeof(*it) pivot = *it;
unsigned counthigh = 0,countlow = 0;
do
{
while(high != low && f(pivot,*high)){ --high;++counthigh; }
while(low != high && f(*low,pivot)){ ++low;++countlow; }
if(low != high && !f(*low,pivot) && !f(pivot,*low) && !f(pivot,*high) && !f(*high,pivot))
{
while(high != low && !f(*high,*low) && !f(*low,*high)){ --high;++counthigh; }
}
swap(*low,*high);
}while(low != high);
T i = low;
while(++i != end && !f(*i,*low) && !f(*low,*i) )--counthigh;
if(t>0 && size > 1000)
{
Data d1 = {begin,low,countlow,t,f}, d2 = {i,end,counthigh,t,f};
Thread t1(Data::qs,&d1),t2(Data::qs,&d2);
t1.start();
t2.start();
}
else
{
VLib::qsort(begin,low,countlow,0,f);
VLib::qsort(i,end,counthigh,0,f);
}
}
template <typename T>
inline void qsort(T begin, T end, unsigned size = 0, int t = 2)
{
VLib::qsort(begin,end,size,t,std::less<typename std::iterator_traits<T>::value_type>());
}
}
#endif
rezultat:
Code:
~/.../examples/sort >>> ./bench
radix asm sort 536.088
asm bitonic sort 3226
C++ sort 1201.64
VLib sort 1737.32
C qsort 2548.3
C++ list sort 9933.06
VLib list sort 1847.04
all set
~/.../examples/sort >>> ./bench
radix asm sort 536.088
asm bitonic sort 3226
C++ sort 1201.64
VLib sort 1737.32
C qsort 2548.3
C++ list sort 9933.06
VLib list sort 1847.04
all set
Znaci moj sort je 5* sic brzi od std C++ sorta dok je naravno sortiranje niza sporije.
Pazite sad asembler merge sort vs asembler radix sorta liste protiv radix sorta niza:
Code:
~/.../examples/sort >>> cat list_sort.asm
format elf64 executable 3 at 0x100000000
struc Node {
.next dq ?
.data dd ?
.size = $-.next
}
virtual at 0
n Node
end virtual
include 'import64.inc'
interpreter '/lib64/ld-linux-x86-64.so.2'
needed 'libc.so.6','./speedupavx2.so'
import atoi,printf,puts,exit,rand_r,time,malloc, \
radix_sort_avx2
segment executable
entry $
mov [N],4096*4096
cmp [rsp],dword 1
je .skip
mov rdi,[rsp+16]
call [atoi]
cmp eax,1
jl .exit
mov [N],eax
.skip:
xor edi,edi
call [time]
mov [seed],eax ; seed on time
mov rdi,fmt1
mov esi,[seed]
xor eax,eax
call [printf] ; print seed
mov rdi,fmt4
mov esi,[N]
xor eax,eax
call [printf]
mov rbx,list1
push rbx
call init_list
mov rbx,list2
push rbx
call init_list
add rsp,16
mov edi,[N]
imul rdi,4
call [malloc]
mov [array],rax
mov rdi,[list2]
mov rsi,[list1]
call assign_list
mov rdi,[array]
mov rsi,[list1]
call assign_array
mov rdi,fmtu
mov rbx,list1
push rbx
call print_list
add rsp,8
radix_avx2 equ 1
if defined radix_avx2
call init_time
mov rdi,[array]
mov esi,[N]
call [radix_sort_avx2]
mov rdi,fmtar
call time_me
end if
if 1
call init_time
mov rbx,[list1]
push rbx
call radix_sort
pop rbx
mov [list1],rbx
mov rdi,fmtr
call time_me
end if
if 1
call init_time
mov rbx,[list2]
push rbx
call sort
pop rbx
mov [list2],rbx
mov rdi,fmtm
call time_me
end if
mov rdi,[list1]
mov rsi,[list2]
call equal_list
mov rdi,fmte
test eax,eax
mov rbx,fmtne
cmovz rdi,rbx
call [puts]
mov rdi,[array]
mov rsi,[list2]
call equal_array
mov rdi,fmte
test eax,eax
mov rbx,fmtne
cmovz rdi,rbx
call [puts]
mov rdi,fmts
mov rbx,list1
push rbx
call print_list
add rsp,8
mov rdi,[list1]
call length
mov rdx,rcx
mov rdi,fmt3
mov rsi,n.size
xor eax,eax
call [printf]
xor edi,edi
.exit:
call [exit]
init_list:
mov ebx,[N]
mov r12,[rsp+8]
.L0:
mov edi,n.size
call [malloc]
mov rcx,[r12]
mov [rax+n.next],rcx
mov [r12],rax
mov rdi,seed
if 1
call [rand_r]
else
rdrand eax
end if
xor edx,edx
mov ecx,[N]
div ecx
mov rcx,[r12]
mov [rcx+n.data],edx
dec ebx
jnz .L0
ret
print_list:
call [puts]
mov rbx,[rsp+8]
mov rbx,[rbx]
mov r12,16
.L0:
test rbx,rbx
jz .exit
mov rdi,fmt2
mov rsi,[rbx+n.next]
mov edx,[rbx+n.data]
xor eax,eax
call [printf]
mov rbx,[rbx+n.next]
dec r12
jz .exit
jmp .L0
.exit:
ret
; rsi source, rdi dest
assign_list:
.L0:
test rsi,rsi
jz .exit
test rdi,rdi
jz .exit
mov eax,[rsi+n.data]
mov [rdi+n.data],eax
mov rsi,[rsi+n.next]
mov rdi,[rdi+n.next]
jmp .L0
.exit:
ret
assign_array:
.L0:
test rsi,rsi
jz .exit
mov eax,[rsi+n.data]
mov [rdi],eax
mov rsi,[rsi+n.next]
add rdi,4
jmp .L0
.exit:
ret
equal_list:
mov eax,1
.L0:
test rsi,rsi
jz .exit
test rdi,rdi
jz .exit
mov eax,[rsi+n.data]
cmp [rdi+n.data],eax
jnz .false
mov rsi,[rsi+n.next]
mov rdi,[rdi+n.next]
jmp .L0
.exit:
ret
.false:
xor eax,eax
ret
equal_array:
mov eax,1
.L0:
test rsi,rsi
jz .exit
mov eax,[rsi+n.data]
cmp [rdi],eax
jnz .false
mov rsi,[rsi+n.next]
add rdi,4
jmp .L0
.exit:
ret
.false:
xor eax,eax
ret
; [rsp+8] list to sort
sort:
mov rdi,[rsp+8]
call length
cmp rcx,1
jle .exit
shr rcx,1 ; middle
sub rsp,16 ; left,right
mov qword[rsp],0
mov qword[rsp+8],0
mov rbx,[rsp+8+16]
.L0: ; append to left
mov rax,[rsp]
mov rdx,[rbx+n.next]
mov [rbx+n.next],rax
mov [rsp],rbx
mov rbx,rdx
dec rcx
jnz .L0
.L1: ; append to right
mov rax,[rsp+8]
mov rdx,[rbx+n.next]
mov [rbx+n.next],rax
mov [rsp+8],rbx
mov rbx,[rbx+n.next]
mov rbx,rdx
test rbx,rbx
jnz .L1
sub rsp,8 ; result
mov rbx,[rsp+8]
mov [rsp],rbx
call sort
mov rbx,[rsp]
mov [rsp+8],rbx
mov rbx,[rsp+16]
mov [rsp],rbx
call sort
mov rbx,[rsp]
mov [rsp+16],rbx
call merge
mov rbx,[rsp]
add rsp,24
mov [rsp+8],rbx
.exit:
ret
; [rsp+8] output , [rsp+16] left, [rsp+24] right
merge:
sub rsp,8 ; append position
mov qword[rsp+16],0
mov qword[rsp],0
.L0:
cmp qword[rsp+24],0
jz .right
cmp qword[rsp+32],0
jz .left
mov rax,[rsp+24]
mov ebx,[rax+n.data]
mov rcx,[rsp+32]
cmp ebx,[rcx+n.data]
jl .add_left
.add_right:
cmp qword[rsp],0
je .just_set_right
mov rdx,[rsp]
mov [rdx+n.next],rcx
mov rdx,[rcx+n.next]
mov [rsp+32],rdx
mov qword[rcx+n.next],0
mov [rsp],rcx
jmp .L0
.add_left:
cmp qword[rsp],0
je .just_set_left
mov rdx,[rsp]
mov [rdx+n.next],rax
mov rdx,[rax+n.next]
mov [rsp+24],rdx
mov qword[rax+n.next],0
mov [rsp],rax
jmp .L0
.just_set_left:
mov rdx,[rax+n.next]
mov qword[rax+n.next],0
mov [rsp],rax
mov [rsp+16],rax
mov [rsp+24],rdx
jmp .L0
.just_set_right:
mov rdx,[rcx+n.next]
mov qword[rcx+n.next],0
mov [rsp],rcx
mov [rsp+16],rcx
mov [rsp+32],rdx
jmp .L0
.right:
cmp qword[rsp+32],0
jz .exit
mov rcx,[rsp+32]
cmp qword[rsp],0
je .just_set_right_only
mov rdx,[rsp]
mov [rdx+n.next],rcx
mov [rsp],rcx
mov rdx,[rcx+n.next]
mov qword[rcx+n.next],0
mov [rsp+32],rdx
jmp .right
.just_set_right_only:
mov rdx,[rcx+n.next]
mov qword[rcx+n.next],0
mov [rsp],rcx
mov [rsp+16],rcx
mov [rsp+32],rdx
jmp .right
.left:
cmp qword[rsp+24],0
jz .exit
mov rax,[rsp+24]
cmp qword[rsp],0
je .just_set_left_only
mov rdx,[rsp]
mov [rdx+n.next],rax
mov [rsp],rax
mov rdx,[rax+n.next]
mov qword[rax+n.next],0
mov [rsp+24],rdx
jmp .left
.just_set_left_only:
mov rdx,[rax+n.next]
mov qword[rax+n.next],0
mov [rsp],rax
mov [rsp+24],rax
mov [rsp+32],rdx
jmp .left
.exit:
add rsp,8
ret
; rdi input list, rcx count
length:
mov rcx,0
.L0:
test rdi,rdi
jz .exit
mov rdi,[rdi+n.next]
inc rcx
jmp .L0
.exit:
ret
; [rsp+8] list to sort
radix_sort:
sub rsp,32*8
xor ebx,ebx
.L0:
mov rdi,rsp
mov rcx,32
xor eax,eax
rep stosq
mov rdi,[rsp +32*8+8]
.L1:
test rdi,rdi
jz .next
mov ecx,ebx
shl ecx,2
mov esi,[rdi+n.data]
shr esi,cl
and esi,0fh
cmp qword[rsp+rsi*8],0
je .just_set
mov rdx,[rdi+n.next]
mov rax,[rsp+rsi*8+16*8]
mov [rax+n.next],rdi
mov [rsp+rsi*8+16*8],rdi
mov qword[rdi+n.next],0
mov rdi,rdx
jmp .L1
.just_set:
mov rdx,[rdi+n.next]
mov [rsp+rsi*8],rdi
mov [rsp+rsi*8+16*8],rdi
mov qword[rdi+n.next],0
mov rdi,rdx
jmp .L1
.next:
mov qword[rsp+32*8+8],0
xor edi,edi
xor ecx,ecx
.L2:
mov rsi,[rsp+rcx*8]
.L3:
test rsi,rsi
jz .next1
test rdi,rdi
jz .just_set1
mov [rdi+n.next],rsi
mov rdi,rsi
mov rdx,[rsi+n.next]
mov qword[rsi+n.next],0
mov rsi,rdx
jmp .L3
.just_set1:
mov rdi,rsi
mov [rsp+32*8+8],rsi
mov rdx,[rsi+n.next]
mov qword[rsi+n.next],0
mov rsi,rdx
jmp .L3
.next1:
inc ecx
cmp ecx,16
jl .L2
inc rbx
cmp rbx,8
jl .L0
.exit:
add rsp,32*8
ret
init_time:
rdtscp
shl rdx,32
or rax,rdx
mov [elapsed],rax
ret
time_me:
rdtscp
shl rdx,32
or rax,rdx
sub rax,[elapsed]
cvtsi2sd xmm0,rax
divsd xmm0,[clock]
mov rax,1
jmp [printf]
segment readable
clock dq 3.8e9
fmtu db 'unsorted',0ah,0
fmts db 'sorted' ,0ah,0
fmte db 'equal',0ah,0
fmtne db 'not equal',0ah,0
fmtm db 'list merge elapsed %f seconds',0ah,0
fmtr db 'list radix elapsed %f seconds',0ah,0
fmtar db 'array radix elapsed %f seconds',0ah,0
fmt1 db 'seed: %d',0ah,0
fmt2 db '%16p %d',0ah,0
fmt3 db 'size of node %d, length %d',0ah,0
fmt4 db "N: %d",0xa,0
segment writeable
list1 dq 0
list2 dq 0
array rq 1
elapsed rq 1
N rd 1; number of nodes
seed rd 1
;seed rd 1
~/.../examples/sort >>> cat list_sort.asm
format elf64 executable 3 at 0x100000000
struc Node {
.next dq ?
.data dd ?
.size = $-.next
}
virtual at 0
n Node
end virtual
include 'import64.inc'
interpreter '/lib64/ld-linux-x86-64.so.2'
needed 'libc.so.6','./speedupavx2.so'
import atoi,printf,puts,exit,rand_r,time,malloc, \
radix_sort_avx2
segment executable
entry $
mov [N],4096*4096
cmp [rsp],dword 1
je .skip
mov rdi,[rsp+16]
call [atoi]
cmp eax,1
jl .exit
mov [N],eax
.skip:
xor edi,edi
call [time]
mov [seed],eax ; seed on time
mov rdi,fmt1
mov esi,[seed]
xor eax,eax
call [printf] ; print seed
mov rdi,fmt4
mov esi,[N]
xor eax,eax
call [printf]
mov rbx,list1
push rbx
call init_list
mov rbx,list2
push rbx
call init_list
add rsp,16
mov edi,[N]
imul rdi,4
call [malloc]
mov [array],rax
mov rdi,[list2]
mov rsi,[list1]
call assign_list
mov rdi,[array]
mov rsi,[list1]
call assign_array
mov rdi,fmtu
mov rbx,list1
push rbx
call print_list
add rsp,8
radix_avx2 equ 1
if defined radix_avx2
call init_time
mov rdi,[array]
mov esi,[N]
call [radix_sort_avx2]
mov rdi,fmtar
call time_me
end if
if 1
call init_time
mov rbx,[list1]
push rbx
call radix_sort
pop rbx
mov [list1],rbx
mov rdi,fmtr
call time_me
end if
if 1
call init_time
mov rbx,[list2]
push rbx
call sort
pop rbx
mov [list2],rbx
mov rdi,fmtm
call time_me
end if
mov rdi,[list1]
mov rsi,[list2]
call equal_list
mov rdi,fmte
test eax,eax
mov rbx,fmtne
cmovz rdi,rbx
call [puts]
mov rdi,[array]
mov rsi,[list2]
call equal_array
mov rdi,fmte
test eax,eax
mov rbx,fmtne
cmovz rdi,rbx
call [puts]
mov rdi,fmts
mov rbx,list1
push rbx
call print_list
add rsp,8
mov rdi,[list1]
call length
mov rdx,rcx
mov rdi,fmt3
mov rsi,n.size
xor eax,eax
call [printf]
xor edi,edi
.exit:
call [exit]
init_list:
mov ebx,[N]
mov r12,[rsp+8]
.L0:
mov edi,n.size
call [malloc]
mov rcx,[r12]
mov [rax+n.next],rcx
mov [r12],rax
mov rdi,seed
if 1
call [rand_r]
else
rdrand eax
end if
xor edx,edx
mov ecx,[N]
div ecx
mov rcx,[r12]
mov [rcx+n.data],edx
dec ebx
jnz .L0
ret
print_list:
call [puts]
mov rbx,[rsp+8]
mov rbx,[rbx]
mov r12,16
.L0:
test rbx,rbx
jz .exit
mov rdi,fmt2
mov rsi,[rbx+n.next]
mov edx,[rbx+n.data]
xor eax,eax
call [printf]
mov rbx,[rbx+n.next]
dec r12
jz .exit
jmp .L0
.exit:
ret
; rsi source, rdi dest
assign_list:
.L0:
test rsi,rsi
jz .exit
test rdi,rdi
jz .exit
mov eax,[rsi+n.data]
mov [rdi+n.data],eax
mov rsi,[rsi+n.next]
mov rdi,[rdi+n.next]
jmp .L0
.exit:
ret
assign_array:
.L0:
test rsi,rsi
jz .exit
mov eax,[rsi+n.data]
mov [rdi],eax
mov rsi,[rsi+n.next]
add rdi,4
jmp .L0
.exit:
ret
equal_list:
mov eax,1
.L0:
test rsi,rsi
jz .exit
test rdi,rdi
jz .exit
mov eax,[rsi+n.data]
cmp [rdi+n.data],eax
jnz .false
mov rsi,[rsi+n.next]
mov rdi,[rdi+n.next]
jmp .L0
.exit:
ret
.false:
xor eax,eax
ret
equal_array:
mov eax,1
.L0:
test rsi,rsi
jz .exit
mov eax,[rsi+n.data]
cmp [rdi],eax
jnz .false
mov rsi,[rsi+n.next]
add rdi,4
jmp .L0
.exit:
ret
.false:
xor eax,eax
ret
; [rsp+8] list to sort
sort:
mov rdi,[rsp+8]
call length
cmp rcx,1
jle .exit
shr rcx,1 ; middle
sub rsp,16 ; left,right
mov qword[rsp],0
mov qword[rsp+8],0
mov rbx,[rsp+8+16]
.L0: ; append to left
mov rax,[rsp]
mov rdx,[rbx+n.next]
mov [rbx+n.next],rax
mov [rsp],rbx
mov rbx,rdx
dec rcx
jnz .L0
.L1: ; append to right
mov rax,[rsp+8]
mov rdx,[rbx+n.next]
mov [rbx+n.next],rax
mov [rsp+8],rbx
mov rbx,[rbx+n.next]
mov rbx,rdx
test rbx,rbx
jnz .L1
sub rsp,8 ; result
mov rbx,[rsp+8]
mov [rsp],rbx
call sort
mov rbx,[rsp]
mov [rsp+8],rbx
mov rbx,[rsp+16]
mov [rsp],rbx
call sort
mov rbx,[rsp]
mov [rsp+16],rbx
call merge
mov rbx,[rsp]
add rsp,24
mov [rsp+8],rbx
.exit:
ret
; [rsp+8] output , [rsp+16] left, [rsp+24] right
merge:
sub rsp,8 ; append position
mov qword[rsp+16],0
mov qword[rsp],0
.L0:
cmp qword[rsp+24],0
jz .right
cmp qword[rsp+32],0
jz .left
mov rax,[rsp+24]
mov ebx,[rax+n.data]
mov rcx,[rsp+32]
cmp ebx,[rcx+n.data]
jl .add_left
.add_right:
cmp qword[rsp],0
je .just_set_right
mov rdx,[rsp]
mov [rdx+n.next],rcx
mov rdx,[rcx+n.next]
mov [rsp+32],rdx
mov qword[rcx+n.next],0
mov [rsp],rcx
jmp .L0
.add_left:
cmp qword[rsp],0
je .just_set_left
mov rdx,[rsp]
mov [rdx+n.next],rax
mov rdx,[rax+n.next]
mov [rsp+24],rdx
mov qword[rax+n.next],0
mov [rsp],rax
jmp .L0
.just_set_left:
mov rdx,[rax+n.next]
mov qword[rax+n.next],0
mov [rsp],rax
mov [rsp+16],rax
mov [rsp+24],rdx
jmp .L0
.just_set_right:
mov rdx,[rcx+n.next]
mov qword[rcx+n.next],0
mov [rsp],rcx
mov [rsp+16],rcx
mov [rsp+32],rdx
jmp .L0
.right:
cmp qword[rsp+32],0
jz .exit
mov rcx,[rsp+32]
cmp qword[rsp],0
je .just_set_right_only
mov rdx,[rsp]
mov [rdx+n.next],rcx
mov [rsp],rcx
mov rdx,[rcx+n.next]
mov qword[rcx+n.next],0
mov [rsp+32],rdx
jmp .right
.just_set_right_only:
mov rdx,[rcx+n.next]
mov qword[rcx+n.next],0
mov [rsp],rcx
mov [rsp+16],rcx
mov [rsp+32],rdx
jmp .right
.left:
cmp qword[rsp+24],0
jz .exit
mov rax,[rsp+24]
cmp qword[rsp],0
je .just_set_left_only
mov rdx,[rsp]
mov [rdx+n.next],rax
mov [rsp],rax
mov rdx,[rax+n.next]
mov qword[rax+n.next],0
mov [rsp+24],rdx
jmp .left
.just_set_left_only:
mov rdx,[rax+n.next]
mov qword[rax+n.next],0
mov [rsp],rax
mov [rsp+24],rax
mov [rsp+32],rdx
jmp .left
.exit:
add rsp,8
ret
; rdi input list, rcx count
length:
mov rcx,0
.L0:
test rdi,rdi
jz .exit
mov rdi,[rdi+n.next]
inc rcx
jmp .L0
.exit:
ret
; [rsp+8] list to sort
radix_sort:
sub rsp,32*8
xor ebx,ebx
.L0:
mov rdi,rsp
mov rcx,32
xor eax,eax
rep stosq
mov rdi,[rsp +32*8+8]
.L1:
test rdi,rdi
jz .next
mov ecx,ebx
shl ecx,2
mov esi,[rdi+n.data]
shr esi,cl
and esi,0fh
cmp qword[rsp+rsi*8],0
je .just_set
mov rdx,[rdi+n.next]
mov rax,[rsp+rsi*8+16*8]
mov [rax+n.next],rdi
mov [rsp+rsi*8+16*8],rdi
mov qword[rdi+n.next],0
mov rdi,rdx
jmp .L1
.just_set:
mov rdx,[rdi+n.next]
mov [rsp+rsi*8],rdi
mov [rsp+rsi*8+16*8],rdi
mov qword[rdi+n.next],0
mov rdi,rdx
jmp .L1
.next:
mov qword[rsp+32*8+8],0
xor edi,edi
xor ecx,ecx
.L2:
mov rsi,[rsp+rcx*8]
.L3:
test rsi,rsi
jz .next1
test rdi,rdi
jz .just_set1
mov [rdi+n.next],rsi
mov rdi,rsi
mov rdx,[rsi+n.next]
mov qword[rsi+n.next],0
mov rsi,rdx
jmp .L3
.just_set1:
mov rdi,rsi
mov [rsp+32*8+8],rsi
mov rdx,[rsi+n.next]
mov qword[rsi+n.next],0
mov rsi,rdx
jmp .L3
.next1:
inc ecx
cmp ecx,16
jl .L2
inc rbx
cmp rbx,8
jl .L0
.exit:
add rsp,32*8
ret
init_time:
rdtscp
shl rdx,32
or rax,rdx
mov [elapsed],rax
ret
time_me:
rdtscp
shl rdx,32
or rax,rdx
sub rax,[elapsed]
cvtsi2sd xmm0,rax
divsd xmm0,[clock]
mov rax,1
jmp [printf]
segment readable
clock dq 3.8e9
fmtu db 'unsorted',0ah,0
fmts db 'sorted' ,0ah,0
fmte db 'equal',0ah,0
fmtne db 'not equal',0ah,0
fmtm db 'list merge elapsed %f seconds',0ah,0
fmtr db 'list radix elapsed %f seconds',0ah,0
fmtar db 'array radix elapsed %f seconds',0ah,0
fmt1 db 'seed: %d',0ah,0
fmt2 db '%16p %d',0ah,0
fmt3 db 'size of node %d, length %d',0ah,0
fmt4 db "N: %d",0xa,0
segment writeable
list1 dq 0
list2 dq 0
array rq 1
elapsed rq 1
N rd 1; number of nodes
seed rd 1
;seed rd 1
i radix asm sort:
Code:
align 16
; rdi array,rsi size
radix_sort_avx2:
push rbx
push rbp
mov rbp,rsp
sub rsp,16+16*8+16*8
mov [rbp-8],rdi
mov [rbp-16],rsi
xor rbx,rbx
vpxor ymm0,ymm0,ymm0
vmovdqu [rbp-(16+16*8)],ymm0
vmovdqu [rbp-(16+12*8)],ymm0
vmovdqu [rbp-(16+8*8)],ymm0
vmovdqu [rbp-(16+4*8)],ymm0
.L00: ; allocate 16 buckets
lea rdi,[rbp-(16+16*8)+rbx*8] ; destination
mov rdx,[rbp-16]
mov rsi,16
mov rax,16*16384
cmp rdx,rax ; if array is bigger then L2 cache
cmovg rsi,rax ; alignment, L2 cache size
lea rdx,[rdx*4] ; size
call [_posix_memalign] ; huge malloc is actually
; never allocated
; untill page is touched
test rax,rax
jnz .fexit
inc rbx
cmp rbx,16
jnz .L00
vpxor ymm0,ymm0,ymm0
vmovdqu [rbp-(16+16*8+16*8)],ymm0
vmovdqu [rbp-(16+16*8+12*8)],ymm0
vmovdqu [rbp-(16+16*8+8*8)],ymm0
vmovdqu [rbp-(16+16*8+4*8)],ymm0
xor rcx,rcx
.L0:
xor rdx,rdx
vmovd xmm2,ecx
; vpbroadcastd ymm2,xmm1
mov rdi,[rbp-8]
vpslld xmm2,xmm2,2
.L1:; calculate index with SIMD, based on [(num[rdx]>> (ecx << 2))&0xf]
vmovdqu ymm1,[rdi+rdx*4]
;vpsrlvd ymm3,ymm1,ymm2 ; heh, finally got why vector shift
; is usefull
vpsrld ymm3,ymm1,xmm2
vpand ymm3,ymm3,yword[mask]
mov r10,8
vmovdqa ymm15,ymm3
vmovdqa ymm14,ymm1
.L22: ; this is scatter to buckets, according to dword index
; this can be unrolled, but
; I don;t know by how much performance would be increased;
vmovd esi, xmm3
vmovd r11d,xmm1
mov r8,[rbp-(16+16*8)+rsi*8] ; get bucket
vpsrldq xmm3,xmm3,4
mov r9,[rbp-(16+16*8+16*8)+rsi*8] ; get index into backet
inc qword[rbp-(16+16*8+16*8)+rsi*8] ; update index
vpsrldq xmm1,xmm1,4
mov [r8+r9*4],r11d
dec r10
cmp r10,4 ; unfortunatelly vpsrldq will not shift
; upper 128 bits into lower
je .adjust
.L44:
inc rdx
cmp rdx,[rbp-16]
jge .L33
test r10,r10
jnz .L22
jmp .L1
.adjust:
vextracti128 xmm3,ymm15,1
vextracti128 xmm1,ymm14,1
jmp .L44
.L33: ; gather buckets (one after another) into input array
xor rax,rax
lea r8,[rbp-(16+16*8+16*8)]
lea r9,[rbp-(16+16*8)]
.L2:
mov rsi,[r9]
mov r10,[r8]
test r10,r10
jz .L5
.L3:
.L4:
cmp r10,4
jl .one
vmovdqa xmm15,[rsi]
vmovdqu [rdi],xmm15
add rsi,16
add rdi,16
sub r10,4
jnz .L4
jmp .L5
.one:
mov r11d,[rsi]
mov [rdi],r11d
add rsi,4
add rdi,4
dec r10
jnz .one
.L5:
add r8,8
add r9,8
inc rax
cmp rax,16
jnz .L2
vmovdqu [rbp-(16+16*8+16*8)],ymm0
vmovdqu [rbp-(16+16*8+12*8)],ymm0
vmovdqu [rbp-(16+16*8+8*8)],ymm0
vmovdqu [rbp-(16+16*8+4*8)],ymm0
inc rcx
cmp rcx,8 ; 2*(sizeof(int))
jl .L0
.exit:
xor rbx,rbx
.L11:
mov rdi,[rbp-(16+16*8)+rbx*8]
call [_free]
.next:
inc rbx
cmp rbx,16
jnz .L11
.fexit:
mov rsp,rbp
pop rbp
pop rbx
ret
align 16
; rdi array,rsi size
radix_sort_avx2:
push rbx
push rbp
mov rbp,rsp
sub rsp,16+16*8+16*8
mov [rbp-8],rdi
mov [rbp-16],rsi
xor rbx,rbx
vpxor ymm0,ymm0,ymm0
vmovdqu [rbp-(16+16*8)],ymm0
vmovdqu [rbp-(16+12*8)],ymm0
vmovdqu [rbp-(16+8*8)],ymm0
vmovdqu [rbp-(16+4*8)],ymm0
.L00: ; allocate 16 buckets
lea rdi,[rbp-(16+16*8)+rbx*8] ; destination
mov rdx,[rbp-16]
mov rsi,16
mov rax,16*16384
cmp rdx,rax ; if array is bigger then L2 cache
cmovg rsi,rax ; alignment, L2 cache size
lea rdx,[rdx*4] ; size
call [_posix_memalign] ; huge malloc is actually
; never allocated
; untill page is touched
test rax,rax
jnz .fexit
inc rbx
cmp rbx,16
jnz .L00
vpxor ymm0,ymm0,ymm0
vmovdqu [rbp-(16+16*8+16*8)],ymm0
vmovdqu [rbp-(16+16*8+12*8)],ymm0
vmovdqu [rbp-(16+16*8+8*8)],ymm0
vmovdqu [rbp-(16+16*8+4*8)],ymm0
xor rcx,rcx
.L0:
xor rdx,rdx
vmovd xmm2,ecx
; vpbroadcastd ymm2,xmm1
mov rdi,[rbp-8]
vpslld xmm2,xmm2,2
.L1:; calculate index with SIMD, based on [(num[rdx]>> (ecx << 2))&0xf]
vmovdqu ymm1,[rdi+rdx*4]
;vpsrlvd ymm3,ymm1,ymm2 ; heh, finally got why vector shift
; is usefull
vpsrld ymm3,ymm1,xmm2
vpand ymm3,ymm3,yword[mask]
mov r10,8
vmovdqa ymm15,ymm3
vmovdqa ymm14,ymm1
.L22: ; this is scatter to buckets, according to dword index
; this can be unrolled, but
; I don;t know by how much performance would be increased;
vmovd esi, xmm3
vmovd r11d,xmm1
mov r8,[rbp-(16+16*8)+rsi*8] ; get bucket
vpsrldq xmm3,xmm3,4
mov r9,[rbp-(16+16*8+16*8)+rsi*8] ; get index into backet
inc qword[rbp-(16+16*8+16*8)+rsi*8] ; update index
vpsrldq xmm1,xmm1,4
mov [r8+r9*4],r11d
dec r10
cmp r10,4 ; unfortunatelly vpsrldq will not shift
; upper 128 bits into lower
je .adjust
.L44:
inc rdx
cmp rdx,[rbp-16]
jge .L33
test r10,r10
jnz .L22
jmp .L1
.adjust:
vextracti128 xmm3,ymm15,1
vextracti128 xmm1,ymm14,1
jmp .L44
.L33: ; gather buckets (one after another) into input array
xor rax,rax
lea r8,[rbp-(16+16*8+16*8)]
lea r9,[rbp-(16+16*8)]
.L2:
mov rsi,[r9]
mov r10,[r8]
test r10,r10
jz .L5
.L3:
.L4:
cmp r10,4
jl .one
vmovdqa xmm15,[rsi]
vmovdqu [rdi],xmm15
add rsi,16
add rdi,16
sub r10,4
jnz .L4
jmp .L5
.one:
mov r11d,[rsi]
mov [rdi],r11d
add rsi,4
add rdi,4
dec r10
jnz .one
.L5:
add r8,8
add r9,8
inc rax
cmp rax,16
jnz .L2
vmovdqu [rbp-(16+16*8+16*8)],ymm0
vmovdqu [rbp-(16+16*8+12*8)],ymm0
vmovdqu [rbp-(16+16*8+8*8)],ymm0
vmovdqu [rbp-(16+16*8+4*8)],ymm0
inc rcx
cmp rcx,8 ; 2*(sizeof(int))
jl .L0
.exit:
xor rbx,rbx
.L11:
mov rdi,[rbp-(16+16*8)+rbx*8]
call [_free]
.next:
inc rbx
cmp rbx,16
jnz .L11
.fexit:
mov rsp,rbp
pop rbp
pop rbx
ret
rezultat:
Code:
~/.../examples/sort >>> ./list_sort 1000000
seed: 1538104832
N: 1000000
unsorted
0x103ae5e30 919579
0x103ae5e10 276314
0x103ae5df0 297690
0x103ae5dd0 126247
0x103ae5db0 560392
0x103ae5d90 278668
0x103ae5d70 893887
0x103ae5d50 726189
0x103ae5d30 563301
0x103ae5d10 847808
0x103ae5cf0 32017
0x103ae5cd0 497905
0x103ae5cb0 967740
0x103ae5c90 123023
0x103ae5c70 833748
0x103ae5c50 291432
array radix elapsed 0.027579 seconds
list radix elapsed 0.967346 seconds
list merge elapsed 0.377042 seconds
equal
equal
sorted
0x10392a0d0 0
0x10235cb30 1
0x102d04750 1
0x1027855d0 2
0x1033f1d30 2
0x101ca6030 6
0x101cd7c70 6
0x1030edf10 7
0x1025cf310 9
0x10203e050 10
0x1036bfa50 10
0x1032abf30 11
0x1020d5a30 12
0x1035ae550 12
0x1030fb490 13
0x102ad9490 14
size of node 12, length 1000000
~/.../examples/sort >>> ./list_sort 1000000
seed: 1538104832
N: 1000000
unsorted
0x103ae5e30 919579
0x103ae5e10 276314
0x103ae5df0 297690
0x103ae5dd0 126247
0x103ae5db0 560392
0x103ae5d90 278668
0x103ae5d70 893887
0x103ae5d50 726189
0x103ae5d30 563301
0x103ae5d10 847808
0x103ae5cf0 32017
0x103ae5cd0 497905
0x103ae5cb0 967740
0x103ae5c90 123023
0x103ae5c70 833748
0x103ae5c50 291432
array radix elapsed 0.027579 seconds
list radix elapsed 0.967346 seconds
list merge elapsed 0.377042 seconds
equal
equal
sorted
0x10392a0d0 0
0x10235cb30 1
0x102d04750 1
0x1027855d0 2
0x1033f1d30 2
0x101ca6030 6
0x101cd7c70 6
0x1030edf10 7
0x1025cf310 9
0x10203e050 10
0x1036bfa50 10
0x1032abf30 11
0x1020d5a30 12
0x1035ae550 12
0x1030fb490 13
0x102ad9490 14
size of node 12, length 1000000
sic! merge sort liste je 2 puta brzi od radix sorta liste, zahvaljujuci boljem cache lokalitiju.
E sad radix sort niza je ubedljivi pobednik. Dakle rad sa listama je drasticno sporiji od nizova bez obzira
sto algoritmi imaju slicnu kompleksnost zahvaljujuci tome sto je ram neverovatno spor i cache je svje i svja.