PR target/40838: 32-bit x86 incoming stack boundary for SSE stack data.

ix86_minimum_alignment now records, per function, whether an SSE vector
(or an array given 128-bit user alignment while -ftree-vectorize is on)
is placed on the stack.  When that flag is set and the user gave no
explicit incoming stack boundary, ix86_update_stack_boundary uses
STACK_BOUNDARY instead of ix86_default_incoming_stack_boundary.

The new tests scan the assembly for "andl $-16, %esp": incoming-6,
incoming-7 and incoming-8 expect it, incoming-9 (-mno-sse) expects it
to be absent.

All file headers use the same a/gcc/ prefix so one strip level applies
the whole patch.

NOTE(review): this copy was rebuilt from a whitespace-collapsed source,
so tabs and double spaces in context lines may differ from the tree.
Apply with "patch -p1 -l" or regenerate against the target revision.

--- a/gcc/testsuite/gcc.target/i386/incoming-7.c	(revision 0)
+++ a/gcc/testsuite/gcc.target/i386/incoming-7.c	(revision 0)
@@ -0,0 +1,16 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -O2 -msse2 -mpreferred-stack-boundary=4" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+extern v4si y(v4si, v4si, v4si, v4si, v4si);
+
+extern v4si s1, s2;
+
+v4si x(void)
+{
+  return y(s1, s2, s1, s2, s2);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
--- a/gcc/testsuite/gcc.target/i386/incoming-9.c	(revision 0)
+++ a/gcc/testsuite/gcc.target/i386/incoming-9.c	(revision 0)
@@ -0,0 +1,18 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -O3 -mno-sse -mpreferred-stack-boundary=4" } */
+
+float
+foo (float f)
+{
+  float array[128];
+  float x;
+  int i;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    array[i] = f;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    x += array[i];
+  return x;
+}
+
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
--- a/gcc/testsuite/gcc.target/i386/incoming-6.c	(revision 0)
+++ a/gcc/testsuite/gcc.target/i386/incoming-6.c	(revision 0)
@@ -0,0 +1,17 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -O2 -msse2 -mpreferred-stack-boundary=4" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+extern v4si y(v4si *s3);
+
+extern v4si s1, s2;
+
+v4si x(void)
+{
+  v4si s3 = s1 + s2;
+  return y(&s3);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
--- a/gcc/testsuite/gcc.target/i386/incoming-8.c	(revision 0)
+++ a/gcc/testsuite/gcc.target/i386/incoming-8.c	(revision 0)
@@ -0,0 +1,18 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -O3 -msse2 -mpreferred-stack-boundary=4" } */
+
+float
+foo (float f)
+{
+  float array[128];
+  float x;
+  int i;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    array[i] = f;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    x += array[i];
+  return x;
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
--- a/gcc/config/i386/i386.h	(revision 150821)
+++ a/gcc/config/i386/i386.h	(working copy)
@@ -2404,6 +2404,8 @@ struct machine_function GTY(())
   /* This value is used for amd64 targets and specifies the current abi
      to be used. MS_ABI means ms abi. Otherwise SYSV_ABI means sysv abi. */
   int call_abi;
+  /* Use STACK_BOUNDARY for incoming stack boundary. */
+  int use_stack_boundary_for_incoming_stack_boundary;
 };
 
 #define ix86_stack_locals (cfun->machine->stack_locals)
--- a/gcc/config/i386/i386.c	(revision 150821)
+++ a/gcc/config/i386/i386.c	(working copy)
@@ -8038,11 +8038,19 @@ find_drap_reg (void)
 static void
 ix86_update_stack_boundary (void)
 {
+  /* Should we use STACK_BOUNDARY for incoming stack boundary? */
+  unsigned int incoming_stack_boundary;
+
+  if (cfun->machine->use_stack_boundary_for_incoming_stack_boundary)
+    incoming_stack_boundary = STACK_BOUNDARY;
+  else
+    incoming_stack_boundary = ix86_default_incoming_stack_boundary;
+
   /* Prefer the one specified at command line. */
   ix86_incoming_stack_boundary
     = (ix86_user_incoming_stack_boundary
        ? ix86_user_incoming_stack_boundary
-       : ix86_default_incoming_stack_boundary);
+       : incoming_stack_boundary);
 
   /* Incoming stack alignment can be changed on individual functions
      via force_align_arg_pointer attribute. We use the smallest
@@ -19560,6 +19568,10 @@ ix86_local_alignment (tree exp, enum mac
   return align;
 }
 
+#define VALID_SSE_VECTOR_MODE(MODE) \
+  ((MODE) == V4SFmode || (MODE) == V4SImode || (MODE) == V2DFmode \
+   || (MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DImode)
+
 /* Compute the minimum required alignment for dynamic stack realignment
    purposes for a local variable, parameter or a stack slot. EXP is
    the data type or decl itself, MODE is its mode and ALIGN is the
@@ -19571,7 +19583,7 @@ ix86_minimum_alignment (tree exp, enum m
 {
   tree type, decl;
 
-  if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
+  if (TARGET_64BIT)
     return align;
 
   if (exp && DECL_P (exp))
@@ -19585,6 +19597,23 @@ ix86_minimum_alignment (tree exp, enum m
       decl = NULL;
     }
 
+  /* In 32bit, use STACK_BOUNDARY for incoming stack boundary if any
+     SSE variables are put on stack. Since gcc 4.4 may not generate
+     local vector variables for vectorizer, we also check array with
+     128bit user alignment set by vectorizer. */
+  if (VALID_SSE_VECTOR_MODE (mode)
+      || (type
+          && (VALID_SSE_VECTOR_MODE (TYPE_MODE (type))
+              || (flag_tree_vectorize != 0
+                  && TREE_CODE (type) == ARRAY_TYPE
+                  && decl
+                  && DECL_USER_ALIGN (decl)
+                  && DECL_ALIGN (decl) == 128))))
+    cfun->machine->use_stack_boundary_for_incoming_stack_boundary = 1;
+
+  if (align != 64 || ix86_preferred_stack_boundary >= 64)
+    return align;
+
   /* Don't do dynamic stack realignment for long long objects with
      -mpreferred-stack-boundary=2. */
   if ((mode == DImode || (type && TYPE_MODE (type) == DImode))