File: build/gcc/tree-ssa-loop-prefetch.cc
Warning: line 743, column 38: Division by zero
1 | /* Array prefetching. | |||
2 | Copyright (C) 2005-2023 Free Software Foundation, Inc. | |||
3 | ||||
4 | This file is part of GCC. | |||
5 | ||||
6 | GCC is free software; you can redistribute it and/or modify it | |||
7 | under the terms of the GNU General Public License as published by the | |||
8 | Free Software Foundation; either version 3, or (at your option) any | |||
9 | later version. | |||
10 | ||||
11 | GCC is distributed in the hope that it will be useful, but WITHOUT | |||
12 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |||
13 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |||
14 | for more details. | |||
15 | ||||
16 | You should have received a copy of the GNU General Public License | |||
17 | along with GCC; see the file COPYING3. If not see | |||
18 | <http://www.gnu.org/licenses/>. */ | |||
19 | ||||
20 | #include "config.h" | |||
21 | #include "system.h" | |||
22 | #include "coretypes.h" | |||
23 | #include "backend.h" | |||
24 | #include "target.h" | |||
25 | #include "rtl.h" | |||
26 | #include "tree.h" | |||
27 | #include "gimple.h" | |||
28 | #include "predict.h" | |||
29 | #include "tree-pass.h" | |||
30 | #include "gimple-ssa.h" | |||
31 | #include "optabs-query.h" | |||
32 | #include "tree-pretty-print.h" | |||
33 | #include "fold-const.h" | |||
34 | #include "stor-layout.h" | |||
35 | #include "gimplify.h" | |||
36 | #include "gimple-iterator.h" | |||
37 | #include "gimplify-me.h" | |||
38 | #include "tree-ssa-loop-ivopts.h" | |||
39 | #include "tree-ssa-loop-manip.h" | |||
40 | #include "tree-ssa-loop-niter.h" | |||
41 | #include "tree-ssa-loop.h" | |||
42 | #include "ssa.h" | |||
43 | #include "tree-into-ssa.h" | |||
44 | #include "cfgloop.h" | |||
45 | #include "tree-scalar-evolution.h" | |||
46 | #include "langhooks.h" | |||
47 | #include "tree-inline.h" | |||
48 | #include "tree-data-ref.h" | |||
49 | #include "diagnostic-core.h" | |||
50 | #include "dbgcnt.h" | |||
51 | ||||
52 | /* This pass inserts prefetch instructions to optimize cache usage during | |||
53 | accesses to arrays in loops. It processes loops sequentially and: | |||
54 | ||||
55 | 1) Gathers all memory references in the single loop. | |||
56 | 2) For each of the references it decides when it is profitable to prefetch | |||
57 | it. To do it, we evaluate the reuse among the accesses, and determine
58 | two values: PREFETCH_BEFORE (meaning that it only makes sense to do | |||
59 | prefetching in the first PREFETCH_BEFORE iterations of the loop) and | |||
60 | PREFETCH_MOD (meaning that it only makes sense to prefetch in the | |||
61 | iterations of the loop that are zero modulo PREFETCH_MOD). For example | |||
62 | (assuming cache line size is 64 bytes, char has size 1 byte and there | |||
63 | is no hardware sequential prefetch): | |||
64 | ||||
65 | char *a; | |||
66 | for (i = 0; i < max; i++) | |||
67 | { | |||
68 | a[255] = ...; (0) | |||
69 | a[i] = ...; (1) | |||
70 | a[i + 64] = ...; (2) | |||
71 | a[16*i] = ...; (3) | |||
72 | a[187*i] = ...; (4) | |||
73 | a[187*i + 50] = ...; (5) | |||
74 | } | |||
75 | ||||
76 | (0) obviously has PREFETCH_BEFORE 1 | |||
77 | (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory | |||
78 | location 64 iterations before it, and PREFETCH_MOD 64 (since | |||
79 | it hits the same cache line otherwise). | |||
80 | (2) has PREFETCH_MOD 64 | |||
81 | (3) has PREFETCH_MOD 4 | |||
82 | (4) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since | |||
83 | the cache line accessed by (5) is the same with probability only | |||
84 | 7/32. | |||
85 | (5) has PREFETCH_MOD 1 as well. | |||
86 | ||||
87 | Additionally, we use data dependence analysis to determine for each | |||
88 | reference the distance till the first reuse; this information is used | |||
89 | to determine the temporality of the issued prefetch instruction. | |||
90 | ||||
91 | 3) We determine how much ahead we need to prefetch. The number of | |||
92 | iterations needed is time to fetch / time spent in one iteration of | |||
93 | the loop. The problem is that we do not know either of these values, | |||
94 | so we just make a heuristic guess based on a magic (possibly) | |||
95 | target-specific constant and size of the loop. | |||
96 | ||||
97 | 4) Determine which of the references we prefetch. We take into account | |||
98 | that there is a maximum number of simultaneous prefetches (provided | |||
99 | by machine description). We prefetch as many prefetches as possible | |||
100 | while still within this bound (starting with those with lowest | |||
101 | prefetch_mod, since they are responsible for most of the cache | |||
102 | misses). | |||
103 | ||||
104 | 5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD | |||
105 | and PREFETCH_BEFORE requirements (within some bounds), and to avoid | |||
106 | prefetching nonaccessed memory. | |||
107 | TODO -- actually implement peeling. | |||
108 | ||||
109 | 6) We actually emit the prefetch instructions. ??? Perhaps emit the | |||
110 | prefetch instructions with guards in cases where 5) was not sufficient | |||
111 | to satisfy the constraints? | |||
112 | ||||
113 | A cost model is implemented to determine whether or not prefetching is | |||
114 | profitable for a given loop. The cost model has three heuristics: | |||
115 | ||||
116 | 1. Function trip_count_to_ahead_ratio_too_small_p implements a | |||
117 | heuristic that determines whether or not the loop has too few | |||
118 | iterations (compared to ahead). Prefetching is not likely to be | |||
119 | beneficial if the trip count to ahead ratio is below a certain | |||
120 | minimum. | |||
121 | ||||
122 | 2. Function mem_ref_count_reasonable_p implements a heuristic that | |||
123 | determines whether the given loop has enough CPU ops that can be | |||
124 | overlapped with cache missing memory ops. If not, the loop | |||
125 | won't benefit from prefetching. In the implementation, | |||
126 | prefetching is not considered beneficial if the ratio between | |||
127 | the instruction count and the mem ref count is below a certain | |||
128 | minimum. | |||
129 | ||||
130 | 3. Function insn_to_prefetch_ratio_too_small_p implements a | |||
131 | heuristic that disables prefetching in a loop if the prefetching | |||
132 | cost is above a certain limit. The relative prefetching cost is | |||
133 | estimated by taking the ratio between the prefetch count and the | |||
134 | total instruction count (this models the I-cache cost).
135 | ||||
136 | The limits used in these heuristics are defined as parameters with | |||
137 | reasonable default values. Machine-specific default values will be | |||
138 | added later. | |||
139 | ||||
140 | Some other TODO: | |||
141 | -- write and use more general reuse analysis (that could be also used | |||
142 | in other cache aimed loop optimizations) | |||
143 | -- make it behave sanely together with the prefetches given by user | |||
144 | (now we just ignore them; at the very least we should avoid | |||
145 | optimizing loops in that user put his own prefetches) | |||
146 | -- we assume cache line size alignment of arrays; this could be | |||
147 | improved. */ | |||
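
The PREFETCH_MOD values quoted in the example above follow directly from the
cache line size divided by the access stride. A minimal standalone sketch (not
part of tree-ssa-loop-prefetch.cc) that reproduces those numbers:

/* Illustrative only: PREFETCH_MOD for the accesses (1)-(4) above,
   assuming a 64-byte cache line and 1-byte elements.  */
#include <stdio.h>

int
main (void)
{
  const long cache_line = 64;
  const long steps[] = { 1, 16, 187 };   /* byte stride per iteration */
  for (unsigned i = 0; i < 3; i++)
    {
      long step = steps[i];
      /* A stride larger than the line touches a new line every iteration;
         otherwise only every (line / stride)-th iteration does.  */
      long prefetch_mod = step > cache_line ? 1 : cache_line / step;
      printf ("stride %ld -> PREFETCH_MOD %ld\n", step, prefetch_mod);
    }
  return 0;   /* prints 64, 4 and 1, matching (1)/(2), (3) and (4).  */
}
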
148 | ||||
149 | /* Magic constants follow. These should be replaced by machine specific | |||
150 | numbers. */ | |||
151 | ||||
152 | /* True if write can be prefetched by a read prefetch. */ | |||
153 | ||||
154 | #ifndef WRITE_CAN_USE_READ_PREFETCH
155 | #define WRITE_CAN_USE_READ_PREFETCH 1
156 | #endif
157 | ||||
158 | /* True if read can be prefetched by a write prefetch. */ | |||
159 | ||||
160 | #ifndef READ_CAN_USE_WRITE_PREFETCH
161 | #define READ_CAN_USE_WRITE_PREFETCH 0
162 | #endif
163 | ||||
164 | /* The size of the block loaded by a single prefetch. Usually, this is | |||
165 | the same as cache line size (at the moment, we only consider one level | |||
166 | of cache hierarchy). */ | |||
167 | ||||
168 | #ifndef PREFETCH_BLOCK
169 | #define PREFETCH_BLOCK param_l1_cache_line_size
170 | #endif
171 | ||||
172 | /* Do we have a forward hardware sequential prefetching? */ | |||
173 | ||||
174 | #ifndef HAVE_FORWARD_PREFETCH
175 | #define HAVE_FORWARD_PREFETCH 0
176 | #endif
177 |
178 | /* Do we have a backward hardware sequential prefetching? */
179 |
180 | #ifndef HAVE_BACKWARD_PREFETCH
181 | #define HAVE_BACKWARD_PREFETCH 0
182 | #endif
183 | ||||
184 | /* In some cases we are only able to determine that there is a certain | |||
185 | probability that the two accesses hit the same cache line. In this | |||
186 | case, we issue the prefetches for both of them if this probability | |||
187 | is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand. */
188 | ||||
189 | #ifndef ACCEPTABLE_MISS_RATE
190 | #define ACCEPTABLE_MISS_RATE 50
191 | #endif
192 |
193 | #define L1_CACHE_SIZE_BYTES ((unsigned) (param_l1_cache_size * 1024))
194 | #define L2_CACHE_SIZE_BYTES ((unsigned) (param_l2_cache_size * 1024))
195 | ||||
196 | /* We consider a memory access nontemporal if it is not reused sooner than | |||
197 | after L2_CACHE_SIZE_BYTES of memory are accessed. However, we ignore | |||
198 | accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION, | |||
199 | so that we use nontemporal prefetches e.g. if single memory location | |||
200 | is accessed several times in a single iteration of the loop. */ | |||
201 | #define NONTEMPORAL_FRACTION 16
202 | ||||
203 | /* In case we have to emit a memory fence instruction after the loop that | |||
204 | uses nontemporal stores, this defines the builtin to use. */ | |||
205 | ||||
206 | #ifndef FENCE_FOLLOWING_MOVNT
207 | #define FENCE_FOLLOWING_MOVNT NULL_TREE
208 | #endif
209 | ||||
210 | /* It is not profitable to prefetch when the trip count is not at | |||
211 | least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance. | |||
212 | For example, in a loop with a prefetch ahead distance of 10, | |||
213 | supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is | |||
214 | profitable to prefetch when the trip count is greater or equal to | |||
215 | 40. In that case, 30 out of the 40 iterations will benefit from | |||
216 | prefetching. */ | |||
217 | ||||
218 | #ifndef TRIP_COUNT_TO_AHEAD_RATIO
219 | #define TRIP_COUNT_TO_AHEAD_RATIO 4
220 | #endif
221 | ||||
222 | /* The group of references between that reuse may occur. */ | |||
223 | ||||
224 | struct mem_ref_group | |||
225 | { | |||
226 | tree base; /* Base of the reference. */ | |||
227 | tree step; /* Step of the reference. */ | |||
228 | struct mem_ref *refs; /* References in the group. */ | |||
229 | struct mem_ref_group *next; /* Next group of references. */ | |||
230 | unsigned int uid; /* Group UID, used only for debugging. */ | |||
231 | }; | |||
232 | ||||
233 | /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */ | |||
234 | ||||
235 | #define PREFETCH_ALL HOST_WIDE_INT_M1U
236 | ||||
237 | /* Do not generate a prefetch if the unroll factor is significantly less | |||
238 | than what is required by the prefetch. This is to avoid redundant | |||
239 | prefetches. For example, when prefetch_mod is 16 and unroll_factor is | |||
240 | 2, prefetching requires unrolling the loop 16 times, but | |||
241 | the loop is actually unrolled twice. In this case (ratio = 8), | |||
242 | prefetching is not likely to be beneficial. */ | |||
243 | ||||
244 | #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
245 | #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
246 | #endif
247 | ||||
248 | /* Some of the prefetch computations have quadratic complexity. We want to | |||
249 | avoid huge compile times and, therefore, want to limit the amount of | |||
250 | memory references per loop where we consider prefetching. */ | |||
251 | ||||
252 | #ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
253 | #define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
254 | #endif
255 | ||||
256 | /* The memory reference. */ | |||
257 | ||||
258 | struct mem_ref | |||
259 | { | |||
260 | gimple *stmt; /* Statement in that the reference appears. */ | |||
261 | tree mem; /* The reference. */ | |||
262 | HOST_WIDE_INT delta; /* Constant offset of the reference. */
263 | struct mem_ref_group *group; /* The group of references it belongs to. */
264 | unsigned HOST_WIDE_INT prefetch_mod;
265 | /* Prefetch only each PREFETCH_MOD-th
266 | iteration. */
267 | unsigned HOST_WIDE_INT prefetch_before;
268 | /* Prefetch only first PREFETCH_BEFORE | |||
269 | iterations. */ | |||
270 | unsigned reuse_distance; /* The amount of data accessed before the first | |||
271 | reuse of this value. */ | |||
272 | struct mem_ref *next; /* The next reference in the group. */ | |||
273 | unsigned int uid; /* Ref UID, used only for debugging. */ | |||
274 | unsigned write_p : 1; /* Is it a write? */ | |||
275 | unsigned independent_p : 1; /* True if the reference is independent on | |||
276 | all other references inside the loop. */ | |||
277 | unsigned issue_prefetch_p : 1; /* Should we really issue the prefetch? */ | |||
278 | unsigned storent_p : 1; /* True if we changed the store to a | |||
279 | nontemporal one. */ | |||
280 | }; | |||
281 | ||||
282 | /* Dumps information about memory reference */ | |||
283 | static void | |||
284 | dump_mem_details (FILE *file, tree base, tree step, | |||
285 | HOST_WIDE_INT delta, bool write_p)
286 | { | |||
287 | fprintf (file, "(base "); | |||
288 | print_generic_expr (file, base, TDF_SLIM); | |||
289 | fprintf (file, ", step "); | |||
290 | if (cst_and_fits_in_hwi (step)) | |||
291 | fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (step));
292 | else
293 | print_generic_expr (file, step, TDF_SLIM);
294 | fprintf (file, ")\n");
295 | fprintf (file, " delta " HOST_WIDE_INT_PRINT_DEC "\n", delta);
296 | fprintf (file, " %s\n\n", write_p ? "write" : "read"); | |||
297 | } | |||
298 | ||||
299 | /* Dumps information about reference REF to FILE. */ | |||
300 | ||||
301 | static void | |||
302 | dump_mem_ref (FILE *file, struct mem_ref *ref) | |||
303 | { | |||
304 | fprintf (file, "reference %u:%u (", ref->group->uid, ref->uid); | |||
305 | print_generic_expr (file, ref->mem, TDF_SLIM); | |||
306 | fprintf (file, ")\n"); | |||
307 | } | |||
308 | ||||
309 | /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not | |||
310 | exist. */ | |||
311 | ||||
312 | static struct mem_ref_group * | |||
313 | find_or_create_group (struct mem_ref_group **groups, tree base, tree step) | |||
314 | { | |||
315 | /* Global count for setting struct mem_ref_group->uid. */ | |||
316 | static unsigned int last_mem_ref_group_uid = 0; | |||
317 | ||||
318 | struct mem_ref_group *group; | |||
319 | ||||
320 | for (; *groups; groups = &(*groups)->next) | |||
321 | { | |||
322 | if (operand_equal_p ((*groups)->step, step, 0) | |||
323 | && operand_equal_p ((*groups)->base, base, 0)) | |||
324 | return *groups; | |||
325 | ||||
326 | /* If step is an integer constant, keep the list of groups sorted | |||
327 | by decreasing step. */ | |||
328 | if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step) | |||
329 | && int_cst_value ((*groups)->step) < int_cst_value (step)) | |||
330 | break; | |||
331 | } | |||
332 | ||||
333 | group = XNEW (struct mem_ref_group);
334 | group->base = base;
335 | group->step = step;
336 | group->refs = NULL;
337 | group->uid = ++last_mem_ref_group_uid; | |||
338 | group->next = *groups; | |||
339 | *groups = group; | |||
340 | ||||
341 | return group; | |||
342 | } | |||
343 | ||||
344 | /* Records a memory reference MEM in GROUP with offset DELTA and write status | |||
345 | WRITE_P. The reference occurs in statement STMT. */ | |||
346 | ||||
347 | static void | |||
348 | record_ref (struct mem_ref_group *group, gimple *stmt, tree mem, | |||
349 | HOST_WIDE_INT delta, bool write_p)
350 | { | |||
351 | unsigned int last_mem_ref_uid = 0; | |||
352 | struct mem_ref **aref; | |||
353 | ||||
354 | /* Do not record the same address twice. */ | |||
355 | for (aref = &group->refs; *aref; aref = &(*aref)->next) | |||
356 | { | |||
357 | last_mem_ref_uid = (*aref)->uid; | |||
358 | ||||
359 | /* It does not have to be possible for write reference to reuse the read | |||
360 | prefetch, or vice versa. */ | |||
361 | if (!WRITE_CAN_USE_READ_PREFETCH
362 | && write_p
363 | && !(*aref)->write_p)
364 | continue;
365 | if (!READ_CAN_USE_WRITE_PREFETCH
366 | && !write_p | |||
367 | && (*aref)->write_p) | |||
368 | continue; | |||
369 | ||||
370 | if ((*aref)->delta == delta) | |||
371 | return; | |||
372 | } | |||
373 | ||||
374 | (*aref) = XNEW (struct mem_ref);
375 | (*aref)->stmt = stmt;
376 | (*aref)->mem = mem;
377 | (*aref)->delta = delta;
378 | (*aref)->write_p = write_p;
379 | (*aref)->prefetch_before = PREFETCH_ALL;
380 | (*aref)->prefetch_mod = 1;
381 | (*aref)->reuse_distance = 0;
382 | (*aref)->issue_prefetch_p = false;
383 | (*aref)->group = group;
384 | (*aref)->next = NULL;
385 | (*aref)->independent_p = false; | |||
386 | (*aref)->storent_p = false; | |||
387 | (*aref)->uid = last_mem_ref_uid + 1; | |||
388 | ||||
389 | if (dump_file && (dump_flags & TDF_DETAILS)) | |||
390 | { | |||
391 | dump_mem_ref (dump_file, *aref); | |||
392 | ||||
393 | fprintf (dump_file, " group %u ", group->uid); | |||
394 | dump_mem_details (dump_file, group->base, group->step, delta, | |||
395 | write_p); | |||
396 | } | |||
397 | } | |||
398 | ||||
399 | /* Release memory references in GROUPS. */ | |||
400 | ||||
401 | static void | |||
402 | release_mem_refs (struct mem_ref_group *groups) | |||
403 | { | |||
404 | struct mem_ref_group *next_g; | |||
405 | struct mem_ref *ref, *next_r; | |||
406 | ||||
407 | for (; groups; groups = next_g) | |||
408 | { | |||
409 | next_g = groups->next; | |||
410 | for (ref = groups->refs; ref; ref = next_r) | |||
411 | { | |||
412 | next_r = ref->next; | |||
413 | free (ref); | |||
414 | } | |||
415 | free (groups); | |||
416 | } | |||
417 | } | |||
418 | ||||
419 | /* A structure used to pass arguments to idx_analyze_ref. */ | |||
420 | ||||
421 | struct ar_data | |||
422 | { | |||
423 | class loop *loop; /* Loop of the reference. */ | |||
424 | gimple *stmt; /* Statement of the reference. */ | |||
425 | tree *step; /* Step of the memory reference. */ | |||
426 | HOST_WIDE_INT *delta; /* Offset of the memory reference. */
427 | }; | |||
428 | ||||
429 | /* Analyzes a single INDEX of a memory reference to obtain information | |||
430 | described at analyze_ref. Callback for for_each_index. */ | |||
431 | ||||
432 | static bool | |||
433 | idx_analyze_ref (tree base, tree *index, void *data) | |||
434 | { | |||
435 | struct ar_data *ar_data = (struct ar_data *) data; | |||
436 | tree ibase, step, stepsize; | |||
437 | HOST_WIDE_INT idelta = 0, imult = 1;
438 | affine_iv iv; | |||
439 | ||||
440 | if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt), | |||
441 | *index, &iv, true)) | |||
442 | return false; | |||
443 | ibase = iv.base; | |||
444 | step = iv.step; | |||
445 | ||||
446 | if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
447 | && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
448 | {
449 | idelta = int_cst_value (TREE_OPERAND (ibase, 1));
450 | ibase = TREE_OPERAND (ibase, 0);
451 | }
452 | if (cst_and_fits_in_hwi (ibase))
453 | {
454 | idelta += int_cst_value (ibase);
455 | ibase = build_int_cst (TREE_TYPE (ibase), 0);
456 | } | |||
457 | ||||
458 | if (TREE_CODE (base) == ARRAY_REF)
459 | {
460 | stepsize = array_ref_element_size (base);
461 | if (!cst_and_fits_in_hwi (stepsize))
462 | return false;
463 | imult = int_cst_value (stepsize);
464 | step = fold_build2 (MULT_EXPR, sizetype,
465 | fold_convert (sizetype, step),
466 | fold_convert (sizetype, stepsize));
467 | idelta *= imult; | |||
468 | } | |||
469 | ||||
470 | if (*ar_data->step == NULL_TREE)
471 | *ar_data->step = step;
472 | else
473 | *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
474 | fold_convert (sizetype, *ar_data->step),
475 | fold_convert (sizetype, step));
476 | *ar_data->delta += idelta; | |||
477 | *index = ibase; | |||
478 | ||||
479 | return true; | |||
480 | } | |||
481 | ||||
482 | /* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and | |||
483 | STEP are integer constants and iter is number of iterations of LOOP. The | |||
484 | reference occurs in statement STMT. Strips nonaddressable component | |||
485 | references from REF_P. */ | |||
486 | ||||
487 | static bool | |||
488 | analyze_ref (class loop *loop, tree *ref_p, tree *base, | |||
489 | tree *step, HOST_WIDE_INT *delta,
490 | gimple *stmt)
491 | {
492 | struct ar_data ar_data;
493 | tree off;
494 | HOST_WIDE_INT bit_offset;
495 | tree ref = *ref_p;
496 |
497 | *step = NULL_TREE;
498 | *delta = 0; | |||
499 | ||||
500 | /* First strip off the component references. Ignore bitfields. | |||
501 | Also strip off the real and imagine parts of a complex, so that | |||
502 | they can have the same base. */ | |||
503 | if (TREE_CODE (ref) == REALPART_EXPR
504 | || TREE_CODE (ref) == IMAGPART_EXPR
505 | || (TREE_CODE (ref) == COMPONENT_REF
506 | && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
507 | {
508 | if (TREE_CODE (ref) == IMAGPART_EXPR)
509 | *delta += int_size_in_bytes (TREE_TYPE (ref));
510 | ref = TREE_OPERAND (ref, 0);
511 | } | |||
512 | ||||
513 | *ref_p = ref; | |||
514 | ||||
515 | for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
516 | {
517 | off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
518 | bit_offset = TREE_INT_CST_LOW (off);
519 | gcc_assert (bit_offset % BITS_PER_UNIT == 0);
520 |
521 | *delta += bit_offset / BITS_PER_UNIT;
522 | } | |||
523 | ||||
524 | *base = unshare_expr (ref); | |||
525 | ar_data.loop = loop; | |||
526 | ar_data.stmt = stmt; | |||
527 | ar_data.step = step; | |||
528 | ar_data.delta = delta; | |||
529 | return for_each_index (base, idx_analyze_ref, &ar_data); | |||
530 | } | |||
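
As an illustration of the decomposition analyze_ref and idx_analyze_ref perform,
consider a hypothetical field access inside a loop (the struct and the concrete
byte values, which assume a 4-byte int, are not from this file):

/* Illustrative only: a[i].f decomposes into &BASE + STEP * iter + DELTA.  */
struct elt { int x; int f; };          /* 8-byte element, f at byte offset 4 */
struct elt a[100];

void
touch (int n)
{
  for (int i = 0; i < n; i++)
    a[i].f = 0;   /* BASE = a, STEP = sizeof (struct elt) = 8 (the ARRAY_REF
                     index scaled by the element size in idx_analyze_ref),
                     DELTA = 4 (the COMPONENT_REF byte offset accumulated by
                     the loop over COMPONENT_REFs above).  */
}
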
531 | ||||
532 | /* Record a memory reference REF to the list REFS. The reference occurs in | |||
533 | LOOP in statement STMT and it is write if WRITE_P. Returns true if the | |||
534 | reference was recorded, false otherwise. */ | |||
535 | ||||
536 | static bool | |||
537 | gather_memory_references_ref (class loop *loop, struct mem_ref_group **refs, | |||
538 | tree ref, bool write_p, gimple *stmt) | |||
539 | { | |||
540 | tree base, step; | |||
541 | HOST_WIDE_INT delta;
542 | struct mem_ref_group *agrp;
543 |
544 | if (get_base_address (ref) == NULL)
545 | return false;
546 |
547 | if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
548 | return false;
549 | /* If analyze_ref fails the default is a NULL_TREE. We can stop here. */
550 | if (step == NULL_TREE)
551 | return false; | |||
552 | ||||
553 | /* Stop if the address of BASE could not be taken. */ | |||
554 | if (may_be_nonaddressable_p (base)) | |||
555 | return false; | |||
556 | ||||
557 | /* Limit non-constant step prefetching only to the innermost loops and | |||
558 | only when the step is loop invariant in the entire loop nest. */ | |||
559 | if (!cst_and_fits_in_hwi (step)) | |||
560 | { | |||
561 | if (loop->inner != NULLnullptr) | |||
562 | { | |||
563 | if (dump_file && (dump_flags & TDF_DETAILS)) | |||
564 | { | |||
565 | fprintf (dump_file, "Memory expression %p\n",(void *) ref ); | |||
566 | print_generic_expr (dump_file, ref, TDF_SLIM); | |||
567 | fprintf (dump_file,":"); | |||
568 | dump_mem_details (dump_file, base, step, delta, write_p); | |||
569 | fprintf (dump_file, | |||
570 | "Ignoring %p, non-constant step prefetching is " | |||
571 | "limited to inner most loops \n", | |||
572 | (void *) ref); | |||
573 | } | |||
574 | return false; | |||
575 | } | |||
576 | else | |||
577 | { | |||
578 | if (!expr_invariant_in_loop_p (loop_outermost (loop), step)) | |||
579 | { | |||
580 | if (dump_file && (dump_flags & TDF_DETAILS)) | |||
581 | { | |||
582 | fprintf (dump_file, "Memory expression %p\n",(void *) ref ); | |||
583 | print_generic_expr (dump_file, ref, TDF_SLIM); | |||
584 | fprintf (dump_file,":"); | |||
585 | dump_mem_details (dump_file, base, step, delta, write_p); | |||
586 | fprintf (dump_file, | |||
587 | "Not prefetching, ignoring %p due to " | |||
588 | "loop variant step\n", | |||
589 | (void *) ref); | |||
590 | } | |||
591 | return false; | |||
592 | } | |||
593 | } | |||
594 | } | |||
595 | ||||
596 | /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP | |||
597 | are integer constants. */ | |||
598 | agrp = find_or_create_group (refs, base, step); | |||
599 | record_ref (agrp, stmt, ref, delta, write_p); | |||
600 | ||||
601 | return true; | |||
602 | } | |||
603 | ||||
604 | /* Record the suitable memory references in LOOP. NO_OTHER_REFS is set to | |||
605 | true if there are no other memory references inside the loop. */ | |||
606 | ||||
607 | static struct mem_ref_group * | |||
608 | gather_memory_references (class loop *loop, bool *no_other_refs, unsigned *ref_count) | |||
609 | { | |||
610 | basic_block *body = get_loop_body_in_dom_order (loop); | |||
611 | basic_block bb; | |||
612 | unsigned i; | |||
613 | gimple_stmt_iterator bsi; | |||
614 | gimple *stmt; | |||
615 | tree lhs, rhs; | |||
616 | struct mem_ref_group *refs = NULL;
617 | ||||
618 | *no_other_refs = true; | |||
619 | *ref_count = 0; | |||
620 | ||||
621 | /* Scan the loop body in order, so that the former references precede the | |||
622 | later ones. */ | |||
623 | for (i = 0; i < loop->num_nodes; i++) | |||
624 | { | |||
625 | bb = body[i]; | |||
626 | if (bb->loop_father != loop) | |||
627 | continue; | |||
628 | ||||
629 | for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi)) | |||
630 | { | |||
631 | stmt = gsi_stmt (bsi); | |||
632 | ||||
633 | if (gimple_code (stmt) != GIMPLE_ASSIGN) | |||
634 | { | |||
635 | if (gimple_vuse (stmt) | |||
636 | || (is_gimple_call (stmt) | |||
637 | && !(gimple_call_flags (stmt) & ECF_CONST)))
638 | *no_other_refs = false; | |||
639 | continue; | |||
640 | } | |||
641 | ||||
642 | if (! gimple_vuse (stmt)) | |||
643 | continue; | |||
644 | ||||
645 | lhs = gimple_assign_lhs (stmt); | |||
646 | rhs = gimple_assign_rhs1 (stmt); | |||
647 | ||||
648 | if (REFERENCE_CLASS_P (rhs))
649 | {
650 | *no_other_refs &= gather_memory_references_ref (loop, &refs,
651 | rhs, false, stmt);
652 | *ref_count += 1;
653 | }
654 | if (REFERENCE_CLASS_P (lhs))
655 | { | |||
656 | *no_other_refs &= gather_memory_references_ref (loop, &refs, | |||
657 | lhs, true, stmt); | |||
658 | *ref_count += 1; | |||
659 | } | |||
660 | } | |||
661 | } | |||
662 | free (body); | |||
663 | ||||
664 | return refs; | |||
665 | } | |||
666 | ||||
667 | /* Prune the prefetch candidate REF using the self-reuse. */ | |||
668 | ||||
669 | static void | |||
670 | prune_ref_by_self_reuse (struct mem_ref *ref) | |||
671 | { | |||
672 | HOST_WIDE_INT step;
673 | bool backward; | |||
674 | ||||
675 | /* If the step size is non constant, we cannot calculate prefetch_mod. */ | |||
676 | if (!cst_and_fits_in_hwi (ref->group->step)) | |||
677 | return; | |||
678 | ||||
679 | step = int_cst_value (ref->group->step); | |||
680 | ||||
681 | backward = step < 0; | |||
682 | ||||
683 | if (step == 0) | |||
684 | { | |||
685 | /* Prefetch references to invariant address just once. */ | |||
686 | ref->prefetch_before = 1; | |||
687 | return; | |||
688 | } | |||
689 | ||||
690 | if (backward) | |||
691 | step = -step; | |||
692 | ||||
693 | if (step > PREFETCH_BLOCK)
694 | return;
695 |
696 | if ((backward && HAVE_BACKWARD_PREFETCH)
697 | || (!backward && HAVE_FORWARD_PREFETCH))
698 | {
699 | ref->prefetch_before = 1;
700 | return;
701 | }
702 |
703 | ref->prefetch_mod = PREFETCH_BLOCK / step;
704 | } | |||
705 | ||||
706 | /* Divides X by BY, rounding down. */ | |||
707 | ||||
708 | static HOST_WIDE_INT
709 | ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
710 | {
711 | gcc_assert (by > 0);
712 |
713 | if (x >= 0)
714 | return x / (HOST_WIDE_INT) by;
715 | else
716 | return (x + (HOST_WIDE_INT) by - 1) / (HOST_WIDE_INT) by;
717 | } | |||
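
A small usage sketch (not part of this file): ddown maps a byte offset to its
cache-line index, which is how prune_ref_by_group_reuse below compares the
lines hit by two loop-invariant references, e.g. with a 64-byte PREFETCH_BLOCK.

/* Illustrative only; mirrors ddown for the non-negative offsets used here.  */
#include <assert.h>

static long
ddown_example (long x, unsigned long by)
{
  return x >= 0 ? x / (long) by : (x + (long) by - 1) / (long) by;
}

int
main (void)
{
  assert (ddown_example (0, 64) == 0);     /* offset 0 -> cache line 0   */
  assert (ddown_example (50, 64) == 0);    /* offset 50 -> cache line 0  */
  assert (ddown_example (130, 64) == 2);   /* offset 130 -> cache line 2 */
  return 0;
}
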
718 | ||||
719 | /* Given a CACHE_LINE_SIZE and two inductive memory references | |||
720 | with a common STEP greater than CACHE_LINE_SIZE and an address | |||
721 | difference DELTA, compute the probability that they will fall | |||
722 | in different cache lines. Return true if the computed miss rate | |||
723 | is not greater than the ACCEPTABLE_MISS_RATE. DISTINCT_ITERS is the | |||
724 | number of distinct iterations after which the pattern repeats itself. | |||
725 | ALIGN_UNIT is the unit of alignment in bytes. */ | |||
726 | ||||
727 | static bool | |||
728 | is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
729 | HOST_WIDE_INT step, HOST_WIDE_INT delta,
730 | unsigned HOST_WIDE_INT distinct_iters,
731 | int align_unit) | |||
732 | { | |||
733 | unsigned align, iter; | |||
734 | int total_positions, miss_positions, max_allowed_miss_positions; | |||
735 | int address1, address2, cache_line1, cache_line2; | |||
736 | ||||
737 | /* It always misses if delta is greater than or equal to the cache | |||
738 | line size. */ | |||
739 | if (delta >= (HOST_WIDE_INT) cache_line_size)
740 | return false;
741 |
742 | miss_positions = 0;
743 | total_positions = (cache_line_size / align_unit) * distinct_iters;
744 | max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000;
745 | ||||
746 | /* Iterate through all possible alignments of the first | |||
747 | memory reference within its cache line. */ | |||
748 | for (align = 0; align < cache_line_size; align += align_unit) | |||
749 | ||||
750 | /* Iterate through all distinct iterations. */ | |||
751 | for (iter = 0; iter < distinct_iters; iter++) | |||
752 | { | |||
753 | address1 = align + step * iter; | |||
754 | address2 = address1 + delta; | |||
755 | cache_line1 = address1 / cache_line_size; | |||
756 | cache_line2 = address2 / cache_line_size; | |||
757 | if (cache_line1 != cache_line2) | |||
758 | { | |||
759 | miss_positions += 1; | |||
760 | if (miss_positions > max_allowed_miss_positions) | |||
761 | return false; | |||
762 | } | |||
763 | } | |||
764 | return true; | |||
765 | } | |||
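
The warning reported at the top points at the division by ALIGN_UNIT on line
743. ALIGN_UNIT is computed in prune_ref_by_group_reuse as TYPE_ALIGN
(ref_type) / 8 (line 877), and the expansion of TYPE_ALIGN shown there yields 0
when the type records no alignment, which is the zero divisor the analyzer is
flagging. A hypothetical caller-side guard might look like the sketch below;
this is only an illustration of the issue, not GCC's actual code or fix.

/* Illustrative only: clamp a zero alignment to byte alignment before it is
   used as a divisor.  */
static int
checked_align_unit (unsigned type_align_bits)
{
  int align_unit = type_align_bits / 8;   /* mirrors TYPE_ALIGN (...) / 8 */
  return align_unit > 0 ? align_unit : 1; /* avoid a zero divisor */
}
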
766 | ||||
767 | /* Prune the prefetch candidate REF using the reuse with BY. | |||
768 | If BY_IS_BEFORE is true, BY is before REF in the loop. */ | |||
769 | ||||
770 | static void | |||
771 | prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by, | |||
772 | bool by_is_before) | |||
773 | { | |||
774 | HOST_WIDE_INT step;
775 | bool backward;
776 | HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
777 | HOST_WIDE_INT delta = delta_b - delta_r;
778 | HOST_WIDE_INT hit_from;
779 | unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
780 | HOST_WIDE_INT reduced_step;
781 | unsigned HOST_WIDE_INT reduced_prefetch_block;
782 | tree ref_type; | |||
783 | int align_unit; | |||
784 | ||||
785 | /* If the step is non constant we cannot calculate prefetch_before. */ | |||
786 | if (!cst_and_fits_in_hwi (ref->group->step)) { | |||
787 | return; | |||
788 | } | |||
789 | ||||
790 | step = int_cst_value (ref->group->step); | |||
791 | ||||
792 | backward = step < 0; | |||
793 | ||||
794 | ||||
795 | if (delta == 0) | |||
796 | { | |||
797 | /* If the references has the same address, only prefetch the | |||
798 | former. */ | |||
799 | if (by_is_before) | |||
800 | ref->prefetch_before = 0; | |||
801 | ||||
802 | return; | |||
803 | } | |||
804 | ||||
805 | if (!step) | |||
806 | { | |||
807 | /* If the reference addresses are invariant and fall into the | |||
808 | same cache line, prefetch just the first one. */ | |||
809 | if (!by_is_before) | |||
810 | return; | |||
811 | ||||
812 | if (ddown (ref->delta, PREFETCH_BLOCK)
813 | != ddown (by->delta, PREFETCH_BLOCK))
814 | return; | |||
815 | ||||
816 | ref->prefetch_before = 0; | |||
817 | return; | |||
818 | } | |||
819 | ||||
820 | /* Only prune the reference that is behind in the array. */ | |||
821 | if (backward)
822 | { | |||
823 | if (delta > 0) | |||
824 | return; | |||
825 | ||||
826 | /* Transform the data so that we may assume that the accesses | |||
827 | are forward. */ | |||
828 | delta = - delta; | |||
829 | step = -step; | |||
830 | delta_r = PREFETCH_BLOCK - 1 - delta_r;
831 | delta_b = PREFETCH_BLOCK - 1 - delta_b;
832 | } | |||
833 | else | |||
834 | { | |||
835 | if (delta < 0) | |||
836 | return; | |||
837 | } | |||
838 | ||||
839 | /* Check whether the two references are likely to hit the same cache | |||
840 | line, and how distant the iterations in that it occurs are from | |||
841 | each other. */ | |||
842 | ||||
843 | if (step <= PREFETCH_BLOCK)
844 | {
845 | /* The accesses are sure to meet. Let us check when. */
846 | hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
847 | prefetch_before = (hit_from - delta_r + step - 1) / step;
848 |
849 | /* Do not reduce prefetch_before if we meet beyond cache size. */
850 | if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step))
851 | prefetch_before = PREFETCH_ALL;
852 | if (prefetch_before < ref->prefetch_before) | |||
853 | ref->prefetch_before = prefetch_before; | |||
854 | ||||
855 | return; | |||
856 | } | |||
857 | ||||
858 | /* A more complicated case with step > prefetch_block. First reduce | |||
859 | the ratio between the step and the cache line size to its simplest | |||
860 | terms. The resulting denominator will then represent the number of | |||
861 | distinct iterations after which each address will go back to its | |||
862 | initial location within the cache line. This computation assumes | |||
863 | that PREFETCH_BLOCK is a power of two. */ | |||
864 | prefetch_block = PREFETCH_BLOCK;
865 | reduced_prefetch_block = prefetch_block; | |||
866 | reduced_step = step; | |||
867 | while ((reduced_step & 1) == 0 | |||
868 | && reduced_prefetch_block > 1) | |||
869 | { | |||
870 | reduced_step >>= 1; | |||
871 | reduced_prefetch_block >>= 1; | |||
872 | } | |||
873 | ||||
874 | prefetch_before = delta / step; | |||
875 | delta %= step; | |||
876 | ref_type = TREE_TYPE (ref->mem);
877 | align_unit = TYPE_ALIGN (ref_type) / 8;
878 | if (is_miss_rate_acceptable (prefetch_block, step, delta, | |||
879 | reduced_prefetch_block, align_unit)) | |||
880 | { | |||
881 | /* Do not reduce prefetch_before if we meet beyond cache size. */ | |||
882 | if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
883 | prefetch_before = PREFETCH_ALL;
884 | if (prefetch_before < ref->prefetch_before) | |||
885 | ref->prefetch_before = prefetch_before; | |||
886 | ||||
887 | return; | |||
888 | } | |||
889 | ||||
890 | /* Try also the following iteration. */ | |||
891 | prefetch_before++; | |||
892 | delta = step - delta; | |||
893 | if (is_miss_rate_acceptable (prefetch_block, step, delta, | |||
894 | reduced_prefetch_block, align_unit)) | |||
895 | { | |||
896 | if (prefetch_before < ref->prefetch_before) | |||
897 | ref->prefetch_before = prefetch_before; | |||
898 | ||||
899 | return; | |||
900 | } | |||
901 | ||||
902 | /* The ref probably does not reuse by. */ | |||
903 | return; | |||
904 | } | |||
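
A worked example of the step / PREFETCH_BLOCK reduction above (the values are
hypothetical): with step = 96 and a 64-byte block, the loop halves both until
the step becomes odd, leaving reduced_prefetch_block = 2, i.e. the offset
within a cache line repeats every 2 iterations (0, 32, 0, 32, ...).

/* Illustrative only: the reduction loop from prune_ref_by_group_reuse.  */
#include <stdio.h>

int
main (void)
{
  unsigned long reduced_prefetch_block = 64;   /* PREFETCH_BLOCK */
  long reduced_step = 96;                      /* |step| */
  while ((reduced_step & 1) == 0 && reduced_prefetch_block > 1)
    {
      reduced_step >>= 1;
      reduced_prefetch_block >>= 1;
    }
  printf ("distinct iterations: %lu\n", reduced_prefetch_block);  /* prints 2 */
  return 0;
}
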
905 | ||||
906 | /* Prune the prefetch candidate REF using the reuses with other references | |||
907 | in REFS. */ | |||
908 | ||||
909 | static void | |||
910 | prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs) | |||
911 | { | |||
912 | struct mem_ref *prune_by; | |||
913 | bool before = true; | |||
914 | ||||
915 | prune_ref_by_self_reuse (ref); | |||
916 | ||||
917 | for (prune_by = refs; prune_by; prune_by = prune_by->next) | |||
| ||||
918 | { | |||
919 | if (prune_by == ref) | |||
920 | { | |||
921 | before = false; | |||
922 | continue; | |||
923 | } | |||
924 | ||||
925 | if (!WRITE_CAN_USE_READ_PREFETCH
926 | && ref->write_p
927 | && !prune_by->write_p)
928 | continue;
929 | if (!READ_CAN_USE_WRITE_PREFETCH
930 | && !ref->write_p | |||
931 | && prune_by->write_p) | |||
932 | continue; | |||
933 | ||||
934 | prune_ref_by_group_reuse (ref, prune_by, before); | |||
935 | } | |||
936 | } | |||
937 | ||||
938 | /* Prune the prefetch candidates in GROUP using the reuse analysis. */ | |||
939 | ||||
940 | static void | |||
941 | prune_group_by_reuse (struct mem_ref_group *group) | |||
942 | { | |||
943 | struct mem_ref *ref_pruned; | |||
944 | ||||
945 | for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next) | |||
946 | { | |||
947 | prune_ref_by_reuse (ref_pruned, group->refs); | |||
948 | ||||
949 | if (dump_file && (dump_flags & TDF_DETAILS)) | |||
950 | { | |||
951 | dump_mem_ref (dump_file, ref_pruned); | |||
952 | ||||
953 | if (ref_pruned->prefetch_before == PREFETCH_ALL
954 | && ref_pruned->prefetch_mod == 1) | |||
955 | fprintf (dump_file, " no restrictions"); | |||
956 | else if (ref_pruned->prefetch_before == 0) | |||
957 | fprintf (dump_file, " do not prefetch"); | |||
958 | else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod) | |||
959 | fprintf (dump_file, " prefetch once"); | |||
960 | else | |||
961 | { | |||
962 | if (ref_pruned->prefetch_before != PREFETCH_ALL)
963 | {
964 | fprintf (dump_file, " prefetch before ");
965 | fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
966 | ref_pruned->prefetch_before);
967 | }
968 | if (ref_pruned->prefetch_mod != 1)
969 | {
970 | fprintf (dump_file, " prefetch mod ");
971 | fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
972 | ref_pruned->prefetch_mod);
973 | } | |||
974 | } | |||
975 | fprintf (dump_file, "\n"); | |||
976 | } | |||
977 | } | |||
978 | } | |||
979 | ||||
980 | /* Prune the list of prefetch candidates GROUPS using the reuse analysis. */ | |||
981 | ||||
982 | static void | |||
983 | prune_by_reuse (struct mem_ref_group *groups) | |||
984 | { | |||
985 | for (; groups; groups = groups->next) | |||
986 | prune_group_by_reuse (groups); | |||
987 | } | |||
988 | ||||
989 | /* Returns true if we should issue prefetch for REF. */ | |||
990 | ||||
991 | static bool | |||
992 | should_issue_prefetch_p (struct mem_ref *ref) | |||
993 | { | |||
994 | /* Do we want to issue prefetches for non-constant strides? */ | |||
995 | if (!cst_and_fits_in_hwi (ref->group->step) | |||
996 | && param_prefetch_dynamic_strides == 0)
997 | { | |||
998 | if (dump_file && (dump_flags & TDF_DETAILS)) | |||
999 | fprintf (dump_file, | |||
1000 | "Skipping non-constant step for reference %u:%u\n", | |||
1001 | ref->group->uid, ref->uid); | |||
1002 | return false; | |||
1003 | } | |||
1004 | ||||
1005 | /* Some processors may have a hardware prefetcher that may conflict with | |||
1006 | prefetch hints for a range of strides. Make sure we don't issue | |||
1007 | prefetches for such cases if the stride is within this particular | |||
1008 | range. */ | |||
1009 | if (cst_and_fits_in_hwi (ref->group->step) | |||
1010 | && abs_hwi (int_cst_value (ref->group->step)) | |||
1011 | < (HOST_WIDE_INT) param_prefetch_minimum_stride)
1012 | {
1013 | if (dump_file && (dump_flags & TDF_DETAILS))
1014 | fprintf (dump_file,
1015 | "Step for reference %u:%u (" HOST_WIDE_INT_PRINT_DEC
1016 | ") is less than the minimum required stride of %d\n",
1017 | ref->group->uid, ref->uid, int_cst_value (ref->group->step),
1018 | param_prefetch_minimum_stride);
1019 | return false; | |||
1020 | } | |||
1021 | ||||
1022 | /* For now do not issue prefetches for only first few of the | |||
1023 | iterations. */ | |||
1024 | if (ref->prefetch_before != PREFETCH_ALL)
1025 | { | |||
1026 | if (dump_file && (dump_flags & TDF_DETAILS)) | |||
1027 | fprintf (dump_file, "Ignoring reference %u:%u due to prefetch_before\n", | |||
1028 | ref->group->uid, ref->uid); | |||
1029 | return false; | |||
1030 | } | |||
1031 | ||||
1032 | /* Do not prefetch nontemporal stores. */ | |||
1033 | if (ref->storent_p) | |||
1034 | { | |||
1035 | if (dump_file && (dump_flags & TDF_DETAILS)) | |||
1036 | fprintf (dump_file, "Ignoring nontemporal store reference %u:%u\n", ref->group->uid, ref->uid); | |||
1037 | return false; | |||
1038 | } | |||
1039 | ||||
1040 | return true; | |||
1041 | } | |||
1042 | ||||
1043 | /* Decide which of the prefetch candidates in GROUPS to prefetch. | |||
1044 | AHEAD is the number of iterations to prefetch ahead (which corresponds | |||
1045 | to the number of simultaneous instances of one prefetch running at a | |||
1046 | time). UNROLL_FACTOR is the factor by that the loop is going to be | |||
1047 | unrolled. Returns true if there is anything to prefetch. */ | |||
1048 | ||||
1049 | static bool | |||
1050 | schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor, | |||
1051 | unsigned ahead) | |||
1052 | { | |||
1053 | unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots; | |||
1054 | unsigned slots_per_prefetch; | |||
1055 | struct mem_ref *ref; | |||
1056 | bool any = false; | |||
1057 | ||||
1058 | /* At most param_simultaneous_prefetches should be running | |||
1059 | at the same time. */ | |||
1060 | remaining_prefetch_slots = param_simultaneous_prefetches;
1061 | ||||
1062 | /* The prefetch will run for AHEAD iterations of the original loop, i.e., | |||
1063 | AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration, | |||
1064 | it will need a prefetch slot. */ | |||
1065 | slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor; | |||
1066 | if (dump_file && (dump_flags & TDF_DETAILS)) | |||
1067 | fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n", | |||
1068 | slots_per_prefetch); | |||
1069 | ||||
1070 | /* For now we just take memory references one by one and issue | |||
1071 | prefetches for as many as possible. The groups are sorted | |||
1072 | starting with the largest step, since the references with | |||
1073 | large step are more likely to cause many cache misses. */ | |||
1074 | ||||
1075 | for (; groups; groups = groups->next) | |||
1076 | for (ref = groups->refs; ref; ref = ref->next) | |||
1077 | { | |||
1078 | if (!should_issue_prefetch_p (ref)) | |||
1079 | continue; | |||
1080 | ||||
1081 | /* The loop is far from being sufficiently unrolled for this | |||
1082 | prefetch. Do not generate prefetch to avoid many redundant
1083 | prefetches. */
1084 | if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
1085 | continue; | |||
1086 | ||||
1087 | /* If we need to prefetch the reference each PREFETCH_MOD iterations, | |||
1088 | and we unroll the loop UNROLL_FACTOR times, we need to insert | |||
1089 | ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each | |||
1090 | iteration. */ | |||
1091 | n_prefetches = ((unroll_factor + ref->prefetch_mod - 1) | |||
1092 | / ref->prefetch_mod); | |||
1093 | prefetch_slots = n_prefetches * slots_per_prefetch; | |||
1094 | ||||
1095 | /* If more than half of the prefetches would be lost anyway, do not | |||
1096 | issue the prefetch. */ | |||
1097 | if (2 * remaining_prefetch_slots < prefetch_slots) | |||
1098 | continue; | |||
1099 | ||||
1100 | /* Stop prefetching if debug counter is activated. */ | |||
1101 | if (!dbg_cnt (prefetch)) | |||
1102 | continue; | |||
1103 | ||||
1104 | ref->issue_prefetch_p = true; | |||
1105 | if (dump_file && (dump_flags & TDF_DETAILS)) | |||
1106 | fprintf (dump_file, "Decided to issue prefetch for reference %u:%u\n", | |||
1107 | ref->group->uid, ref->uid); | |||
1108 | ||||
1109 | if (remaining_prefetch_slots <= prefetch_slots) | |||
1110 | return true; | |||
1111 | remaining_prefetch_slots -= prefetch_slots; | |||
1112 | any = true; | |||
1113 | } | |||
1114 | ||||
1115 | return any; | |||
1116 | } | |||
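
The slot accounting above in concrete (hypothetical) numbers: with AHEAD = 8,
UNROLL_FACTOR = 4 and PREFETCH_MOD = 2, each prefetch occupies 2 slots and the
reference needs 2 prefetch instructions per unrolled iteration, so it consumes
4 of the param_simultaneous_prefetches slots.

/* Illustrative only: the arithmetic used by schedule_prefetches.  */
#include <stdio.h>

int
main (void)
{
  unsigned ahead = 8, unroll_factor = 4, prefetch_mod = 2;
  unsigned slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
  unsigned n_prefetches = (unroll_factor + prefetch_mod - 1) / prefetch_mod;
  printf ("%u slots/prefetch, %u insns, %u slots total\n",
          slots_per_prefetch, n_prefetches,
          n_prefetches * slots_per_prefetch);   /* 2, 2, 4 */
  return 0;
}
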
1117 | ||||
1118 | /* Return TRUE if no prefetch is going to be generated in the given | |||
1119 | GROUPS. */ | |||
1120 | ||||
1121 | static bool | |||
1122 | nothing_to_prefetch_p (struct mem_ref_group *groups) | |||
1123 | { | |||
1124 | struct mem_ref *ref; | |||
1125 | ||||
1126 | for (; groups; groups = groups->next) | |||
1127 | for (ref = groups->refs; ref; ref = ref->next) | |||
1128 | if (should_issue_prefetch_p (ref)) | |||
1129 | return false; | |||
1130 | ||||
1131 | return true; | |||
1132 | } | |||
1133 | ||||
1134 | /* Estimate the number of prefetches in the given GROUPS. | |||
1135 | UNROLL_FACTOR is the factor by which LOOP was unrolled. */ | |||
1136 | ||||
1137 | static int | |||
1138 | estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor) | |||
1139 | { | |||
1140 | struct mem_ref *ref; | |||
1141 | unsigned n_prefetches; | |||
1142 | int prefetch_count = 0; | |||
1143 | ||||
1144 | for (; groups; groups = groups->next) | |||
1145 | for (ref = groups->refs; ref; ref = ref->next) | |||
1146 | if (should_issue_prefetch_p (ref)) | |||
1147 | { | |||
1148 | n_prefetches = ((unroll_factor + ref->prefetch_mod - 1) | |||
1149 | / ref->prefetch_mod); | |||
1150 | prefetch_count += n_prefetches; | |||
1151 | } | |||
1152 | ||||
1153 | return prefetch_count; | |||
1154 | } | |||
1155 | ||||
1156 | /* Issue prefetches for the reference REF into loop as decided before. | |||
1157 | HEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR | |||
1158 | is the factor by which LOOP was unrolled. */ | |||
1159 | ||||
1160 | static void | |||
1161 | issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead) | |||
1162 | { | |||
1163 | HOST_WIDE_INT delta;
1164 | tree addr, addr_base, write_p, local, forward;
1165 | gcall *prefetch;
1166 | gimple_stmt_iterator bsi;
1167 | unsigned n_prefetches, ap;
1168 | bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
1169 | ||||
1170 | if (dump_file && (dump_flags & TDF_DETAILS)) | |||
1171 | fprintf (dump_file, "Issued%s prefetch for reference %u:%u.\n", | |||
1172 | nontemporal ? " nontemporal" : "", | |||
1173 | ref->group->uid, ref->uid); | |||
1174 | ||||
1175 | bsi = gsi_for_stmt (ref->stmt); | |||
1176 | ||||
1177 | n_prefetches = ((unroll_factor + ref->prefetch_mod - 1) | |||
1178 | / ref->prefetch_mod); | |||
1179 | addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
1180 | addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
1181 | true, NULL, true, GSI_SAME_STMT);
1182 | write_p = ref->write_p ? integer_one_node : integer_zero_node;
1183 | local = nontemporal ? integer_zero_node : integer_three_node;
1184 | ||||
1185 | for (ap = 0; ap < n_prefetches; ap++) | |||
1186 | { | |||
1187 | if (cst_and_fits_in_hwi (ref->group->step)) | |||
1188 | { | |||
1189 | /* Determine the address to prefetch. */ | |||
1190 | delta = (ahead + ap * ref->prefetch_mod) * | |||
1191 | int_cst_value (ref->group->step); | |||
1192 | addr = fold_build_pointer_plus_hwi (addr_base, delta);
1193 | addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1194 | NULL, true, GSI_SAME_STMT);
1195 | } | |||
1196 | else | |||
1197 | { | |||
1198 | /* The step size is non-constant but loop-invariant. We use the | |||
1199 | heuristic to simply prefetch ahead iterations ahead. */ | |||
1200 | forward = fold_build2 (MULT_EXPR, sizetype,
1201 | fold_convert (sizetype, ref->group->step),
1202 | fold_convert (sizetype, size_int (ahead)));
1203 | addr = fold_build_pointer_plus (addr_base, forward);
1204 | addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1205 | NULL, true, GSI_SAME_STMT);
1206 | } | |||
1207 | ||||
1208 | if (addr_base != addr | |||
1209 | && TREE_CODE (addr_base) == SSA_NAME
1210 | && TREE_CODE (addr) == SSA_NAME)
1211 | {
1212 | duplicate_ssa_name_ptr_info (addr, SSA_NAME_PTR_INFO (addr_base));
1213 | /* As this isn't a plain copy we have to reset alignment
1214 | information. */
1215 | if (SSA_NAME_PTR_INFO (addr))
1216 | mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr));
1217 | } | |||
1218 | ||||
1219 | /* Create the prefetch instruction. */ | |||
1220 | prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), | |||
1221 | 3, addr, write_p, local); | |||
1222 | gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT); | |||
1223 | } | |||
1224 | } | |||
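| /* Worked example (illustrative values only, not derived from any | |||
| particular target): with UNROLL_FACTOR 4, prefetch_mod 2, AHEAD 8 and a | |||
| constant step of 16 bytes, n_prefetches = (4 + 2 - 1) / 2 = 2, and the | |||
| loop above emits __builtin_prefetch calls for delta = (8 + 0 * 2) * 16 | |||
| = 128 and delta = (8 + 1 * 2) * 16 = 160, with write_p 0 or 1 and | |||
| locality 3 (or 0 for a nontemporal reference). */ | |||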
1225 | ||||
1226 | /* Issue prefetches for the references in GROUPS into loop as decided before. | |||
1227 | AHEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR is the | |||
1228 | factor by which LOOP was unrolled. */ | |||
1229 | ||||
1230 | static void | |||
1231 | issue_prefetches (struct mem_ref_group *groups, | |||
1232 | unsigned unroll_factor, unsigned ahead) | |||
1233 | { | |||
1234 | struct mem_ref *ref; | |||
1235 | ||||
1236 | for (; groups; groups = groups->next) | |||
1237 | for (ref = groups->refs; ref; ref = ref->next) | |||
1238 | if (ref->issue_prefetch_p) | |||
1239 | issue_prefetch_ref (ref, unroll_factor, ahead); | |||
1240 | } | |||
1241 | ||||
1242 | /* Returns true if REF is a memory write for which a nontemporal store insn | |||
1243 | can be used. */ | |||
1244 | ||||
1245 | static bool | |||
1246 | nontemporal_store_p (struct mem_ref *ref) | |||
1247 | { | |||
1248 | machine_mode mode; | |||
1249 | enum insn_code code; | |||
1250 | ||||
1251 | /* REF must be a write that is not reused. We require it to be independent | |||
1252 | of all other memory references in the loop, as the nontemporal stores may | |||
1253 | be reordered with respect to other memory references. */ | |||
1254 | if (!ref->write_p | |||
1255 | || !ref->independent_p | |||
1256 | || ref->reuse_distance < L2_CACHE_SIZE_BYTES) | |||
1257 | return false; | |||
1258 | ||||
1259 | /* Check that we have the storent instruction for the mode. */ | |||
1260 | mode = TYPE_MODE (TREE_TYPE (ref->mem)); | |||
1261 | if (mode == BLKmode) | |||
1262 | return false; | |||
1263 | ||||
1264 | code = optab_handler (storent_optab, mode); | |||
1265 | return code != CODE_FOR_nothing; | |||
1266 | } | |||
1267 | ||||
1268 | /* If REF is a nontemporal store, we mark the corresponding modify statement | |||
1269 | and return true. Otherwise, we return false. */ | |||
1270 | ||||
1271 | static bool | |||
1272 | mark_nontemporal_store (struct mem_ref *ref) | |||
1273 | { | |||
1274 | if (!nontemporal_store_p (ref)) | |||
1275 | return false; | |||
1276 | ||||
1277 | if (dump_file && (dump_flags & TDF_DETAILS)) | |||
1278 | fprintf (dump_file, "Marked reference %u:%u as a nontemporal store.\n", | |||
1279 | ref->group->uid, ref->uid); | |||
1280 | ||||
1281 | gimple_assign_set_nontemporal_move (ref->stmt, true); | |||
1282 | ref->storent_p = true; | |||
1283 | ||||
1284 | return true; | |||
1285 | } | |||
1286 | ||||
1287 | /* Issue a memory fence instruction after LOOP. */ | |||
1288 | ||||
1289 | static void | |||
1290 | emit_mfence_after_loop (class loop *loop) | |||
1291 | { | |||
1292 | auto_vec<edge> exits = get_loop_exit_edges (loop); | |||
1293 | edge exit; | |||
1294 | gcall *call; | |||
1295 | gimple_stmt_iterator bsi; | |||
1296 | unsigned i; | |||
1297 | ||||
1298 | FOR_EACH_VEC_ELT (exits, i, exit) | |||
1299 | { | |||
1300 | call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0); | |||
1301 | ||||
1302 | if (!single_pred_p (exit->dest) | |||
1303 | /* If possible, we prefer not to insert the fence on other paths | |||
1304 | in cfg. */ | |||
1305 | && !(exit->flags & EDGE_ABNORMAL)) | |||
1306 | split_loop_exit_edge (exit); | |||
1307 | bsi = gsi_after_labels (exit->dest); | |||
1308 | ||||
1309 | gsi_insert_before (&bsi, call, GSI_NEW_STMT); | |||
1310 | } | |||
1311 | } | |||
1312 | ||||
1313 | /* Returns true if we can use storent in loop, false otherwise. */ | |||
1314 | ||||
1315 | static bool | |||
1316 | may_use_storent_in_loop_p (class loop *loop) | |||
1317 | { | |||
1318 | bool ret = true; | |||
1319 | ||||
1320 | if (loop->inner != NULL) | |||
1321 | return false; | |||
1322 | ||||
1323 | /* If we must issue a mfence insn after using storent, check that there | |||
1324 | is a suitable place for it at each of the loop exits. */ | |||
1325 | if (FENCE_FOLLOWING_MOVNT != NULL_TREE) | |||
1326 | { | |||
1327 | auto_vec<edge> exits = get_loop_exit_edges (loop); | |||
1328 | unsigned i; | |||
1329 | edge exit; | |||
1330 | ||||
1331 | FOR_EACH_VEC_ELT (exits, i, exit) | |||
1332 | if ((exit->flags & EDGE_ABNORMAL) | |||
1333 | && exit->dest == EXIT_BLOCK_PTR_FOR_FN (cfun)) | |||
1334 | ret = false; | |||
1335 | } | |||
1336 | ||||
1337 | return ret; | |||
1338 | } | |||
1339 | ||||
1340 | /* Marks nontemporal stores in LOOP. GROUPS contains the description of memory | |||
1341 | references in the loop. Returns whether we inserted any mfence call. */ | |||
1342 | ||||
1343 | static bool | |||
1344 | mark_nontemporal_stores (class loop *loop, struct mem_ref_group *groups) | |||
1345 | { | |||
1346 | struct mem_ref *ref; | |||
1347 | bool any = false; | |||
1348 | ||||
1349 | if (!may_use_storent_in_loop_p (loop)) | |||
1350 | return false; | |||
1351 | ||||
1352 | for (; groups; groups = groups->next) | |||
1353 | for (ref = groups->refs; ref; ref = ref->next) | |||
1354 | any |= mark_nontemporal_store (ref); | |||
1355 | ||||
1356 | if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE) | |||
1357 | { | |||
1358 | emit_mfence_after_loop (loop); | |||
1359 | return true; | |||
1360 | } | |||
1361 | return false; | |||
1362 | } | |||
1363 | ||||
1364 | /* Determines whether we can profitably unroll LOOP FACTOR times, and if | |||
1365 | this is the case, fill in DESC with the description of the number of | |||
1366 | iterations. */ | |||
1367 | ||||
1368 | static bool | |||
1369 | should_unroll_loop_p (class loop *loop, class tree_niter_desc *desc, | |||
1370 | unsigned factor) | |||
1371 | { | |||
1372 | if (!can_unroll_loop_p (loop, factor, desc)) | |||
1373 | return false; | |||
1374 | ||||
1375 | /* We only consider loops without control flow for unrolling. This is not | |||
1376 | a hard restriction -- tree_unroll_loop works with arbitrary loops | |||
1377 | as well; but the unrolling/prefetching is usually more profitable for | |||
1378 | loops consisting of a single basic block, and we want to limit the | |||
1379 | code growth. */ | |||
1380 | if (loop->num_nodes > 2) | |||
1381 | return false; | |||
1382 | ||||
1383 | return true; | |||
1384 | } | |||
1385 | ||||
1386 | /* Determine the factor by which to unroll LOOP, from the information | |||
1387 | contained in the list of memory references REFS. Description of | |||
1388 | number of iterations of LOOP is stored to DESC. NINSNS is the number of | |||
1389 | insns of the LOOP. EST_NITER is the estimated number of iterations of | |||
1390 | the loop, or -1 if no estimate is available. */ | |||
1391 | ||||
1392 | static unsigned | |||
1393 | determine_unroll_factor (class loop *loop, struct mem_ref_group *refs, | |||
1394 | unsigned ninsns, class tree_niter_desc *desc, | |||
1395 | HOST_WIDE_INT est_niter) | |||
1396 | { | |||
1397 | unsigned upper_bound; | |||
1398 | unsigned nfactor, factor, mod_constraint; | |||
1399 | struct mem_ref_group *agp; | |||
1400 | struct mem_ref *ref; | |||
1401 | ||||
1402 | /* First check whether the loop is not too large to unroll. We ignore | |||
1403 | PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us | |||
1404 | from unrolling them enough to make exactly one cache line covered by each | |||
1405 | iteration. Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent | |||
1406 | us from unrolling the loops too many times in cases where we only expect | |||
1407 | gains from better scheduling and decreasing loop overhead, which is not | |||
1408 | the case here. */ | |||
1409 | upper_bound = param_max_unrolled_insns / ninsns; | |||
1410 | ||||
1411 | /* If we unrolled the loop more times than it iterates, the unrolled version | |||
1412 | of the loop would never be entered. */ | |||
1413 | if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound) | |||
1414 | upper_bound = est_niter; | |||
1415 | ||||
1416 | if (upper_bound <= 1) | |||
1417 | return 1; | |||
1418 | ||||
1419 | /* Choose the factor so that we may prefetch each cache line just once, | |||
1420 | but bound the unrolling by UPPER_BOUND. */ | |||
1421 | factor = 1; | |||
1422 | for (agp = refs; agp; agp = agp->next) | |||
1423 | for (ref = agp->refs; ref; ref = ref->next) | |||
1424 | if (should_issue_prefetch_p (ref)) | |||
1425 | { | |||
1426 | mod_constraint = ref->prefetch_mod; | |||
1427 | nfactor = least_common_multiple (mod_constraint, factor); | |||
1428 | if (nfactor <= upper_bound) | |||
1429 | factor = nfactor; | |||
1430 | } | |||
1431 | ||||
1432 | if (!should_unroll_loop_p (loop, desc, factor)) | |||
1433 | return 1; | |||
1434 | ||||
1435 | return factor; | |||
1436 | } | |||
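| /* Worked example (illustrative values only): with ninsns = 10 and | |||
| param_max_unrolled_insns = 200, upper_bound = 20. Two prefetched | |||
| references with prefetch_mod 4 and 6 give factor = lcm (1, 4) = 4 and | |||
| then lcm (4, 6) = 12 <= 20, so 12 is returned unless | |||
| should_unroll_loop_p rejects the loop. */ | |||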
1437 | ||||
1438 | /* Returns the total volume of the memory references REFS, taking into account | |||
1439 | reuses in the innermost loop and cache line size. TODO -- we should also | |||
1440 | take into account reuses across the iterations of the loops in the loop | |||
1441 | nest. */ | |||
1442 | ||||
1443 | static unsigned | |||
1444 | volume_of_references (struct mem_ref_group *refs) | |||
1445 | { | |||
1446 | unsigned volume = 0; | |||
1447 | struct mem_ref_group *gr; | |||
1448 | struct mem_ref *ref; | |||
1449 | ||||
1450 | for (gr = refs; gr; gr = gr->next) | |||
1451 | for (ref = gr->refs; ref; ref = ref->next) | |||
1452 | { | |||
1453 | /* Almost always reuses another value? */ | |||
1454 | if (ref->prefetch_before != PREFETCH_ALL) | |||
1455 | continue; | |||
1456 | ||||
1457 | /* If several iterations access the same cache line, use the size of | |||
1458 | the line divided by this number. Otherwise, a cache line is | |||
1459 | accessed in each iteration. TODO -- in the latter case, we should | |||
1460 | take the size of the reference into account, rounding it up to a | |||
1461 | multiple of the cache line size. */ | |||
1462 | volume += param_l1_cache_line_size / ref->prefetch_mod; | |||
1463 | } | |||
1464 | return volume; | |||
1465 | } | |||
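| /* Worked example (illustrative values only): with a 64-byte cache line, | |||
| a reference with prefetch_mod 1 contributes 64 / 1 = 64 bytes per | |||
| iteration and one with prefetch_mod 64 contributes 64 / 64 = 1 byte, so | |||
| volume = 65; references with a finite prefetch_before are skipped. */ | |||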
1466 | ||||
1467 | /* Returns the volume of memory references accessed across VEC iterations of | |||
1468 | loops, whose sizes are described in the LOOP_SIZES array. N is the number | |||
1469 | of the loops in the nest (length of VEC and LOOP_SIZES vectors). */ | |||
1470 | ||||
1471 | static unsigned | |||
1472 | volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n) | |||
1473 | { | |||
1474 | unsigned i; | |||
1475 | ||||
1476 | for (i = 0; i < n; i++) | |||
1477 | if (vec[i] != 0) | |||
1478 | break; | |||
1479 | ||||
1480 | if (i == n) | |||
1481 | return 0; | |||
1482 | ||||
1483 | gcc_assert (vec[i] > 0); | |||
1484 | ||||
1485 | /* We ignore the parts of the distance vector in subloops, since usually | |||
1486 | the numbers of iterations are much smaller. */ | |||
1487 | return loop_sizes[i] * vec[i]; | |||
1488 | } | |||
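| /* Worked example (illustrative values only): for the distance vector | |||
| (0, 2, 3) with loop_sizes = {1000, 100, 10}, the first nonzero entry is | |||
| at i = 1, so the function returns 100 * 2 = 200. */ | |||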
1489 | ||||
1490 | /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDES | |||
1491 | at the position corresponding to the loop of the step. N is the depth | |||
1492 | of the considered loop nest, and LOOP is its innermost loop. */ | |||
1493 | ||||
1494 | static void | |||
1495 | add_subscript_strides (tree access_fn, unsigned stride, | |||
1496 | HOST_WIDE_INT *strides, unsigned n, class loop *loop) | |||
1497 | { | |||
1498 | class loop *aloop; | |||
1499 | tree step; | |||
1500 | HOST_WIDE_INT astep; | |||
1501 | unsigned min_depth = loop_depth (loop) - n; | |||
1502 | ||||
1503 | while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC) | |||
1504 | { | |||
1505 | aloop = get_chrec_loop (access_fn); | |||
1506 | step = CHREC_RIGHT (access_fn); | |||
1507 | access_fn = CHREC_LEFT (access_fn); | |||
1508 | ||||
1509 | if ((unsigned) loop_depth (aloop) <= min_depth) | |||
1510 | continue; | |||
1511 | ||||
1512 | if (tree_fits_shwi_p (step)) | |||
1513 | astep = tree_to_shwi (step); | |||
1514 | else | |||
1515 | astep = param_l1_cache_line_size; | |||
1516 | ||||
1517 | strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride; | |||
1518 | ||||
1519 | } | |||
1520 | } | |||
1521 | ||||
1522 | /* Returns the volume of memory references accessed between two consecutive | |||
1523 | self-reuses of the reference DR. We consider the subscripts of DR in N | |||
1524 | loops, and LOOP_SIZES contains the volumes of accesses in each of the | |||
1525 | loops. LOOP is the innermost loop of the current loop nest. */ | |||
1526 | ||||
1527 | static unsigned | |||
1528 | self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n, | |||
1529 | class loop *loop) | |||
1530 | { | |||
1531 | tree stride, access_fn; | |||
1532 | HOST_WIDE_INT *strides, astride; | |||
1533 | vec<tree> access_fns; | |||
1534 | tree ref = DR_REF (dr); | |||
1535 | unsigned i, ret = ~0u; | |||
1536 | ||||
1537 | /* In the following example: | |||
1538 | ||||
1539 | for (i = 0; i < N; i++) | |||
1540 | for (j = 0; j < N; j++) | |||
1541 | use (a[j][i]); | |||
1542 | the same cache line is accessed each N steps (except if the change from | |||
1543 | i to i + 1 crosses the boundary of the cache line). Thus, for self-reuse, | |||
1544 | we cannot rely purely on the results of the data dependence analysis. | |||
1545 | ||||
1546 | Instead, we compute the stride of the reference in each loop, and consider | |||
1547 | the innermost loop in which the stride is less than the cache size. */ | |||
1548 | ||||
1549 | strides = XCNEWVEC (HOST_WIDE_INT, n); | |||
1550 | access_fns = DR_ACCESS_FNS (dr); | |||
1551 | ||||
1552 | FOR_EACH_VEC_ELT (access_fns, i, access_fn) | |||
1553 | { | |||
1554 | /* Keep track of the reference corresponding to the subscript, so that we | |||
1555 | know its stride. */ | |||
1556 | while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF) | |||
1557 | ref = TREE_OPERAND (ref, 0); | |||
1558 | ||||
1559 | if (TREE_CODE (ref) == ARRAY_REF) | |||
1560 | { | |||
1561 | stride = TYPE_SIZE_UNIT (TREE_TYPE (ref)); | |||
1562 | if (tree_fits_uhwi_p (stride)) | |||
1563 | astride = tree_to_uhwi (stride); | |||
1564 | else | |||
1565 | astride = param_l1_cache_line_size; | |||
1566 | ||||
1567 | ref = TREE_OPERAND (ref, 0); | |||
1568 | } | |||
1569 | else | |||
1570 | astride = 1; | |||
1571 | ||||
1572 | add_subscript_strides (access_fn, astride, strides, n, loop); | |||
1573 | } | |||
1574 | ||||
1575 | for (i = n; i-- > 0; ) | |||
1576 | { | |||
1577 | unsigned HOST_WIDE_INT s; | |||
1578 | ||||
1579 | s = strides[i] < 0 ? -strides[i] : strides[i]; | |||
1580 | ||||
1581 | if (s < (unsigned) param_l1_cache_line_size | |||
1582 | && (loop_sizes[i] | |||
1583 | > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION))) | |||
1584 | { | |||
1585 | ret = loop_sizes[i]; | |||
1586 | break; | |||
1587 | } | |||
1588 | } | |||
1589 | ||||
1590 | free (strides); | |||
1591 | return ret; | |||
1592 | } | |||
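| /* Continuing the a[j][i] example above (illustrative, assuming 4-byte | |||
| elements): the row subscript gives the inner (j) loop a stride of 4 * N | |||
| and the column subscript gives the outer (i) loop a stride of 4. For | |||
| large N only the outer stride stays below the cache line size, so the | |||
| reuse distance becomes the data volume of one outer-loop iteration, | |||
| provided it exceeds the L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION | |||
| threshold. */ | |||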
1593 | ||||
1594 | /* Determines the distance till the first reuse of each reference in REFS | |||
1595 | in the loop nest of LOOP. NO_OTHER_REFS is true if there are no other | |||
1596 | memory references in the loop. Return false if the analysis fails. */ | |||
1597 | ||||
1598 | static bool | |||
1599 | determine_loop_nest_reuse (class loop *loop, struct mem_ref_group *refs, | |||
1600 | bool no_other_refs) | |||
1601 | { | |||
1602 | class loop *nest, *aloop; | |||
1603 | vec<data_reference_p> datarefs = vNULL; | |||
1604 | vec<ddr_p> dependences = vNULL; | |||
1605 | struct mem_ref_group *gr; | |||
1606 | struct mem_ref *ref, *refb; | |||
1607 | auto_vec<loop_p> vloops; | |||
1608 | unsigned *loop_data_size; | |||
1609 | unsigned i, j, n; | |||
1610 | unsigned volume, dist, adist; | |||
1611 | HOST_WIDE_INT vol; | |||
1612 | data_reference_p dr; | |||
1613 | ddr_p dep; | |||
1614 | ||||
1615 | if (loop->inner) | |||
1616 | return true; | |||
1617 | ||||
1618 | /* Find the outermost loop of the loop nest of loop (we require that | |||
1619 | there are no sibling loops inside the nest). */ | |||
1620 | nest = loop; | |||
1621 | while (1) | |||
1622 | { | |||
1623 | aloop = loop_outer (nest); | |||
1624 | ||||
1625 | if (aloop == current_loops->tree_root | |||
1626 | || aloop->inner->next) | |||
1627 | break; | |||
1628 | ||||
1629 | nest = aloop; | |||
1630 | } | |||
1631 | ||||
1632 | /* For each loop, determine the amount of data accessed in each iteration. | |||
1633 | We use this to estimate whether the reference is evicted from the | |||
1634 | cache before its reuse. */ | |||
1635 | find_loop_nest (nest, &vloops); | |||
1636 | n = vloops.length (); | |||
1637 | loop_data_size = XNEWVEC (unsigned, n); | |||
1638 | volume = volume_of_references (refs); | |||
1639 | i = n; | |||
1640 | while (i-- != 0) | |||
1641 | { | |||
1642 | loop_data_size[i] = volume; | |||
1643 | /* Bound the volume by the L2 cache size, since above this bound, | |||
1644 | all dependence distances are equivalent. */ | |||
1645 | if (volume > L2_CACHE_SIZE_BYTES) | |||
1646 | continue; | |||
1647 | ||||
1648 | aloop = vloops[i]; | |||
1649 | vol = estimated_stmt_executions_int (aloop); | |||
1650 | if (vol == -1) | |||
1651 | vol = expected_loop_iterations (aloop); | |||
1652 | volume *= vol; | |||
1653 | } | |||
1654 | ||||
1655 | /* Prepare the references in the form suitable for data dependence | |||
1656 | analysis. We ignore unanalyzable data references (the results | |||
1657 | are used just as a heuristic to estimate the temporality of the | |||
1658 | references, hence we do not need to worry about correctness). */ | |||
1659 | for (gr = refs; gr; gr = gr->next) | |||
1660 | for (ref = gr->refs; ref; ref = ref->next) | |||
1661 | { | |||
1662 | dr = create_data_ref (loop_preheader_edge (nest), | |||
1663 | loop_containing_stmt (ref->stmt), | |||
1664 | ref->mem, ref->stmt, !ref->write_p, false); | |||
1665 | ||||
1666 | if (dr) | |||
1667 | { | |||
1668 | ref->reuse_distance = volume; | |||
1669 | dr->aux = ref; | |||
1670 | datarefs.safe_push (dr); | |||
1671 | } | |||
1672 | else | |||
1673 | no_other_refs = false; | |||
1674 | } | |||
1675 | ||||
1676 | FOR_EACH_VEC_ELT (datarefs, i, dr) | |||
1677 | { | |||
1678 | dist = self_reuse_distance (dr, loop_data_size, n, loop); | |||
1679 | ref = (struct mem_ref *) dr->aux; | |||
1680 | if (ref->reuse_distance > dist) | |||
1681 | ref->reuse_distance = dist; | |||
1682 | ||||
1683 | if (no_other_refs) | |||
1684 | ref->independent_p = true; | |||
1685 | } | |||
1686 | ||||
1687 | if (!compute_all_dependences (datarefs, &dependences, vloops, true)) | |||
1688 | return false; | |||
1689 | ||||
1690 | FOR_EACH_VEC_ELT (dependences, i, dep) | |||
1691 | { | |||
1692 | if (DDR_ARE_DEPENDENT (dep) == chrec_known) | |||
1693 | continue; | |||
1694 | ||||
1695 | ref = (struct mem_ref *) DDR_A (dep)->aux; | |||
1696 | refb = (struct mem_ref *) DDR_B (dep)->aux; | |||
1697 | ||||
1698 | if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know | |||
1699 | || DDR_COULD_BE_INDEPENDENT_P (dep) | |||
1700 | || DDR_NUM_DIST_VECTS (dep) == 0) | |||
1701 | { | |||
1702 | /* If the dependence cannot be analyzed, assume that there might be | |||
1703 | a reuse. */ | |||
1704 | dist = 0; | |||
1705 | ||||
1706 | ref->independent_p = false; | |||
1707 | refb->independent_p = false; | |||
1708 | } | |||
1709 | else | |||
1710 | { | |||
1711 | /* The distance vectors are normalized to be always lexicographically | |||
1712 | positive, hence we cannot tell just from them whether DDR_A comes | |||
1713 | before DDR_B or vice versa. However, it is not important, | |||
1714 | anyway -- if DDR_A is close to DDR_B, then it is either reused in | |||
1715 | DDR_B (and it is not nontemporal), or it reuses the value of DDR_B | |||
1716 | in cache (and marking it as nontemporal would not affect | |||
1717 | anything). */ | |||
1718 | ||||
1719 | dist = volume; | |||
1720 | for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++) | |||
1721 | { | |||
1722 | adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j), | |||
1723 | loop_data_size, n); | |||
1724 | ||||
1725 | /* If this is a dependence in the innermost loop (i.e., the | |||
1726 | distances in all superloops are zero) and it is not | |||
1727 | the trivial self-dependence with distance zero, record that | |||
1728 | the references are not completely independent. */ | |||
1729 | if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1) | |||
1730 | && (ref != refb | |||
1731 | || DDR_DIST_VECT (dep, j)[n-1] != 0)) | |||
1732 | { | |||
1733 | ref->independent_p = false; | |||
1734 | refb->independent_p = false; | |||
1735 | } | |||
1736 | ||||
1737 | /* Ignore accesses closer than | |||
1738 | L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION, | |||
1739 | so that we use nontemporal prefetches e.g. if single memory | |||
1740 | location is accessed several times in a single iteration of | |||
1741 | the loop. */ | |||
1742 | if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION) | |||
1743 | continue; | |||
1744 | ||||
1745 | if (adist < dist) | |||
1746 | dist = adist; | |||
1747 | } | |||
1748 | } | |||
1749 | ||||
1750 | if (ref->reuse_distance > dist) | |||
1751 | ref->reuse_distance = dist; | |||
1752 | if (refb->reuse_distance > dist) | |||
1753 | refb->reuse_distance = dist; | |||
1754 | } | |||
1755 | ||||
1756 | free_dependence_relations (dependences); | |||
1757 | free_data_refs (datarefs); | |||
1758 | free (loop_data_size); | |||
1759 | ||||
1760 | if (dump_file && (dump_flags & TDF_DETAILS)) | |||
1761 | { | |||
1762 | fprintf (dump_file, "Reuse distances:\n"); | |||
1763 | for (gr = refs; gr; gr = gr->next) | |||
1764 | for (ref = gr->refs; ref; ref = ref->next) | |||
1765 | fprintf (dump_file, " reference %u:%u distance %u\n", | |||
1766 | ref->group->uid, ref->uid, ref->reuse_distance); | |||
1767 | } | |||
1768 | ||||
1769 | return true; | |||
1770 | } | |||
1771 | ||||
1772 | /* Determine whether or not the trip count to ahead ratio is too small based | |||
1773 | on profitability considerations. | |||
1774 | AHEAD: the iteration ahead distance, | |||
1775 | EST_NITER: the estimated trip count. */ | |||
1776 | ||||
1777 | static bool | |||
1778 | trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter) | |||
1779 | { | |||
1780 | /* Assume trip count to ahead ratio is big enough if the trip count could not | |||
1781 | be estimated at compile time. */ | |||
1782 | if (est_niter < 0) | |||
1783 | return false; | |||
1784 | ||||
1785 | if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead)) | |||
1786 | { | |||
1787 | if (dump_file && (dump_flags & TDF_DETAILS)) | |||
1788 | fprintf (dump_file, | |||
1789 | "Not prefetching -- loop estimated to roll only %d times\n", | |||
1790 | (int) est_niter); | |||
1791 | return true; | |||
1792 | } | |||
1793 | ||||
1794 | return false; | |||
1795 | } | |||
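| /* For example, with TRIP_COUNT_TO_AHEAD_RATIO 4 and AHEAD 5, loops whose | |||
| estimated trip count is below 20 are rejected here. */ | |||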
1796 | ||||
1797 | /* Determine whether or not the number of memory references in the loop is | |||
1798 | reasonable based on the profitability and compilation time considerations. | |||
1799 | NINSNS: estimated number of instructions in the loop, | |||
1800 | MEM_REF_COUNT: total number of memory references in the loop. */ | |||
1801 | ||||
1802 | static bool | |||
1803 | mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count) | |||
1804 | { | |||
1805 | int insn_to_mem_ratio; | |||
1806 | ||||
1807 | if (mem_ref_count == 0) | |||
1808 | return false; | |||
1809 | ||||
1810 | /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis | |||
1811 | (compute_all_dependences) have high costs based on quadratic complexity. | |||
1812 | To avoid huge compilation time, we give up prefetching if mem_ref_count | |||
1813 | is too large. */ | |||
1814 | if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP) | |||
1815 | return false; | |||
1816 | ||||
1817 | /* Prefetching improves performance by overlapping cache missing | |||
1818 | memory accesses with CPU operations. If the loop does not have | |||
1819 | enough CPU operations to overlap with memory operations, prefetching | |||
1820 | won't give a significant benefit. One approximate way of checking | |||
1821 | this is to require the ratio of instructions to memory references to | |||
1822 | be above a certain limit. This approximation works well in practice. | |||
1823 | TODO: Implement a more precise computation by estimating the time | |||
1824 | for each CPU or memory op in the loop. Time estimates for memory ops | |||
1825 | should account for cache misses. */ | |||
1826 | insn_to_mem_ratio = ninsns / mem_ref_count; | |||
1827 | ||||
1828 | if (insn_to_mem_ratio < param_prefetch_min_insn_to_mem_ratio) | |||
1829 | { | |||
1830 | if (dump_file && (dump_flags & TDF_DETAILS)) | |||
1831 | fprintf (dump_file, | |||
1832 | "Not prefetching -- instruction to memory reference ratio (%d) too small\n", | |||
1833 | insn_to_mem_ratio); | |||
1834 | return false; | |||
1835 | } | |||
1836 | ||||
1837 | return true; | |||
1838 | } | |||
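| /* For example (illustrative values only): ninsns = 20 and | |||
| mem_ref_count = 8 give insn_to_mem_ratio = 2 after integer division; | |||
| prefetching is abandoned if that falls below | |||
| param_prefetch_min_insn_to_mem_ratio. */ | |||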
1839 | ||||
1840 | /* Determine whether or not the instruction to prefetch ratio in the loop is | |||
1841 | too small based on the profitability consideration. | |||
1842 | NINSNS: estimated number of instructions in the loop, | |||
1843 | PREFETCH_COUNT: an estimate of the number of prefetches, | |||
1844 | UNROLL_FACTOR: the factor to unroll the loop if prefetching. */ | |||
1845 | ||||
1846 | static bool | |||
1847 | insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count, | |||
1848 | unsigned unroll_factor) | |||
1849 | { | |||
1850 | int insn_to_prefetch_ratio; | |||
1851 | ||||
1852 | /* Prefetching most likely causes performance degradation when the instruction | |||
1853 | to prefetch ratio is too small. Too many prefetch instructions in a loop | |||
1854 | may reduce the I-cache performance. | |||
1855 | (unroll_factor * ninsns) is used to estimate the number of instructions in | |||
1856 | the unrolled loop. This implementation is a bit simplistic -- the number | |||
1857 | of issued prefetch instructions is also affected by unrolling. So, | |||
1858 | prefetch_mod and the unroll factor should be taken into account when | |||
1859 | determining prefetch_count. Also, the number of insns of the unrolled | |||
1860 | loop will usually be significantly smaller than the number of insns of the | |||
1861 | original loop * unroll_factor (at least the induction variable increases | |||
1862 | and the exit branches will get eliminated), so it might be better to use | |||
1863 | tree_estimate_loop_size + estimated_unrolled_size. */ | |||
1864 | insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count; | |||
1865 | if (insn_to_prefetch_ratio < param_min_insn_to_prefetch_ratio) | |||
1866 | { | |||
1867 | if (dump_file && (dump_flags & TDF_DETAILS)) | |||
1868 | fprintf (dump_file, | |||
1869 | "Not prefetching -- instruction to prefetch ratio (%d) too small\n", | |||
1870 | insn_to_prefetch_ratio); | |||
1871 | return true; | |||
1872 | } | |||
1873 | ||||
1874 | return false; | |||
1875 | } | |||
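| /* For example (illustrative values only): unroll_factor = 4, ninsns = 12 | |||
| and prefetch_count = 8 give (4 * 12) / 8 = 6, which is compared against | |||
| param_min_insn_to_prefetch_ratio. */ | |||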
1876 | ||||
1877 | ||||
1878 | /* Issue prefetch instructions for array references in LOOP. Returns | |||
1879 | true if the LOOP was unrolled and updates NEED_LC_SSA_UPDATE if we need | |||
1880 | to update SSA for virtual operands and LC SSA for a split edge. */ | |||
1881 | ||||
1882 | static bool | |||
1883 | loop_prefetch_arrays (class loop *loop, bool &need_lc_ssa_update) | |||
1884 | { | |||
1885 | struct mem_ref_group *refs; | |||
1886 | unsigned ahead, ninsns, time, unroll_factor; | |||
1887 | HOST_WIDE_INT est_niter; | |||
1888 | class tree_niter_desc desc; | |||
1889 | bool unrolled = false, no_other_refs; | |||
1890 | unsigned prefetch_count; | |||
1891 | unsigned mem_ref_count; | |||
1892 | ||||
1893 | if (optimize_loop_nest_for_size_p (loop)) | |||
1894 | { | |||
1895 | if (dump_file && (dump_flags & TDF_DETAILS)) | |||
1896 | fprintf (dump_file, " ignored (cold area)\n"); | |||
1897 | return false; | |||
1898 | } | |||
1899 | ||||
1900 | /* FIXME: the time should be weighted by the probabilities of the blocks in | |||
1901 | the loop body. */ | |||
1902 | time = tree_num_loop_insns (loop, &eni_time_weights); | |||
1903 | if (time == 0) | |||
1904 | return false; | |||
1905 | ||||
1906 | ahead = (param_prefetch_latency + time - 1) / time; | |||
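| /* E.g. (illustrative values only) param_prefetch_latency = 300 and | |||
| time = 70 give ahead = (300 + 70 - 1) / 70 = 5 iterations. */ | |||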
1907 | est_niter = estimated_stmt_executions_int (loop); | |||
1908 | if (est_niter == -1) | |||
1909 | est_niter = likely_max_stmt_executions_int (loop); | |||
1910 | ||||
1911 | /* Prefetching is not likely to be profitable if the trip count to ahead | |||
1912 | ratio is too small. */ | |||
1913 | if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter)) | |||
1914 | return false; | |||
1915 | ||||
1916 | ninsns = tree_num_loop_insns (loop, &eni_size_weights); | |||
1917 | ||||
1918 | /* Step 1: gather the memory references. */ | |||
1919 | refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count); | |||
1920 | ||||
1921 | /* Give up prefetching if the number of memory references in the | |||
1922 | loop is not reasonable based on profitability and compilation time | |||
1923 | considerations. */ | |||
1924 | if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count)) | |||
1925 | goto fail; | |||
1926 | ||||
1927 | /* Step 2: estimate the reuse effects. */ | |||
1928 | prune_by_reuse (refs); | |||
1929 | ||||
1930 | if (nothing_to_prefetch_p (refs)) | |||
1931 | goto fail; | |||
1932 | ||||
1933 | if (!determine_loop_nest_reuse (loop, refs, no_other_refs)) | |||
1934 | goto fail; | |||
1935 | ||||
1936 | /* Step 3: determine unroll factor. */ | |||
1937 | unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc, | |||
1938 | est_niter); | |||
1939 | ||||
1940 | /* Estimate prefetch count for the unrolled loop. */ | |||
1941 | prefetch_count = estimate_prefetch_count (refs, unroll_factor); | |||
1942 | if (prefetch_count == 0) | |||
1943 | goto fail; | |||
1944 | ||||
1945 | if (dump_file && (dump_flags & TDF_DETAILS)) | |||
1946 | fprintf (dump_file, "Ahead %d, unroll factor %d, trip count " | |||
1947 | HOST_WIDE_INT_PRINT_DEC "\n" | |||
1948 | "insn count %d, mem ref count %d, prefetch count %d\n", | |||
1949 | ahead, unroll_factor, est_niter, | |||
1950 | ninsns, mem_ref_count, prefetch_count); | |||
1951 | ||||
1952 | /* Prefetching is not likely to be profitable if the instruction to prefetch | |||
1953 | ratio is too small. */ | |||
1954 | if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count, | |||
1955 | unroll_factor)) | |||
1956 | goto fail; | |||
1957 | ||||
1958 | need_lc_ssa_update |= mark_nontemporal_stores (loop, refs); | |||
1959 | ||||
1960 | /* Step 4: what to prefetch? */ | |||
1961 | if (!schedule_prefetches (refs, unroll_factor, ahead)) | |||
1962 | goto fail; | |||
1963 | ||||
1964 | /* Step 5: unroll the loop. TODO -- peeling of first and last few | |||
1965 | iterations so that we do not issue superfluous prefetches. */ | |||
1966 | if (unroll_factor != 1) | |||
1967 | { | |||
1968 | tree_unroll_loop (loop, unroll_factor, &desc); | |||
1969 | unrolled = true; | |||
1970 | } | |||
1971 | ||||
1972 | /* Step 6: issue the prefetches. */ | |||
1973 | issue_prefetches (refs, unroll_factor, ahead); | |||
1974 | ||||
1975 | fail: | |||
1976 | release_mem_refs (refs); | |||
1977 | return unrolled; | |||
1978 | } | |||
1979 | ||||
1980 | /* Issue prefetch instructions for array references in loops. */ | |||
1981 | ||||
1982 | unsigned int | |||
1983 | tree_ssa_prefetch_arrays (void) | |||
1984 | { | |||
1985 | bool unrolled = false; | |||
1986 | bool need_lc_ssa_update = false; | |||
1987 | int todo_flags = 0; | |||
1988 | ||||
1989 | if (!targetm.have_prefetch () | |||
1990 | /* It is possible to ask compiler for say -mtune=i486 -march=pentium4. | |||
1991 | -mtune=i486 causes us having PREFETCH_BLOCK 0, since this is part | |||
1992 | of processor costs and i486 does not have prefetch, but | |||
1993 | -march=pentium4 causes targetm.have_prefetch to be true. Ugh. */ | |||
1994 | || PREFETCH_BLOCK == 0) | |||
1995 | return 0; | |||
1996 | ||||
1997 | if (dump_file && (dump_flags & TDF_DETAILS)) | |||
1998 | { | |||
1999 | fprintf (dump_file, "Prefetching parameters:\n"); | |||
2000 | fprintf (dump_file, " simultaneous prefetches: %d\n", | |||
2001 | param_simultaneous_prefetches); | |||
2002 | fprintf (dump_file, " prefetch latency: %d\n", param_prefetch_latency); | |||
2003 | fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK); | |||
2004 | fprintf (dump_file, " L1 cache size: %d lines, %d kB\n", | |||
2005 | L1_CACHE_SIZE_BYTES / param_l1_cache_line_size, | |||
2006 | param_l1_cache_size); | |||
2007 | fprintf (dump_file, " L1 cache line size: %d\n", | |||
2008 | param_l1_cache_line_size); | |||
2009 | fprintf (dump_file, " L2 cache size: %d kB\n", param_l2_cache_size); | |||
2010 | fprintf (dump_file, " min insn-to-prefetch ratio: %d \n", | |||
2011 | param_min_insn_to_prefetch_ratio); | |||
2012 | fprintf (dump_file, " min insn-to-mem ratio: %d \n", | |||
2013 | param_prefetch_min_insn_to_mem_ratio); | |||
2014 | fprintf (dump_file, "\n"); | |||
2015 | } | |||
2016 | ||||
2017 | initialize_original_copy_tables (); | |||
2018 | ||||
2019 | if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH)) | |||
2020 | { | |||
2021 | tree type = build_function_type_list (void_type_node, | |||
2022 | const_ptr_type_node, NULL_TREE); | |||
2023 | tree decl = add_builtin_function ("__builtin_prefetch", type, | |||
2024 | BUILT_IN_PREFETCH, BUILT_IN_NORMAL, | |||
2025 | NULL, NULL_TREE); | |||
2026 | DECL_IS_NOVOPS (decl) = true; | |||
2027 | set_builtin_decl (BUILT_IN_PREFETCH, decl, false); | |||
2028 | } | |||
2029 | ||||
2030 | for (auto loop : loops_list (cfun, LI_FROM_INNERMOST)) | |||
2031 | { | |||
2032 | if (dump_file && (dump_flags & TDF_DETAILS)) | |||
2033 | fprintf (dump_file, "Processing loop %d:\n", loop->num); | |||
2034 | ||||
2035 | unrolled |= loop_prefetch_arrays (loop, need_lc_ssa_update); | |||
2036 | ||||
2037 | if (dump_file && (dump_flags & TDF_DETAILS)) | |||
2038 | fprintf (dump_file, "\n\n"); | |||
2039 | } | |||
2040 | ||||
2041 | if (need_lc_ssa_update) | |||
2042 | rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa_only_virtuals); | |||
2043 | ||||
2044 | if (unrolled) | |||
2045 | { | |||
2046 | scev_reset (); | |||
2047 | todo_flags |= TODO_cleanup_cfg; | |||
2048 | } | |||
2049 | ||||
2050 | free_original_copy_tables (); | |||
2051 | return todo_flags; | |||
2052 | } | |||
2053 | ||||
2054 | /* Prefetching. */ | |||
2055 | ||||
2056 | namespace { | |||
2057 | ||||
2058 | const pass_data pass_data_loop_prefetch = | |||
2059 | { | |||
2060 | GIMPLE_PASS, /* type */ | |||
2061 | "aprefetch", /* name */ | |||
2062 | OPTGROUP_LOOP, /* optinfo_flags */ | |||
2063 | TV_TREE_PREFETCH, /* tv_id */ | |||
2064 | ( PROP_cfg | PROP_ssa ), /* properties_required */ | |||
2065 | 0, /* properties_provided */ | |||
2066 | 0, /* properties_destroyed */ | |||
2067 | 0, /* todo_flags_start */ | |||
2068 | 0, /* todo_flags_finish */ | |||
2069 | }; | |||
2070 | ||||
2071 | class pass_loop_prefetch : public gimple_opt_pass | |||
2072 | { | |||
2073 | public: | |||
2074 | pass_loop_prefetch (gcc::context *ctxt) | |||
2075 | : gimple_opt_pass (pass_data_loop_prefetch, ctxt) | |||
2076 | {} | |||
2077 | ||||
2078 | /* opt_pass methods: */ | |||
2079 | bool gate (function *) final override | |||
2080 | { | |||
2081 | return flag_prefetch_loop_arrays > 0; | |||
2082 | } | |||
2083 | unsigned int execute (function *) final override; | |||
2084 | ||||
2085 | }; // class pass_loop_prefetch | |||
2086 | ||||
2087 | unsigned int | |||
2088 | pass_loop_prefetch::execute (function *fun) | |||
2089 | { | |||
2090 | if (number_of_loops (fun) <= 1) | |||
2091 | return 0; | |||
2092 | ||||
2093 | if ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) != 0) | |||
2094 | { | |||
2095 | static bool warned = false; | |||
2096 | ||||
2097 | if (!warned) | |||
2098 | { | |||
2099 | warning (OPT_Wdisabled_optimization, | |||
2100 | "%<l1-cache-size%> parameter is not a power of two %d", | |||
2101 | PREFETCH_BLOCK); | |||
2102 | warned = true; | |||
2103 | } | |||
2104 | return 0; | |||
2105 | } | |||
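| /* The check above rejects line sizes that are not powers of two, e.g. | |||
| 48 & 47 = 32 != 0, whereas 64 & 63 = 0. */ | |||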
2106 | ||||
2107 | return tree_ssa_prefetch_arrays (); | |||
2108 | } | |||
2109 | ||||
2110 | } // anon namespace | |||
2111 | ||||
2112 | gimple_opt_pass * | |||
2113 | make_pass_loop_prefetch (gcc::context *ctxt) | |||
2114 | { | |||
2115 | return new pass_loop_prefetch (ctxt); | |||
2116 | } | |||
2117 | ||||
2118 |