From ba5a396be51cd232d6647f70f7d792c6dcc63223 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 24 Mar 2022 18:08:38 -0700 Subject: [PATCH 001/113] tools/vm/page_owner_sort.c: sort by stacktrace before culling The contents of page_owner have changed to include more information than the stack trace. On a modern kernel, the blocks look like Page allocated via order 0, mask 0x0(), pid 1, ts 165564237 ns, free_ts 0 ns register_early_stack+0x4b/0x90 init_page_owner+0x39/0x250 kernel_init_freeable+0x11e/0x242 kernel_init+0x16/0x130 Sorting by the contents of .txt will result in almost no repeated pages, as the pid, ts, and free_ts will almost never be the same. Instead, sort by the contents of the stack trace, which we assume to be whatever is after the first line. [seanga2@gmail.com: fix NULL-pointer dereference when comparing stack traces] Link: https://lkml.kernel.org/r/20211125162653.1855958-1-seanga2@gmail.com Link: https://lkml.kernel.org/r/20211124193709.1805776-1-seanga2@gmail.com Signed-off-by: Sean Anderson Cc: Changhee Han Cc: Tang Bin Cc: Zhang Shengju Cc: Zhenliang Wei Cc: Stephen Rothwell Cc: Yinan Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/page_owner_sort.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 9ebb84a9c7310..5582d8454d3bd 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -23,6 +23,7 @@ struct block_list { char *txt; + char *stacktrace; int len; int num; int page_num; @@ -51,11 +52,11 @@ int read_block(char *buf, int buf_size, FILE *fin) return -1; /* EOF or no space left in buf. */ } -static int compare_txt(const void *p1, const void *p2) +static int compare_stacktrace(const void *p1, const void *p2) { const struct block_list *l1 = p1, *l2 = p2; - return strcmp(l1->txt, l2->txt); + return strcmp(l1->stacktrace, l2->stacktrace); } static int compare_num(const void *p1, const void *p2) @@ -121,6 +122,7 @@ static void add_list(char *buf, int len) list[list_size].page_num = get_page_num(buf); memcpy(list[list_size].txt, buf, len); list[list_size].txt[len] = 0; + list[list_size].stacktrace = strchr(list[list_size].txt, '\n') ?: ""; list_size++; if (list_size % 1000 == 0) { printf("loaded %d\r", list_size); @@ -199,7 +201,7 @@ int main(int argc, char **argv) printf("sorting ....\n"); - qsort(list, list_size, sizeof(list[0]), compare_txt); + qsort(list, list_size, sizeof(list[0]), compare_stacktrace); list2 = malloc(sizeof(*list) * list_size); if (!list2) { @@ -211,7 +213,7 @@ int main(int argc, char **argv) for (i = count = 0; i < list_size; i++) { if (count == 0 || - strcmp(list2[count-1].txt, list[i].txt) != 0) { + strcmp(list2[count-1].stacktrace, list[i].stacktrace) != 0) { list2[count++] = list[i]; } else { list2[count-1].num += list[i].num; From 82f5ebc2beb3c803bf17bead4e38c1d5bf2c8d78 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 24 Mar 2022 18:08:41 -0700 Subject: [PATCH 002/113] tools/vm/page_owner_sort.c: support sorting by stack trace This adds the ability to sort by stacktraces. This is helpful when comparing multiple dumps of page_owner taken at different times, since blocks will not be reordered if they were allocated/free'd. 
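For reference, the sorting controls added below come down to picking a comparator at option-parsing time and handing that function pointer to qsort(). A minimal, self-contained sketch of the pattern (the struct, option handling and sample data are made up for illustration; this is not code from the patch):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct rec { const char *stacktrace; int times; };

    /* Order records by their stack-trace text, like the -s path. */
    static int cmp_stacktrace(const void *a, const void *b)
    {
        const struct rec *r1 = a, *r2 = b;
        return strcmp(r1->stacktrace, r2->stacktrace);
    }

    /* Order records by how often they occurred (descending), like the default path. */
    static int cmp_times(const void *a, const void *b)
    {
        const struct rec *r1 = a, *r2 = b;
        return r2->times - r1->times;
    }

    int main(int argc, char **argv)
    {
        /* Choose the comparator once, then sort with it. */
        int (*cmp)(const void *, const void *) =
            (argc > 1 && !strcmp(argv[1], "-s")) ? cmp_stacktrace : cmp_times;
        struct rec recs[] = {
            { "prep_new_page+0x80\n get_page_from_freelist+0x924\n", 3 },
            { "__alloc_pages+0x138\n alloc_pages+0x80\n", 7 },
        };

        qsort(recs, sizeof(recs) / sizeof(recs[0]), sizeof(recs[0]), cmp);
        for (unsigned i = 0; i < sizeof(recs) / sizeof(recs[0]); i++)
            printf("%d times:\n%s\n", recs[i].times, recs[i].stacktrace);
        return 0;
    }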
Link: https://lkml.kernel.org/r/20211124193709.1805776-2-seanga2@gmail.com Signed-off-by: Sean Anderson Cc: Zhenliang Wei Cc: Changhee Han Cc: Tang Bin Cc: Zhang Shengju Cc: Stephen Rothwell Cc: Yinan Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/page_owner_sort.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 5582d8454d3bd..1b2acf02d3cd6 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -29,7 +29,6 @@ struct block_list { int page_num; }; -static int sort_by_memory; static regex_t order_pattern; static struct block_list *list; static int list_size; @@ -134,13 +133,16 @@ static void add_list(char *buf, int len) static void usage(void) { - printf("Usage: ./page_owner_sort [-m] \n" - "-m Sort by total memory. If this option is unset, sort by times\n" + printf("Usage: ./page_owner_sort [OPTIONS] \n" + "-m Sort by total memory.\n" + "-s Sort by the stack trace.\n" + "-t Sort by times (default).\n" ); } int main(int argc, char **argv) { + int (*cmp)(const void *, const void *) = compare_num; FILE *fin, *fout; char *buf; int ret, i, count; @@ -149,10 +151,16 @@ int main(int argc, char **argv) int err; int opt; - while ((opt = getopt(argc, argv, "m")) != -1) + while ((opt = getopt(argc, argv, "mst")) != -1) switch (opt) { case 'm': - sort_by_memory = 1; + cmp = compare_page_num; + break; + case 's': + cmp = compare_stacktrace; + break; + case 't': + cmp = compare_num; break; default: usage(); @@ -221,10 +229,7 @@ int main(int argc, char **argv) } } - if (sort_by_memory) - qsort(list2, count, sizeof(list[0]), compare_page_num); - else - qsort(list2, count, sizeof(list[0]), compare_num); + qsort(list2, count, sizeof(list[0]), cmp); for (i = 0; i < count; i++) fprintf(fout, "%d times, %d pages:\n%s\n", From cd75ea0e32620c622aeaef531ef7b9f59c67f7a6 Mon Sep 17 00:00:00 2001 From: Yinan Zhang Date: Thu, 24 Mar 2022 18:08:44 -0700 Subject: [PATCH 003/113] tools/vm/page_owner_sort.c: add switch between culling by stacktrace and txt Culling by comparing stacktrace would casue loss of some information. For example, if there exists 2 blocks which have the same stacktrace and the different head info Page allocated via order 0, mask 0x108c48(...), pid 73696, ts 1578829190639010 ns, free_ts 1576583851324450 ns prep_new_page+0x80/0xb8 get_page_from_freelist+0x924/0xee8 __alloc_pages+0x138/0xc18 alloc_pages+0x80/0xf0 __page_cache_alloc+0x90/0xc8 Page allocated via order 0, mask 0x108c48(...), pid 61806, ts 1354113726046100 ns, free_ts 1354104926841400 ns prep_new_page+0x80/0xb8 get_page_from_freelist+0x924/0xee8 __alloc_pages+0x138/0xc18 alloc_pages+0x80/0xf0 __page_cache_alloc+0x90/0xc8 After culling, it would be like this 2 times, 2 pages: Page allocated via order 0, mask 0x108c48(...), pid 73696, ts 1578829190639010 ns, free_ts 1576583851324450 ns prep_new_page+0x80/0xb8 get_page_from_freelist+0x924/0xee8 __alloc_pages+0x138/0xc18 alloc_pages+0x80/0xf0 __page_cache_alloc+0x90/0xc8 The info of second block missed. So, add -c to turn on culling by stacktrace. By default, it will cull by txt. 
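Whichever key is chosen, the cull step itself is only a merge of adjacent equal entries in the already-sorted array; -c effectively just changes which string is used as the key. A stand-alone sketch of that pattern, with a made-up entry type rather than the tool's block_list:

    #include <stdio.h>
    #include <string.h>

    struct entry { const char *key; int num; int page_num; };

    /* Collapse runs of entries whose key compares equal; assumes the array
     * was sorted by that same key beforehand. Returns the culled count. */
    static int cull(struct entry *e, int n)
    {
        int count = 0;

        for (int i = 0; i < n; i++) {
            if (count == 0 || strcmp(e[count - 1].key, e[i].key) != 0) {
                e[count++] = e[i];
            } else {
                e[count - 1].num += e[i].num;
                e[count - 1].page_num += e[i].page_num;
            }
        }
        return count;
    }

    int main(void)
    {
        struct entry e[] = {
            { "stack A", 1, 1 }, { "stack A", 1, 2 }, { "stack B", 1, 4 },
        };
        int i, n = cull(e, 3);

        for (i = 0; i < n; i++)
            printf("%d times, %d pages:\n%s\n", e[i].num, e[i].page_num, e[i].key);
        return 0;
    }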
Link: https://lkml.kernel.org/r/20211129145658.2491-1-zhangyinan2019@email.szu.edu.cn Signed-off-by: Yinan Zhang Cc: Changhee Han Cc: Sean Anderson Cc: Stephen Rothwell Cc: Tang Bin Cc: Zhang Shengju Cc: Zhenliang Wei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/page_owner_sort.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 1b2acf02d3cd6..492be7f752c04 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -51,6 +51,13 @@ int read_block(char *buf, int buf_size, FILE *fin) return -1; /* EOF or no space left in buf. */ } +static int compare_txt(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return strcmp(l1->txt, l2->txt); +} + static int compare_stacktrace(const void *p1, const void *p2) { const struct block_list *l1 = p1, *l2 = p2; @@ -137,12 +144,14 @@ static void usage(void) "-m Sort by total memory.\n" "-s Sort by the stack trace.\n" "-t Sort by times (default).\n" + "-c cull by comparing stacktrace instead of total block.\n" ); } int main(int argc, char **argv) { int (*cmp)(const void *, const void *) = compare_num; + int cull_st = 0; FILE *fin, *fout; char *buf; int ret, i, count; @@ -151,7 +160,7 @@ int main(int argc, char **argv) int err; int opt; - while ((opt = getopt(argc, argv, "mst")) != -1) + while ((opt = getopt(argc, argv, "mstc")) != -1) switch (opt) { case 'm': cmp = compare_page_num; @@ -162,6 +171,9 @@ int main(int argc, char **argv) case 't': cmp = compare_num; break; + case 'c': + cull_st = 1; + break; default: usage(); exit(1); @@ -209,7 +221,10 @@ int main(int argc, char **argv) printf("sorting ....\n"); - qsort(list, list_size, sizeof(list[0]), compare_stacktrace); + if (cull_st == 1) + qsort(list, list_size, sizeof(list[0]), compare_stacktrace); + else + qsort(list, list_size, sizeof(list[0]), compare_txt); list2 = malloc(sizeof(*list) * list_size); if (!list2) { @@ -219,9 +234,11 @@ int main(int argc, char **argv) printf("culling\n"); + long offset = cull_st ? &list[0].stacktrace - &list[0].txt : 0; + for (i = count = 0; i < list_size; i++) { if (count == 0 || - strcmp(list2[count-1].stacktrace, list[i].stacktrace) != 0) { + strcmp(*(&list2[count-1].txt+offset), *(&list[i].txt+offset)) != 0) { list2[count++] = list[i]; } else { list2[count-1].num += list[i].num; From 8f9c447e2e2b53c2db4fac85fc42ecada8b39e52 Mon Sep 17 00:00:00 2001 From: Chongxi Zhao Date: Thu, 24 Mar 2022 18:08:47 -0700 Subject: [PATCH 004/113] tools/vm/page_owner_sort.c: support sorting pid and time When viewing the page owner information, we expect that the information can be sorted by PID, so that we can quickly combine PID with the program to check the information together. We also expect that the information can be sorted by time. Time sorting helps to view the running status of the program according to the time interval when the program hangs up. Finally, we hope to pass the page_ owner_ Sort. C can reduce part of the output and only output the plate information whose memory has not been released, which can make us locate the problem of the program faster. Therefore, the following adjustments have been made: 1. Add the static functions search_pattern and check_regcomp to improve the cleanliness. 2. Add member attributes and their corresponding sorting methods. In terms of comparison time, int will overflow because the data of ull is too large, so the ternary operator is used 3. 
Add the -f parameter to filter out the information of blocks whose memory has not been released Link: https://lkml.kernel.org/r/20211206165653.5093-1-zhaochongxi2019@email.szu.edu.cn Signed-off-by: Chongxi Zhao Reviewed-by: Sean Anderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/page_owner_sort.c | 177 +++++++++++++++++++++++++++++++------ 1 file changed, 148 insertions(+), 29 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 492be7f752c04..c9fedc1806d50 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -20,6 +20,7 @@ #include #include #include +#include struct block_list { char *txt; @@ -27,9 +28,15 @@ struct block_list { int len; int num; int page_num; + pid_t pid; + __u64 ts_nsec; + __u64 free_ts_nsec; }; static regex_t order_pattern; +static regex_t pid_pattern; +static regex_t ts_nsec_pattern; +static regex_t free_ts_nsec_pattern; static struct block_list *list; static int list_size; static int max_size; @@ -79,34 +86,124 @@ static int compare_page_num(const void *p1, const void *p2) return l2->page_num - l1->page_num; } -static int get_page_num(char *buf) +static int compare_pid(const void *p1, const void *p2) { - int err, val_len, order_val; - char order_str[4] = {0}; - char *endptr; + const struct block_list *l1 = p1, *l2 = p2; + + return l1->pid - l2->pid; +} + +static int compare_ts(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->ts_nsec < l2->ts_nsec ? -1 : 1; +} + +static int compare_free_ts(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->free_ts_nsec < l2->free_ts_nsec ? -1 : 1; +} + +static int search_pattern(regex_t *pattern, char *pattern_str, char *buf) +{ + int err, val_len; regmatch_t pmatch[2]; - err = regexec(&order_pattern, buf, 2, pmatch, REG_NOTBOL); + err = regexec(pattern, buf, 2, pmatch, REG_NOTBOL); if (err != 0 || pmatch[1].rm_so == -1) { - printf("no order pattern in %s\n", buf); - return 0; + printf("no matching pattern in %s\n", buf); + return -1; } val_len = pmatch[1].rm_eo - pmatch[1].rm_so; - if (val_len > 2) /* max_order should not exceed 2 digits */ - goto wrong_order; - memcpy(order_str, buf + pmatch[1].rm_so, val_len); + memcpy(pattern_str, buf + pmatch[1].rm_so, val_len); + + return 0; +} + +static void check_regcomp(regex_t *pattern, const char *regex) +{ + int err; + + err = regcomp(pattern, regex, REG_EXTENDED | REG_NEWLINE); + if (err != 0 || pattern->re_nsub != 1) { + printf("Invalid pattern %s code %d\n", regex, err); + exit(1); + } +} + +# define FIELD_BUFF 25 + +static int get_page_num(char *buf) +{ + int order_val; + char order_str[FIELD_BUFF] = {0}; + char *endptr; + search_pattern(&order_pattern, order_str, buf); errno = 0; order_val = strtol(order_str, &endptr, 10); - if (errno != 0 || endptr == order_str || *endptr != '\0') - goto wrong_order; + if (order_val > 64 || errno != 0 || endptr == order_str || *endptr != '\0') { + printf("wrong order in follow buf:\n%s\n", buf); + return 0; + } return 1 << order_val; +} -wrong_order: - printf("wrong order in follow buf:\n%s\n", buf); - return 0; +static pid_t get_pid(char *buf) +{ + pid_t pid; + char pid_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&pid_pattern, pid_str, buf); + errno = 0; + pid = strtol(pid_str, &endptr, 10); + if (errno != 0 || endptr == pid_str || *endptr != '\0') { + printf("wrong/invalid pid in follow buf:\n%s\n", buf); + return -1; + } + + return pid; + +} + +static __u64 
get_ts_nsec(char *buf) +{ + __u64 ts_nsec; + char ts_nsec_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&ts_nsec_pattern, ts_nsec_str, buf); + errno = 0; + ts_nsec = strtoull(ts_nsec_str, &endptr, 10); + if (errno != 0 || endptr == ts_nsec_str || *endptr != '\0') { + printf("wrong ts_nsec in follow buf:\n%s\n", buf); + return -1; + } + + return ts_nsec; +} + +static __u64 get_free_ts_nsec(char *buf) +{ + __u64 free_ts_nsec; + char free_ts_nsec_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&free_ts_nsec_pattern, free_ts_nsec_str, buf); + errno = 0; + free_ts_nsec = strtoull(free_ts_nsec_str, &endptr, 10); + if (errno != 0 || endptr == free_ts_nsec_str || *endptr != '\0') { + printf("wrong free_ts_nsec in follow buf:\n%s\n", buf); + return -1; + } + + return free_ts_nsec; } static void add_list(char *buf, int len) @@ -129,6 +226,11 @@ static void add_list(char *buf, int len) memcpy(list[list_size].txt, buf, len); list[list_size].txt[len] = 0; list[list_size].stacktrace = strchr(list[list_size].txt, '\n') ?: ""; + list[list_size].pid = get_pid(buf); + list[list_size].ts_nsec = get_ts_nsec(buf); + list[list_size].free_ts_nsec = get_free_ts_nsec(buf); + memcpy(list[list_size].txt, buf, len); + list[list_size].txt[len] = 0; list_size++; if (list_size % 1000 == 0) { printf("loaded %d\r", list_size); @@ -144,6 +246,9 @@ static void usage(void) "-m Sort by total memory.\n" "-s Sort by the stack trace.\n" "-t Sort by times (default).\n" + "-p Sort by pid.\n" + "-a Sort by memory allocate time.\n" + "-r Sort by memory release time.\n" "-c cull by comparing stacktrace instead of total block.\n" ); } @@ -152,28 +257,40 @@ int main(int argc, char **argv) { int (*cmp)(const void *, const void *) = compare_num; int cull_st = 0; + int filter = 0; FILE *fin, *fout; char *buf; int ret, i, count; struct block_list *list2; struct stat st; - int err; int opt; - while ((opt = getopt(argc, argv, "mstc")) != -1) + while ((opt = getopt(argc, argv, "acfmprst")) != -1) switch (opt) { + case 'a': + cmp = compare_ts; + break; + case 'c': + cull_st = 1; + break; + case 'f': + filter = 1; + break; case 'm': cmp = compare_page_num; break; + case 'p': + cmp = compare_pid; + break; + case 'r': + cmp = compare_free_ts; + break; case 's': cmp = compare_stacktrace; break; case 't': cmp = compare_num; break; - case 'c': - cull_st = 1; - break; default: usage(); exit(1); @@ -192,13 +309,10 @@ int main(int argc, char **argv) exit(1); } - err = regcomp(&order_pattern, "order\\s*([0-9]*),", REG_EXTENDED|REG_NEWLINE); - if (err != 0 || order_pattern.re_nsub != 1) { - printf("%s: Invalid pattern 'order\\s*([0-9]*),' code %d\n", - argv[0], err); - exit(1); - } - + check_regcomp(&order_pattern, "order\\s*([0-9]*),"); + check_regcomp(&pid_pattern, "pid\\s*([0-9]*),"); + check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,"); + check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns"); fstat(fileno(fin), &st); max_size = st.st_size / 100; /* hack ... 
*/ @@ -248,10 +362,15 @@ int main(int argc, char **argv) qsort(list2, count, sizeof(list[0]), cmp); - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { + if (filter == 1 && list2[i].free_ts_nsec != 0) + continue; fprintf(fout, "%d times, %d pages:\n%s\n", list2[i].num, list2[i].page_num, list2[i].txt); - + } regfree(&order_pattern); + regfree(&pid_pattern); + regfree(&ts_nsec_pattern); + regfree(&free_ts_nsec_pattern); return 0; } From e7a3f6776905b4e0b61692add3d0808315379c89 Mon Sep 17 00:00:00 2001 From: Shenghong Han Date: Thu, 24 Mar 2022 18:08:50 -0700 Subject: [PATCH 005/113] tools/vm/page_owner_sort.c: two trivial fixes 1) There is an unused variable. It's better to delete it. 2) One case is missing in the usage(). Link: https://lkml.kernel.org/r/20211213164518.2461-1-hanshenghong2019@email.szu.edu.cn Signed-off-by: Shenghong Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/page_owner_sort.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index c9fedc1806d50..284a5070402c3 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -41,8 +41,6 @@ static struct block_list *list; static int list_size; static int max_size; -struct block_list *block_head; - int read_block(char *buf, int buf_size, FILE *fin) { char *curr = buf, *const buf_end = buf + buf_size; @@ -249,7 +247,8 @@ static void usage(void) "-p Sort by pid.\n" "-a Sort by memory allocate time.\n" "-r Sort by memory release time.\n" - "-c cull by comparing stacktrace instead of total block.\n" + "-c Cull by comparing stacktrace instead of total block.\n" + "-f Filter out the information of blocks whose memory has not been released.\n" ); } From 41ed64347b5db8f6c9359d4f87f768db1a83bd79 Mon Sep 17 00:00:00 2001 From: Yixuan Cao Date: Thu, 24 Mar 2022 18:08:53 -0700 Subject: [PATCH 006/113] tools/vm/page_owner_sort.c: delete invalid duplicate code I noticed that there is two invalid lines of duplicate code. It's better to delete it. Link: https://lkml.kernel.org/r/20211213095743.3630-1-caoyixuan2019@email.szu.edu.cn Signed-off-by: Yixuan Cao Cc: Mark Brown Cc: Sean Anderson Cc: Zhenliang Wei Cc: Tang Bin Cc: Yinan Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/page_owner_sort.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 284a5070402c3..c8ec2d6b314de 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -227,8 +227,6 @@ static void add_list(char *buf, int len) list[list_size].pid = get_pid(buf); list[list_size].ts_nsec = get_ts_nsec(buf); list[list_size].free_ts_nsec = get_free_ts_nsec(buf); - memcpy(list[list_size].txt, buf, len); - list[list_size].txt[len] = 0; list_size++; if (list_size % 1000 == 0) { printf("loaded %d\r", list_size); From 57f2b54a937987847e666aaf56d207aa457adee6 Mon Sep 17 00:00:00 2001 From: Shenghong Han Date: Thu, 24 Mar 2022 18:08:56 -0700 Subject: [PATCH 007/113] Documentation/vm/page_owner.rst: update the documentation Update the documentation of ``page_owner``. 
[akpm@linux-foundation.org: small grammatical tweaks] Link: https://lkml.kernel.org/r/20211214134736.2569-1-hanshenghong2019@email.szu.edu.cn Signed-off-by: Shenghong Han Cc: Jonathan Corbet Cc: Vlastimil Babka Cc: Georgi Djakov Cc: Liam Mark Cc: Tang Bin Cc: Zhang Shengju Cc: Zhenliang Wei Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page_owner.rst | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 905555e3e4836..6420097a1f7d1 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -97,7 +97,7 @@ Usage The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows in buf, uses regexp to extract the page order value, counts the times - and pages of buf, and finally sorts them according to the times. + and pages of buf, and finally sorts them according to the parameter(s). See the result about who allocated each page in the ``sorted_page_owner.txt``. General output:: @@ -107,4 +107,23 @@ Usage // Detailed stack By default, ``page_owner_sort`` is sorted according to the times of buf. - If you want to sort by the pages nums of buf, use the ``-m`` parameter. + If you want to sort by the page nums of buf, use the ``-m`` parameter. + The detailed parameters are: + + fundamental function: + + Sort: + -a Sort by memory allocation time. + -m Sort by total memory. + -p Sort by pid. + -r Sort by memory release time. + -s Sort by stack trace. + -t Sort by times (default). + + additional function: + + Cull: + -c Cull by comparing stacktrace instead of total block. + + Filter: + -f Filter out the information of blocks whose memory has not been released. From 2e9449856b94bb89bc826695bf8d6d21547e7209 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 24 Mar 2022 18:08:59 -0700 Subject: [PATCH 008/113] Documentation/vm/page_owner.rst: fix unexpected indentation warns Fix Unexpected indentation warns in page_owner: Documentation/vm/page_owner.rst:92: WARNING: Unexpected indentation. Documentation/vm/page_owner.rst:96: WARNING: Unexpected indentation. Documentation/vm/page_owner.rst:107: WARNING: Unexpected indentation. Link: https://lkml.kernel.org/r/20211215001929.47866-1-skhan@linuxfoundation.org Signed-off-by: Shuah Khan Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page_owner.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 6420097a1f7d1..6591e518819f2 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -89,11 +89,11 @@ Usage Page allocated via order XXX, ... PFN XXX ... - // Detailed stack + // Detailed stack Page allocated via order XXX, ... PFN XXX ... - // Detailed stack + // Detailed stack The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows in buf, uses regexp to extract the page order value, counts the times @@ -104,7 +104,7 @@ Usage XXX times, XXX pages: Page allocated via order XXX, ... - // Detailed stack + // Detailed stack By default, ``page_owner_sort`` is sorted according to the times of buf. If you want to sort by the page nums of buf, use the ``-m`` parameter. 
From ef62c8ff1de437ec2e0582205c1293c7dbbc4484 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 24 Mar 2022 18:09:02 -0700 Subject: [PATCH 009/113] lib/vsprintf: avoid redundant work with 0 size Patch series "mm/page_owner: Extend page_owner to show memcg information", v4. While debugging the constant increase in percpu memory consumption on a system that spawned large number of containers, it was found that a lot of offline mem_cgroup structures remained in place without being freed. Further investigation indicated that those mem_cgroup structures were pinned by some pages. In order to find out what those pages are, the existing page_owner debugging tool is extended to show memory cgroup information and whether those memcgs are offline or not. With the enhanced page_owner tool, the following is a typical page that pinned the mem_cgroup structure in my test case: Page allocated via order 0, mask 0x1100cca(GFP_HIGHUSER_MOVABLE), pid 162970 (podman), ts 1097761405537 ns, free_ts 1097760838089 ns PFN 1925700 type Movable Block 3761 type Movable Flags 0x17ffffc00c001c(uptodate|dirty|lru|reclaim|swapbacked|node=0|zone=2|lastcpupid=0x1fffff) prep_new_page+0xac/0xe0 get_page_from_freelist+0x1327/0x14d0 __alloc_pages+0x191/0x340 alloc_pages_vma+0x84/0x250 shmem_alloc_page+0x3f/0x90 shmem_alloc_and_acct_page+0x76/0x1c0 shmem_getpage_gfp+0x281/0x940 shmem_write_begin+0x36/0xe0 generic_perform_write+0xed/0x1d0 __generic_file_write_iter+0xdc/0x1b0 generic_file_write_iter+0x5d/0xb0 new_sync_write+0x11f/0x1b0 vfs_write+0x1ba/0x2a0 ksys_write+0x59/0xd0 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae Charged to offline memcg libpod-conmon-15e4f9c758422306b73b2dd99f9d50a5ea53cbb16b4a13a2c2308a4253cc0ec8. So the page was not freed because it was part of a shmem segment. That is useful information that can help users to diagnose similar problems. With cgroup v1, /proc/cgroups can be read to find out the total number of memory cgroups (online + offline). With cgroup v2, the cgroup.stat of the root cgroup can be read to find the number of dying cgroups (most likely pinned by dying memcgs). The page_owner feature is not supposed to be enabled for production system due to its memory overhead. However, if it is suspected that dying memcgs are increasing over time, a test environment with page_owner enabled can then be set up with appropriate workload for further analysis on what may be causing the increasing number of dying memcgs. This patch (of 4): For *scnprintf(), vsnprintf() is always called even if the input size is 0. That is a waste of time, so just return 0 in this case. Note that vsnprintf() will never return -1 to indicate an error. So skipping the call to vsnprintf() when size is 0 will have no functional impact at all. 
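The property the following patch builds on can be reproduced in user space: snprintf() returns the length the output would have needed, while an scnprintf()-style helper returns what was actually written, so chained appends can never push the running offset past the buffer. A hedged sketch (my_scnprintf below is a user-space stand-in, not the kernel's scnprintf):

    #include <stdarg.h>
    #include <stdio.h>

    /* Imitate scnprintf(): never return more than size - 1, and return 0
     * when size is 0, mirroring the change this patch makes to vscnprintf(). */
    static int my_scnprintf(char *buf, size_t size, const char *fmt, ...)
    {
        va_list args;
        int i;

        if (size == 0)
            return 0;
        va_start(args, fmt);
        i = vsnprintf(buf, size, fmt, args);
        va_end(args);
        return i < (int)size ? i : (int)(size - 1);
    }

    int main(void)
    {
        char buf[16];
        int ret = 0;

        /* Appends can be chained without an overrun check after each one:
         * once the buffer is full, later calls write little or nothing. */
        ret += my_scnprintf(buf + ret, sizeof(buf) - ret, "order %u, ", 3u);
        ret += my_scnprintf(buf + ret, sizeof(buf) - ret, "pid %d\n", 4242);
        printf("wrote %d bytes: %s\n", ret, buf);
        return 0;
    }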
Link: https://lkml.kernel.org/r/20220202203036.744010-1-longman@redhat.com Link: https://lkml.kernel.org/r/20220202203036.744010-2-longman@redhat.com Signed-off-by: Waiman Long Acked-by: David Rientjes Reviewed-by: Sergey Senozhatsky Acked-by: Roman Gushchin Acked-by: Rafael Aquini Acked-by: Mike Rapoport Cc: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Petr Mladek Cc: Steven Rostedt (Google) Cc: Andy Shevchenko Cc: Rasmus Villemoes Cc: Ira Weiny Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 53fe73a48adfe..40d26a07a1331 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -2906,13 +2906,15 @@ int vscnprintf(char *buf, size_t size, const char *fmt, va_list args) { int i; + if (unlikely(!size)) + return 0; + i = vsnprintf(buf, size, fmt, args); if (likely(i < size)) return i; - if (size != 0) - return size - 1; - return 0; + + return size - 1; } EXPORT_SYMBOL(vscnprintf); From 3ebc439761273274ea00258da84d997841f01e72 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 24 Mar 2022 18:09:05 -0700 Subject: [PATCH 010/113] mm/page_owner: use scnprintf() to avoid excessive buffer overrun check The snprintf() function can return a length greater than the given input size. That will require a check for buffer overrun after each invocation of snprintf(). scnprintf(), on the other hand, will never return a greater length. By using scnprintf() in selected places, we can avoid some buffer overrun checks except after stack_depot_snprint() and after the last snprintf(). Link: https://lkml.kernel.org/r/20220202203036.744010-3-longman@redhat.com Signed-off-by: Waiman Long Acked-by: David Rientjes Reviewed-by: Sergey Senozhatsky Acked-by: Rafael Aquini Acked-by: Mike Rapoport Cc: Andy Shevchenko Cc: Ira Weiny Cc: Johannes Weiner Cc: Michal Hocko Cc: Petr Mladek Cc: Rasmus Villemoes Cc: Roman Gushchin Cc: Steven Rostedt (Google) Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 99e360df94652..28dac73e0542d 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -338,19 +338,16 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, if (!kbuf) return -ENOMEM; - ret = snprintf(kbuf, count, + ret = scnprintf(kbuf, count, "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n", page_owner->order, page_owner->gfp_mask, &page_owner->gfp_mask, page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec); - if (ret >= count) - goto err; - /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); page_mt = gfp_migratetype(page_owner->gfp_mask); - ret += snprintf(kbuf + ret, count - ret, + ret += scnprintf(kbuf + ret, count - ret, "PFN %lu type %s Block %lu type %s Flags %pGp\n", pfn, migratetype_names[page_mt], @@ -358,19 +355,14 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, migratetype_names[pageblock_mt], &page->flags); - if (ret >= count) - goto err; - ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0); if (ret >= count) goto err; if (page_owner->last_migrate_reason != -1) { - ret += snprintf(kbuf + ret, count - ret, + ret += scnprintf(kbuf + ret, count - ret, "Page has been migrated, last migrate reason: %s\n", 
migrate_reason_names[page_owner->last_migrate_reason]); - if (ret >= count) - goto err; } ret += snprintf(kbuf + ret, count - ret, "\n"); From fcf8935832b86d3437f00e732c6d0d4d2819d6a9 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 24 Mar 2022 18:09:08 -0700 Subject: [PATCH 011/113] mm/page_owner: print memcg information It was found that a number of offline memcgs were not freed because they were pinned by some charged pages that were present. Even "echo 1 > /proc/sys/vm/drop_caches" wasn't able to free those pages. These offline but not freed memcgs tend to increase in number over time with the side effect that percpu memory consumption as shown in /proc/meminfo also increases over time. In order to find out more information about those pages that pin offline memcgs, the page_owner feature is extended to print memory cgroup information especially whether the cgroup is offline or not. RCU read lock is taken when memcg is being accessed to make sure that it won't be freed. Link: https://lkml.kernel.org/r/20220202203036.744010-4-longman@redhat.com Signed-off-by: Waiman Long Acked-by: David Rientjes Acked-by: Roman Gushchin Acked-by: Rafael Aquini Acked-by: Mike Rapoport Cc: Roman Gushchin Cc: Andy Shevchenko Cc: Ira Weiny Cc: Johannes Weiner Cc: Michal Hocko Cc: Petr Mladek Cc: Rasmus Villemoes Cc: Sergey Senozhatsky Cc: Steven Rostedt (Google) Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/mm/page_owner.c b/mm/page_owner.c index 28dac73e0542d..f7820357e4d4c 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "internal.h" @@ -325,6 +326,45 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, seq_putc(m, '\n'); } +/* + * Looking for memcg information and print it out + */ +static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret, + struct page *page) +{ +#ifdef CONFIG_MEMCG + unsigned long memcg_data; + struct mem_cgroup *memcg; + bool online; + char name[80]; + + rcu_read_lock(); + memcg_data = READ_ONCE(page->memcg_data); + if (!memcg_data) + goto out_unlock; + + if (memcg_data & MEMCG_DATA_OBJCGS) + ret += scnprintf(kbuf + ret, count - ret, + "Slab cache page\n"); + + memcg = page_memcg_check(page); + if (!memcg) + goto out_unlock; + + online = (memcg->css.flags & CSS_ONLINE); + cgroup_name(memcg->css.cgroup, name, sizeof(name)); + ret += scnprintf(kbuf + ret, count - ret, + "Charged %sto %smemcg %s\n", + PageMemcgKmem(page) ? "(via objcg) " : "", + online ? "" : "offline ", + name); +out_unlock: + rcu_read_unlock(); +#endif /* CONFIG_MEMCG */ + + return ret; +} + static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, struct page *page, struct page_owner *page_owner, @@ -365,6 +405,8 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, migrate_reason_names[page_owner->last_migrate_reason]); } + ret = print_page_owner_memcg(kbuf, count, ret, page); + ret += snprintf(kbuf + ret, count - ret, "\n"); if (ret >= count) goto err; From 865ed6a3278654ce4a55eb74c5283eeb82ad4699 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 24 Mar 2022 18:09:11 -0700 Subject: [PATCH 012/113] mm/page_owner: record task command name The page_owner information currently includes the pid of the calling task. That is useful as long as the task is still running. Otherwise, the number is meaningless. 
To have more information about the allocating tasks that had exited by the time the page_owner information is retrieved, we need to store the command name of the task. Add a new comm field into page_owner structure to store the command name and display it when the page_owner information is retrieved. Link: https://lkml.kernel.org/r/20220202203036.744010-5-longman@redhat.com Signed-off-by: Waiman Long Acked-by: Rafael Aquini Cc: Andy Shevchenko Cc: David Rientjes Cc: Ira Weiny Cc: Johannes Weiner Cc: Michal Hocko Cc: Mike Rapoport Cc: Petr Mladek Cc: Rasmus Villemoes Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Steven Rostedt (Google) Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index f7820357e4d4c..d56afa9c792ed 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -29,6 +29,7 @@ struct page_owner { depot_stack_handle_t free_handle; u64 ts_nsec; u64 free_ts_nsec; + char comm[TASK_COMM_LEN]; pid_t pid; }; @@ -165,6 +166,8 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext, page_owner->last_migrate_reason = -1; page_owner->pid = current->pid; page_owner->ts_nsec = local_clock(); + strlcpy(page_owner->comm, current->comm, + sizeof(page_owner->comm)); __set_bit(PAGE_EXT_OWNER, &page_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); @@ -232,6 +235,7 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) new_page_owner->pid = old_page_owner->pid; new_page_owner->ts_nsec = old_page_owner->ts_nsec; new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; + strcpy(new_page_owner->comm, old_page_owner->comm); /* * We don't clear the bit on the old folio as it's going to be freed @@ -379,10 +383,11 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = scnprintf(kbuf, count, - "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n", + "Page allocated via order %u, mask %#x(%pGg), pid %d (%s), ts %llu ns, free_ts %llu ns\n", page_owner->order, page_owner->gfp_mask, &page_owner->gfp_mask, page_owner->pid, - page_owner->ts_nsec, page_owner->free_ts_nsec); + page_owner->comm, page_owner->ts_nsec, + page_owner->free_ts_nsec); /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); @@ -449,9 +454,10 @@ void __dump_page_owner(const struct page *page) else pr_alert("page_owner tracks the page as freed\n"); - pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu, free_ts %llu\n", + pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d (%s), ts %llu, free_ts %llu\n", page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask, - page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec); + page_owner->pid, page_owner->comm, page_owner->ts_nsec, + page_owner->free_ts_nsec); handle = READ_ONCE(page_owner->handle); if (!handle) From bf215eab785a30756ea1e53b62a5638d1177a795 Mon Sep 17 00:00:00 2001 From: Yixuan Cao Date: Thu, 24 Mar 2022 18:09:14 -0700 Subject: [PATCH 013/113] mm/page_owner.c: record tgid In a single-threaded process, the pid in kernel task_struct is the same as the tgid, which can mark the process of page allocation. But in a multithreaded process, only the task_struct of the thread leader has the same pid as tgid, and the pids of other threads are different from tgid. 
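That distinction is easy to observe from user space; a short illustration (build with -pthread; it calls SYS_gettid directly so it does not depend on a recent glibc):

    #define _GNU_SOURCE
    #include <pthread.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Print the thread-group id (what getpid() reports) and the per-thread id. */
    static void *show_ids(void *name)
    {
        printf("%s: tgid=%d tid=%ld\n", (char *)name, getpid(),
               (long)syscall(SYS_gettid));
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        show_ids("leader");                           /* tid == tgid */
        pthread_create(&t, NULL, show_ids, "worker"); /* tid != tgid */
        pthread_join(t, NULL);
        return 0;
    }

Pages allocated from the worker thread are recorded with that thread's pid, which is not the process id a tool like ps shows.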
Therefore, tgid is recorded to provide effective information for debugging and data statistics of multithreaded programs. This can also be achieved by observing the task name (executable file name) for a specific process. However, when the same program is started multiple times, the task name is the same and the tgid is different. Therefore, in the debugging of multi-threaded programs, combined with the task name and tgid, more accurate runtime information of a certain run of the program can be obtained. Link: https://lkml.kernel.org/r/20220219180450.2399-1-caoyixuan2019@email.szu.edu.cn Signed-off-by: Yixuan Cao Cc: Waiman Long Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index d56afa9c792ed..fb3a05fdebdbf 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -31,6 +31,7 @@ struct page_owner { u64 free_ts_nsec; char comm[TASK_COMM_LEN]; pid_t pid; + pid_t tgid; }; static bool page_owner_enabled = false; @@ -165,6 +166,7 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext, page_owner->gfp_mask = gfp_mask; page_owner->last_migrate_reason = -1; page_owner->pid = current->pid; + page_owner->tgid = current->tgid; page_owner->ts_nsec = local_clock(); strlcpy(page_owner->comm, current->comm, sizeof(page_owner->comm)); @@ -233,6 +235,7 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) old_page_owner->last_migrate_reason; new_page_owner->handle = old_page_owner->handle; new_page_owner->pid = old_page_owner->pid; + new_page_owner->tgid = old_page_owner->tgid; new_page_owner->ts_nsec = old_page_owner->ts_nsec; new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; strcpy(new_page_owner->comm, old_page_owner->comm); @@ -383,11 +386,11 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = scnprintf(kbuf, count, - "Page allocated via order %u, mask %#x(%pGg), pid %d (%s), ts %llu ns, free_ts %llu ns\n", + "Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns, free_ts %llu ns\n", page_owner->order, page_owner->gfp_mask, &page_owner->gfp_mask, page_owner->pid, - page_owner->comm, page_owner->ts_nsec, - page_owner->free_ts_nsec); + page_owner->tgid, page_owner->comm, + page_owner->ts_nsec, page_owner->free_ts_nsec); /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); @@ -454,10 +457,10 @@ void __dump_page_owner(const struct page *page) else pr_alert("page_owner tracks the page as freed\n"); - pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d (%s), ts %llu, free_ts %llu\n", + pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu, free_ts %llu\n", page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask, - page_owner->pid, page_owner->comm, page_owner->ts_nsec, - page_owner->free_ts_nsec); + page_owner->pid, page_owner->tgid, page_owner->comm, + page_owner->ts_nsec, page_owner->free_ts_nsec); handle = READ_ONCE(page_owner->handle); if (!handle) From 49e495a015e9cc127728b0f353a15d16da138fb8 Mon Sep 17 00:00:00 2001 From: Yixuan Cao Date: Thu, 24 Mar 2022 18:09:17 -0700 Subject: [PATCH 014/113] tools/vm/page_owner_sort.c: fix the instructions for use I noticed a discrepancy between the usage method and the code logic. 
If we enable the -f option, it should be "Filter out the information of blocks whose memory has been released". Link: https://lkml.kernel.org/r/20220219143106.2805-1-caoyixuan2019@email.szu.edu.cn Signed-off-by: Yixuan Cao Cc: Stephen Rothwell Cc: Sean Anderson Cc: Muchun Song Cc: Zhenliang Wei Cc: Tang Bin Cc: Yinan Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/page_owner_sort.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index c8ec2d6b314de..79d69c3b84edb 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -246,7 +246,7 @@ static void usage(void) "-a Sort by memory allocate time.\n" "-r Sort by memory release time.\n" "-c Cull by comparing stacktrace instead of total block.\n" - "-f Filter out the information of blocks whose memory has not been released.\n" + "-f Filter out the information of blocks whose memory has been released.\n" ); } From 59d7cb27d528eae2f060d548dcd14d292b19fda3 Mon Sep 17 00:00:00 2001 From: Jiajian Ye Date: Thu, 24 Mar 2022 18:09:20 -0700 Subject: [PATCH 015/113] tools/vm/page_owner_sort.c: fix comments Two adjustments are made: 1. Correct a grammatical error: replace the "what" in "Do the job what you want to debug" with "that". 2. Replace "has not been" with "has been" in the description of the -f option: According to Commit b1c9ba071e7d ("tools/vm/page_owner_sort.c: fix the instructions for use"), the description of the "-f" option is "Filter out the information of blocks whose memory has been released." Link: https://lkml.kernel.org/r/20220301151438.166118-1-yejiajian2018@email.szu.edu.cn Signed-off-by: Jiajian Ye Cc: Stephen Rothwell Cc: Yinan Zhang Cc: Yixuan Cao Cc: Zhenliang Wei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page_owner.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 6591e518819f2..2ddb632d847b7 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -78,7 +78,7 @@ Usage 2) Enable page owner: add "page_owner=on" to boot cmdline. -3) Do the job what you want to debug +3) Do the job that you want to debug. 4) Analyze information from page owner:: @@ -126,4 +126,4 @@ Usage -c Cull by comparing stacktrace instead of total block. Filter: - -f Filter out the information of blocks whose memory has not been released. + -f Filter out the information of blocks whose memory has been released. From 56465a38305f22bca3469c2738d7320a0c333e72 Mon Sep 17 00:00:00 2001 From: Jiajian Ye Date: Thu, 24 Mar 2022 18:09:23 -0700 Subject: [PATCH 016/113] tools/vm/page_owner_sort.c: add a security check Add a security check after using malloc() to allocate memory. 
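The same effect is often wrapped in a small checked-allocation helper so the test is not repeated at every call site; a sketch of that pattern (the xmalloc name is hypothetical, not something the tool defines):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Allocate or exit: either returns usable memory or terminates loudly. */
    static void *xmalloc(size_t size)
    {
        void *p = malloc(size);

        if (!p) {
            fprintf(stderr, "Out of memory (wanted %zu bytes)\n", size);
            exit(1);
        }
        return p;
    }

    int main(void)
    {
        char *txt = xmalloc(32);

        strcpy(txt, "Page allocated via order 0");
        puts(txt);
        free(txt);
        return 0;
    }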
Link: https://lkml.kernel.org/r/20220301151438.166118-2-yejiajian2018@email.szu.edu.cn Signed-off-by: Jiajian Ye Cc: Stephen Rothwell Cc: Yinan Zhang Cc: Yixuan Cao Cc: Zhenliang Wei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/page_owner_sort.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 79d69c3b84edb..69fb6ca7c0b7f 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -217,7 +217,13 @@ static void add_list(char *buf, int len) printf("max_size too small??\n"); exit(1); } + list[list_size].txt = malloc(len+1); + if (!list[list_size].txt) { + printf("Out of memory\n"); + exit(1); + } + list[list_size].len = len; list[list_size].num = 1; list[list_size].page_num = get_page_num(buf); From cf3c2c8678a0b21052d00b64d7a5903f3b1d1197 Mon Sep 17 00:00:00 2001 From: Jiajian Ye Date: Thu, 24 Mar 2022 18:09:26 -0700 Subject: [PATCH 017/113] tools/vm/page_owner_sort.c: support sorting by tgid and update documentation When the "page owner" information is read, the information sorted by TGID is expected. As a result, the following adjustments have been made: 1. Add a new -P option to sort the information of blocks by TGID in ascending order. 2. Adjust the order of member variables in block_list strust to avoid one 4 byte hole. 3. Add -P option explanation in the document. Link: https://lkml.kernel.org/r/20220301151438.166118-3-yejiajian2018@email.szu.edu.cn Signed-off-by: Jiajian Ye Cc: Stephen Rothwell Cc: Yixuan Cao Cc: Zhenliang Wei Cc: Yinan Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page_owner.rst | 1 + tools/vm/page_owner_sort.c | 40 ++++++++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 2ddb632d847b7..941543a797fe9 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -116,6 +116,7 @@ Usage -a Sort by memory allocation time. -m Sort by total memory. -p Sort by pid. + -P Sort by tgid. -r Sort by memory release time. -s Sort by stack trace. -t Sort by times (default). 
diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 69fb6ca7c0b7f..d166f2f900eb1 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -25,16 +25,18 @@ struct block_list { char *txt; char *stacktrace; + __u64 ts_nsec; + __u64 free_ts_nsec; int len; int num; int page_num; pid_t pid; - __u64 ts_nsec; - __u64 free_ts_nsec; + pid_t tgid; }; static regex_t order_pattern; static regex_t pid_pattern; +static regex_t tgid_pattern; static regex_t ts_nsec_pattern; static regex_t free_ts_nsec_pattern; static struct block_list *list; @@ -91,6 +93,13 @@ static int compare_pid(const void *p1, const void *p2) return l1->pid - l2->pid; } +static int compare_tgid(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->tgid - l2->tgid; +} + static int compare_ts(const void *p1, const void *p2) { const struct block_list *l1 = p1, *l2 = p2; @@ -170,6 +179,24 @@ static pid_t get_pid(char *buf) } +static pid_t get_tgid(char *buf) +{ + pid_t tgid; + char tgid_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&tgid_pattern, tgid_str, buf); + errno = 0; + tgid = strtol(tgid_str, &endptr, 10); + if (errno != 0 || endptr == tgid_str || *endptr != '\0') { + printf("wrong/invalid tgid in follow buf:\n%s\n", buf); + return -1; + } + + return tgid; + +} + static __u64 get_ts_nsec(char *buf) { __u64 ts_nsec; @@ -231,6 +258,7 @@ static void add_list(char *buf, int len) list[list_size].txt[len] = 0; list[list_size].stacktrace = strchr(list[list_size].txt, '\n') ?: ""; list[list_size].pid = get_pid(buf); + list[list_size].tgid = get_tgid(buf); list[list_size].ts_nsec = get_ts_nsec(buf); list[list_size].free_ts_nsec = get_free_ts_nsec(buf); list_size++; @@ -249,6 +277,7 @@ static void usage(void) "-s Sort by the stack trace.\n" "-t Sort by times (default).\n" "-p Sort by pid.\n" + "-P Sort by tgid.\n" "-a Sort by memory allocate time.\n" "-r Sort by memory release time.\n" "-c Cull by comparing stacktrace instead of total block.\n" @@ -268,7 +297,7 @@ int main(int argc, char **argv) struct stat st; int opt; - while ((opt = getopt(argc, argv, "acfmprst")) != -1) + while ((opt = getopt(argc, argv, "acfmprstP")) != -1) switch (opt) { case 'a': cmp = compare_ts; @@ -294,6 +323,9 @@ int main(int argc, char **argv) case 't': cmp = compare_num; break; + case 'P': + cmp = compare_tgid; + break; default: usage(); exit(1); @@ -314,6 +346,7 @@ int main(int argc, char **argv) check_regcomp(&order_pattern, "order\\s*([0-9]*),"); check_regcomp(&pid_pattern, "pid\\s*([0-9]*),"); + check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) "); check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,"); check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns"); fstat(fileno(fin), &st); @@ -373,6 +406,7 @@ int main(int argc, char **argv) } regfree(&order_pattern); regfree(&pid_pattern); + regfree(&tgid_pattern); regfree(&ts_nsec_pattern); regfree(&free_ts_nsec_pattern); return 0; From 578d8f2761a828f6c3409d0931b036bf3a999246 Mon Sep 17 00:00:00 2001 From: Jiajian Ye Date: Thu, 24 Mar 2022 18:09:29 -0700 Subject: [PATCH 018/113] tools/vm/page_owner_sort: fix three trivival places The following adjustments are made: 1. Instead of using another array to cull the blocks after sorting, reuse the old array. So there is no need to malloc a new array. 2. 
When enabling '-f' option to filter out the blocks which have been released, only add those have not been released in the list, rather than add all of blocks in the list and then do the filtering when printing the result. 3. When enabling '-c' option to cull the blocks by comparing stacktrace, print the stacetrace rather than the total block. Link: https://lkml.kernel.org/r/20220306030640.43054-1-yejiajian2018@email.szu.edu.cn Signed-off-by: Jiajian Ye Cc: Cc: Sean Anderson Cc: Stephen Rothwell Cc: Yixuan Cao Cc: Cc: Zhenliang Wei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/page_owner_sort.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index d166f2f900eb1..b8d2867b5b184 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -42,6 +42,8 @@ static regex_t free_ts_nsec_pattern; static struct block_list *list; static int list_size; static int max_size; +static int cull_st; +static int filter; int read_block(char *buf, int buf_size, FILE *fin) { @@ -245,6 +247,9 @@ static void add_list(char *buf, int len) exit(1); } + list[list_size].free_ts_nsec = get_free_ts_nsec(buf); + if (filter == 1 && list[list_size].free_ts_nsec != 0) + return; list[list_size].txt = malloc(len+1); if (!list[list_size].txt) { printf("Out of memory\n"); @@ -257,10 +262,11 @@ static void add_list(char *buf, int len) memcpy(list[list_size].txt, buf, len); list[list_size].txt[len] = 0; list[list_size].stacktrace = strchr(list[list_size].txt, '\n') ?: ""; + if (*list[list_size].stacktrace == '\n') + list[list_size].stacktrace++; list[list_size].pid = get_pid(buf); list[list_size].tgid = get_tgid(buf); list[list_size].ts_nsec = get_ts_nsec(buf); - list[list_size].free_ts_nsec = get_free_ts_nsec(buf); list_size++; if (list_size % 1000 == 0) { printf("loaded %d\r", list_size); @@ -288,12 +294,9 @@ static void usage(void) int main(int argc, char **argv) { int (*cmp)(const void *, const void *) = compare_num; - int cull_st = 0; - int filter = 0; FILE *fin, *fout; char *buf; int ret, i, count; - struct block_list *list2; struct stat st; int opt; @@ -376,11 +379,7 @@ int main(int argc, char **argv) else qsort(list, list_size, sizeof(list[0]), compare_txt); - list2 = malloc(sizeof(*list) * list_size); - if (!list2) { - printf("Out of memory\n"); - exit(1); - } + printf("culling\n"); @@ -388,21 +387,23 @@ int main(int argc, char **argv) for (i = count = 0; i < list_size; i++) { if (count == 0 || - strcmp(*(&list2[count-1].txt+offset), *(&list[i].txt+offset)) != 0) { - list2[count++] = list[i]; + strcmp(*(&list[count-1].txt+offset), *(&list[i].txt+offset)) != 0) { + list[count++] = list[i]; } else { - list2[count-1].num += list[i].num; - list2[count-1].page_num += list[i].page_num; + list[count-1].num += list[i].num; + list[count-1].page_num += list[i].page_num; } } - qsort(list2, count, sizeof(list[0]), cmp); + qsort(list, count, sizeof(list[0]), cmp); for (i = 0; i < count; i++) { - if (filter == 1 && list2[i].free_ts_nsec != 0) - continue; - fprintf(fout, "%d times, %d pages:\n%s\n", - list2[i].num, list2[i].page_num, list2[i].txt); + if (cull_st == 0) + fprintf(fout, "%d times, %d pages:\n%s\n", + list[i].num, list[i].page_num, list[i].txt); + else + fprintf(fout, "%d times, %d pages:\n%s\n", + list[i].num, list[i].page_num, list[i].stacktrace); } regfree(&order_pattern); regfree(&pid_pattern); From 194d52d771b8f7cf5bcf0f81f87dd76e492c355c Mon Sep 17 00:00:00 2001 
From: Jiajian Ye Date: Thu, 24 Mar 2022 18:09:31 -0700 Subject: [PATCH 019/113] tools/vm/page_owner_sort: support for sorting by task command name When viewing page owner information, we may also need to the block to be sorted by task command name. Therefore, the following adjustments are made: 1. Add a member variable to record task command name of block. 2. Add a new -n option to sort the information of blocks by task command name. 3. Add -n option explanation in the document. Link: https://lkml.kernel.org/r/20220306030640.43054-2-yejiajian2018@email.szu.edu.cn Signed-off-by: Jiajian Ye Cc: Stephen Rothwell Cc: Sean Anderson Cc: Yixuan Cao Cc: Zhenliang Wei Cc: Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page_owner.rst | 1 + tools/vm/page_owner_sort.c | 35 ++++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 941543a797fe9..d658436a5e090 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -117,6 +117,7 @@ Usage -m Sort by total memory. -p Sort by pid. -P Sort by tgid. + -n Sort by task command name. -r Sort by memory release time. -s Sort by stack trace. -t Sort by times (default). diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index b8d2867b5b184..e508abd8f6653 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -24,6 +24,7 @@ struct block_list { char *txt; + char *comm; // task command name char *stacktrace; __u64 ts_nsec; __u64 free_ts_nsec; @@ -37,6 +38,7 @@ struct block_list { static regex_t order_pattern; static regex_t pid_pattern; static regex_t tgid_pattern; +static regex_t comm_pattern; static regex_t ts_nsec_pattern; static regex_t free_ts_nsec_pattern; static struct block_list *list; @@ -102,6 +104,13 @@ static int compare_tgid(const void *p1, const void *p2) return l1->tgid - l2->tgid; } +static int compare_comm(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return strcmp(l1->comm, l2->comm); +} + static int compare_ts(const void *p1, const void *p2) { const struct block_list *l1 = p1, *l2 = p2; @@ -145,6 +154,7 @@ static void check_regcomp(regex_t *pattern, const char *regex) } # define FIELD_BUFF 25 +# define TASK_COMM_LEN 16 static int get_page_num(char *buf) { @@ -233,6 +243,22 @@ static __u64 get_free_ts_nsec(char *buf) return free_ts_nsec; } +static char *get_comm(char *buf) +{ + char *comm_str = malloc(TASK_COMM_LEN); + + memset(comm_str, 0, TASK_COMM_LEN); + + search_pattern(&comm_pattern, comm_str, buf); + errno = 0; + if (errno != 0) { + printf("wrong comm in follow buf:\n%s\n", buf); + return NULL; + } + + return comm_str; +} + static void add_list(char *buf, int len) { if (list_size != 0 && @@ -266,6 +292,7 @@ static void add_list(char *buf, int len) list[list_size].stacktrace++; list[list_size].pid = get_pid(buf); list[list_size].tgid = get_tgid(buf); + list[list_size].comm = get_comm(buf); list[list_size].ts_nsec = get_ts_nsec(buf); list_size++; if (list_size % 1000 == 0) { @@ -284,6 +311,7 @@ static void usage(void) "-t Sort by times (default).\n" "-p Sort by pid.\n" "-P Sort by tgid.\n" + "-n Sort by task command name.\n" "-a Sort by memory allocate time.\n" "-r Sort by memory release time.\n" "-c Cull by comparing stacktrace instead of total block.\n" @@ -300,7 +328,7 @@ int main(int argc, char **argv) struct stat st; int opt; - while ((opt = getopt(argc, argv, "acfmprstP")) != -1) + 
while ((opt = getopt(argc, argv, "acfmnprstP")) != -1) switch (opt) { case 'a': cmp = compare_ts; @@ -329,6 +357,9 @@ int main(int argc, char **argv) case 'P': cmp = compare_tgid; break; + case 'n': + cmp = compare_comm; + break; default: usage(); exit(1); @@ -350,6 +381,7 @@ int main(int argc, char **argv) check_regcomp(&order_pattern, "order\\s*([0-9]*),"); check_regcomp(&pid_pattern, "pid\\s*([0-9]*),"); check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) "); + check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts"); check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,"); check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns"); fstat(fileno(fin), &st); @@ -408,6 +440,7 @@ int main(int argc, char **argv) regfree(&order_pattern); regfree(&pid_pattern); regfree(&tgid_pattern); + regfree(&comm_pattern); regfree(&ts_nsec_pattern); regfree(&free_ts_nsec_pattern); return 0; From 8ea8613a616aff184df9e3ea2d3ec39d90832867 Mon Sep 17 00:00:00 2001 From: Jiajian Ye Date: Thu, 24 Mar 2022 18:09:34 -0700 Subject: [PATCH 020/113] tools/vm/page_owner_sort.c: support for selecting by PID, TGID or task command name When viewing page owner information, we may also need to select the blocks by PID, TGID or task command name, which helps to get more accurate page allocation information as needed. Therefore, following adjustments are made: 1. Add three new options, including --pid, --tgid and --name, to support the selection of information blocks by a specific pid, tgid and task command name. In addtion, multiple options are allowed to be used at the same time. ./page_owner_sort [input] [output] --pid ./page_owner_sort [input] [output] --tgid ./page_owner_sort [input] [output] --name Assuming a scenario when a multi-threaded program, ./demo (PID = 5280), is running, and ./demo creates a child process (PID = 5281). $ps PID TTY TIME CMD 5215 pts/0 00:00:00 bash 5280 pts/0 00:00:00 ./demo 5281 pts/0 00:00:00 ./demo 5282 pts/0 00:00:00 ps It would be better to filter out the records with tgid=5280 and the task name "demo" when debugging the parent process, and the specific usage is ./page_owner_sort [input] [output] --tgid 5280 --name demo 2. Add explanations of three new options, including --pid, --tgid and --name, to the document. This work is coauthored by Shenghong Han , Yixuan Cao , Yinan Zhang , Chongxi Zhao , Yuhong Feng . Link: https://lkml.kernel.org/r/1646835223-7584-1-git-send-email-yejiajian2018@email.szu.edu.cn Signed-off-by: Jiajian Ye Cc: Sean Anderson Cc: Stephen Rothwell Cc: Zhenliang Wei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page_owner.rst | 5 ++ tools/vm/page_owner_sort.c | 120 +++++++++++++++++++++++++------- 2 files changed, 98 insertions(+), 27 deletions(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index d658436a5e090..3dd31fe06c050 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -129,3 +129,8 @@ Usage Filter: -f Filter out the information of blocks whose memory has been released. + + Select: + --pid Select by pid. + --tgid Select by tgid. + --name Select by task command name. 
diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index e508abd8f6653..e873eff84462e 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -21,6 +21,12 @@ #include #include #include +#include + +#define bool int +#define true 1 +#define false 0 +#define TASK_COMM_LEN 16 struct block_list { char *txt; @@ -34,7 +40,18 @@ struct block_list { pid_t pid; pid_t tgid; }; - +enum FILTER_BIT { + FILTER_UNRELEASE = 1<<1, + FILTER_PID = 1<<2, + FILTER_TGID = 1<<3, + FILTER_TASK_COMM_NAME = 1<<4 +}; +struct filter_condition { + pid_t tgid; + pid_t pid; + char comm[TASK_COMM_LEN]; +}; +static struct filter_condition fc; static regex_t order_pattern; static regex_t pid_pattern; static regex_t tgid_pattern; @@ -154,7 +171,6 @@ static void check_regcomp(regex_t *pattern, const char *regex) } # define FIELD_BUFF 25 -# define TASK_COMM_LEN 16 static int get_page_num(char *buf) { @@ -259,11 +275,30 @@ static char *get_comm(char *buf) return comm_str; } +static bool is_need(char *buf) +{ + if ((filter & FILTER_UNRELEASE) != 0 && get_free_ts_nsec(buf) != 0) + return false; + if ((filter & FILTER_PID) != 0 && get_pid(buf) != fc.pid) + return false; + if ((filter & FILTER_TGID) != 0 && get_tgid(buf) != fc.tgid) + return false; + + char *comm = get_comm(buf); + + if ((filter & FILTER_TASK_COMM_NAME) != 0 && + strncmp(comm, fc.comm, TASK_COMM_LEN) != 0) { + free(comm); + return false; + } + return true; +} + static void add_list(char *buf, int len) { if (list_size != 0 && - len == list[list_size-1].len && - memcmp(buf, list[list_size-1].txt, len) == 0) { + len == list[list_size-1].len && + memcmp(buf, list[list_size-1].txt, len) == 0) { list[list_size-1].num++; list[list_size-1].page_num += get_page_num(buf); return; @@ -272,28 +307,27 @@ static void add_list(char *buf, int len) printf("max_size too small??\n"); exit(1); } - - list[list_size].free_ts_nsec = get_free_ts_nsec(buf); - if (filter == 1 && list[list_size].free_ts_nsec != 0) + if (!is_need(buf)) return; + list[list_size].pid = get_pid(buf); + list[list_size].tgid = get_tgid(buf); + list[list_size].comm = get_comm(buf); list[list_size].txt = malloc(len+1); if (!list[list_size].txt) { printf("Out of memory\n"); exit(1); } - + memcpy(list[list_size].txt, buf, len); + list[list_size].txt[len] = 0; list[list_size].len = len; list[list_size].num = 1; list[list_size].page_num = get_page_num(buf); - memcpy(list[list_size].txt, buf, len); - list[list_size].txt[len] = 0; + list[list_size].stacktrace = strchr(list[list_size].txt, '\n') ?: ""; if (*list[list_size].stacktrace == '\n') list[list_size].stacktrace++; - list[list_size].pid = get_pid(buf); - list[list_size].tgid = get_tgid(buf); - list[list_size].comm = get_comm(buf); list[list_size].ts_nsec = get_ts_nsec(buf); + list[list_size].free_ts_nsec = get_free_ts_nsec(buf); list_size++; if (list_size % 1000 == 0) { printf("loaded %d\r", list_size); @@ -306,16 +340,19 @@ static void add_list(char *buf, int len) static void usage(void) { printf("Usage: ./page_owner_sort [OPTIONS] \n" - "-m Sort by total memory.\n" - "-s Sort by the stack trace.\n" - "-t Sort by times (default).\n" - "-p Sort by pid.\n" - "-P Sort by tgid.\n" - "-n Sort by task command name.\n" - "-a Sort by memory allocate time.\n" - "-r Sort by memory release time.\n" - "-c Cull by comparing stacktrace instead of total block.\n" - "-f Filter out the information of blocks whose memory has been released.\n" + "-m\t\tSort by total memory.\n" + "-s\t\tSort by the stack trace.\n" + "-t\t\tSort by times 
(default).\n" + "-p\t\tSort by pid.\n" + "-P\t\tSort by tgid.\n" + "-n\t\tSort by task command name.\n" + "-a\t\tSort by memory allocate time.\n" + "-r\t\tSort by memory release time.\n" + "-c\t\tCull by comparing stacktrace instead of total block.\n" + "-f\t\tFilter out the information of blocks whose memory has been released.\n" + "--pid \tSelect by pid. This selects the information of blocks whose process ID number equals to .\n" + "--tgid \tSelect by tgid. This selects the information of blocks whose Thread Group ID number equals to .\n" + "--name \n\t\tSelect by command name. This selects the information of blocks whose command name identical to .\n" ); } @@ -323,12 +360,18 @@ int main(int argc, char **argv) { int (*cmp)(const void *, const void *) = compare_num; FILE *fin, *fout; - char *buf; + char *buf, *endptr; int ret, i, count; struct stat st; int opt; - - while ((opt = getopt(argc, argv, "acfmnprstP")) != -1) + struct option longopts[] = { + { "pid", required_argument, NULL, 1 }, + { "tgid", required_argument, NULL, 2 }, + { "name", required_argument, NULL, 3 }, + { 0, 0, 0, 0}, + }; + + while ((opt = getopt_long(argc, argv, "acfmnprstP", longopts, NULL)) != -1) switch (opt) { case 'a': cmp = compare_ts; @@ -337,7 +380,7 @@ int main(int argc, char **argv) cull_st = 1; break; case 'f': - filter = 1; + filter = filter | FILTER_UNRELEASE; break; case 'm': cmp = compare_page_num; @@ -360,6 +403,29 @@ int main(int argc, char **argv) case 'n': cmp = compare_comm; break; + case 1: + filter = filter | FILTER_PID; + errno = 0; + fc.pid = strtol(optarg, &endptr, 10); + if (errno != 0 || endptr == optarg || *endptr != '\0') { + printf("wrong/invalid pid in from the command line:%s\n", optarg); + exit(1); + } + break; + case 2: + filter = filter | FILTER_TGID; + errno = 0; + fc.tgid = strtol(optarg, &endptr, 10); + if (errno != 0 || endptr == optarg || *endptr != '\0') { + printf("wrong/invalid tgid in from the command line:%s\n", optarg); + exit(1); + } + break; + case 3: + filter = filter | FILTER_TASK_COMM_NAME; + strncpy(fc.comm, optarg, TASK_COMM_LEN); + fc.comm[TASK_COMM_LEN-1] = '\0'; + break; default: usage(); exit(1); From 9c8a0a8e599f4a949ef18207ba495fb557dd1016 Mon Sep 17 00:00:00 2001 From: Jiajian Ye Date: Thu, 24 Mar 2022 18:09:38 -0700 Subject: [PATCH 021/113] tools/vm/page_owner_sort.c: support for user-defined culling rules When viewing page owner information, we may want to cull blocks of information with our own rules. So it is important to enhance culling function to provide the support for customizing culling rules. Therefore, following adjustments are made: 1. Add --cull option to support the culling of blocks of information with user-defined culling rules. ./page_owner_sort --cull= ./page_owner_sort --cull is a single argument in the form of a comma-separated list to specify individual culling rules, by the sequence of keys k1,k2, .... Mixed use of abbreviated and complete-form of keys is allowed. For reference, please see the document(Documentation/vm/page_owner.rst). 
Now, assuming two blocks in the input file are as follows: Page allocated via order 0, mask xxxx, pid 1, tgid 1 (task_name_demo) PFN xxxx prep_new_page+0xd0/0xf8 get_page_from_freelist+0x4a0/0x1290 __alloc_pages+0x168/0x340 alloc_pages+0xb0/0x158 Page allocated via order 0, mask xxxx, pid 32, tgid 32 (task_name_demo) PFN xxxx prep_new_page+0xd0/0xf8 get_page_from_freelist+0x4a0/0x1290 __alloc_pages+0x168/0x340 alloc_pages+0xb0/0x158 If we want to cull the blocks by stacktrace and task command name, we can use this command: ./page_owner_sort --cull=stacktrace,name The output would be like: 2 times, 2 pages, task_comm_name: task_name_demo prep_new_page+0xd0/0xf8 get_page_from_freelist+0x4a0/0x1290 __alloc_pages+0x168/0x340 alloc_pages+0xb0/0x158 As we can see, these two blocks are culled successfully, for they share the same pid and task command name. However, if we want to cull the blocks by pid, stacktrace and task command name, we can this command: ./page_owner_sort --cull=stacktrace,name,pid The output would be like: 1 times, 1 pages, PID 1, task_comm_name: task_name_demo prep_new_page+0xd0/0xf8 get_page_from_freelist+0x4a0/0x1290 __alloc_pages+0x168/0x340 alloc_pages+0xb0/0x158 1 times, 1 pages, PID 32, task_comm_name: task_name_demo prep_new_page+0xd0/0xf8 get_page_from_freelist+0x4a0/0x1290 __alloc_pages+0x168/0x340 alloc_pages+0xb0/0x158 As we can see, these two blocks are failed to cull, for their PIDs are different. 2. Add explanations of --cull options to the document. This work is coauthored by Yixuan Cao Shenghong Han Yinan Zhang Chongxi Zhao Yuhong Feng Link: https://lkml.kernel.org/r/20220312145834.624-1-yejiajian2018@email.szu.edu.cn Signed-off-by: Jiajian Ye Cc: Yixuan Cao Cc: Shenghong Han Cc: Yinan Zhang Cc: Chongxi Zhao Cc: Yuhong Feng Cc: Stephen Rothwell Cc: Sean Anderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page_owner.rst | 29 +++++- tools/vm/page_owner_sort.c | 150 +++++++++++++++++++++++++++----- 2 files changed, 157 insertions(+), 22 deletions(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 3dd31fe06c050..c4de6f8dabe99 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -126,11 +126,38 @@ Usage Cull: -c Cull by comparing stacktrace instead of total block. + --cull + Specify culling rules.Culling syntax is key[,key[,...]].Choose a + multi-letter key from the **STANDARD FORMAT SPECIFIERS** section. + + + is a single argument in the form of a comma-separated list, + which offers a way to specify individual culling rules. The recognized + keywords are described in the **STANDARD FORMAT SPECIFIERS** section below. + can be specified by the sequence of keys k1,k2, ..., as described in + the STANDARD SORT KEYS section below. Mixed use of abbreviated and + complete-form of keys is allowed. + + + Examples: + ./page_owner_sort --cull=stacktrace + ./page_owner_sort --cull=st,pid,name + ./page_owner_sort --cull=n,f Filter: -f Filter out the information of blocks whose memory has been released. Select: - --pid Select by pid. + --pid Select by pid. --tgid Select by tgid. --name Select by task command name. 
+ +STANDARD FORMAT SPECIFIERS +========================== + + KEY LONG DESCRIPTION + p pid process ID + tg tgid thread group ID + n name task command name + f free whether the page has been released or not + st stacktrace stace trace of the page allocation diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index e873eff84462e..7679335fce5be 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -44,7 +44,14 @@ enum FILTER_BIT { FILTER_UNRELEASE = 1<<1, FILTER_PID = 1<<2, FILTER_TGID = 1<<3, - FILTER_TASK_COMM_NAME = 1<<4 + FILTER_COMM = 1<<4 +}; +enum CULL_BIT { + CULL_UNRELEASE = 1<<1, + CULL_PID = 1<<2, + CULL_TGID = 1<<3, + CULL_COMM = 1<<4, + CULL_STACKTRACE = 1<<5 }; struct filter_condition { pid_t tgid; @@ -61,7 +68,7 @@ static regex_t free_ts_nsec_pattern; static struct block_list *list; static int list_size; static int max_size; -static int cull_st; +static int cull; static int filter; int read_block(char *buf, int buf_size, FILE *fin) @@ -142,6 +149,36 @@ static int compare_free_ts(const void *p1, const void *p2) return l1->free_ts_nsec < l2->free_ts_nsec ? -1 : 1; } + +static int compare_release(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + if (!l1->free_ts_nsec && !l2->free_ts_nsec) + return 0; + if (l1->free_ts_nsec && l2->free_ts_nsec) + return 0; + return l1->free_ts_nsec ? 1 : -1; +} + + +static int compare_cull_condition(const void *p1, const void *p2) +{ + if (cull == 0) + return compare_txt(p1, p2); + if ((cull & CULL_STACKTRACE) && compare_stacktrace(p1, p2)) + return compare_stacktrace(p1, p2); + if ((cull & CULL_PID) && compare_pid(p1, p2)) + return compare_pid(p1, p2); + if ((cull & CULL_TGID) && compare_tgid(p1, p2)) + return compare_tgid(p1, p2); + if ((cull & CULL_COMM) && compare_comm(p1, p2)) + return compare_comm(p1, p2); + if ((cull & CULL_UNRELEASE) && compare_release(p1, p2)) + return compare_release(p1, p2); + return 0; +} + static int search_pattern(regex_t *pattern, char *pattern_str, char *buf) { int err, val_len; @@ -170,6 +207,38 @@ static void check_regcomp(regex_t *pattern, const char *regex) } } +static char **explode(char sep, const char *str, int *size) +{ + int count = 0, len = strlen(str); + int lastindex = -1, j = 0; + + for (int i = 0; i < len; i++) + if (str[i] == sep) + count++; + char **ret = calloc(++count, sizeof(char *)); + + for (int i = 0; i < len; i++) { + if (str[i] == sep) { + ret[j] = calloc(i - lastindex, sizeof(char)); + memcpy(ret[j++], str + lastindex + 1, i - lastindex - 1); + lastindex = i; + } + } + if (lastindex <= len - 1) { + ret[j] = calloc(len - lastindex, sizeof(char)); + memcpy(ret[j++], str + lastindex + 1, strlen(str) - 1 - lastindex); + } + *size = j; + return ret; +} + +static void free_explode(char **arr, int size) +{ + for (int i = 0; i < size; i++) + free(arr[i]); + free(arr); +} + # define FIELD_BUFF 25 static int get_page_num(char *buf) @@ -277,16 +346,16 @@ static char *get_comm(char *buf) static bool is_need(char *buf) { - if ((filter & FILTER_UNRELEASE) != 0 && get_free_ts_nsec(buf) != 0) + if ((filter & FILTER_UNRELEASE) && get_free_ts_nsec(buf) != 0) return false; - if ((filter & FILTER_PID) != 0 && get_pid(buf) != fc.pid) + if ((filter & FILTER_PID) && get_pid(buf) != fc.pid) return false; - if ((filter & FILTER_TGID) != 0 && get_tgid(buf) != fc.tgid) + if ((filter & FILTER_TGID) && get_tgid(buf) != fc.tgid) return false; char *comm = get_comm(buf); - if ((filter & FILTER_TASK_COMM_NAME) != 0 && + if ((filter & FILTER_COMM) && 
strncmp(comm, fc.comm, TASK_COMM_LEN) != 0) { free(comm); return false; @@ -335,6 +404,30 @@ static void add_list(char *buf, int len) } } +static bool parse_cull_args(const char *arg_str) +{ + int size = 0; + char **args = explode(',', arg_str, &size); + + for (int i = 0; i < size; ++i) + if (!strcmp(args[i], "pid") || !strcmp(args[i], "p")) + cull |= CULL_PID; + else if (!strcmp(args[i], "tgid") || !strcmp(args[i], "tg")) + cull |= CULL_TGID; + else if (!strcmp(args[i], "name") || !strcmp(args[i], "n")) + cull |= CULL_COMM; + else if (!strcmp(args[i], "stacktrace") || !strcmp(args[i], "st")) + cull |= CULL_STACKTRACE; + else if (!strcmp(args[i], "free") || !strcmp(args[i], "f")) + cull |= CULL_UNRELEASE; + else { + free_explode(args, size); + return false; + } + free_explode(args, size); + return true; +} + #define BUF_SIZE (128 * 1024) static void usage(void) @@ -353,6 +446,7 @@ static void usage(void) "--pid \tSelect by pid. This selects the information of blocks whose process ID number equals to .\n" "--tgid \tSelect by tgid. This selects the information of blocks whose Thread Group ID number equals to .\n" "--name \n\t\tSelect by command name. This selects the information of blocks whose command name identical to .\n" + "--cull \tCull by user-defined rules. is a single argument in the form of a comma-separated list with some common fields predefined\n" ); } @@ -368,6 +462,7 @@ int main(int argc, char **argv) { "pid", required_argument, NULL, 1 }, { "tgid", required_argument, NULL, 2 }, { "name", required_argument, NULL, 3 }, + { "cull", required_argument, NULL, 4 }, { 0, 0, 0, 0}, }; @@ -377,7 +472,7 @@ int main(int argc, char **argv) cmp = compare_ts; break; case 'c': - cull_st = 1; + cull = cull | CULL_STACKTRACE; break; case 'f': filter = filter | FILTER_UNRELEASE; @@ -422,10 +517,17 @@ int main(int argc, char **argv) } break; case 3: - filter = filter | FILTER_TASK_COMM_NAME; + filter = filter | FILTER_COMM; strncpy(fc.comm, optarg, TASK_COMM_LEN); fc.comm[TASK_COMM_LEN-1] = '\0'; break; + case 4: + if (!parse_cull_args(optarg)) { + printf("wrong argument after --cull in from the command line:%s\n", + optarg); + exit(1); + } + break; default: usage(); exit(1); @@ -472,20 +574,13 @@ int main(int argc, char **argv) printf("sorting ....\n"); - if (cull_st == 1) - qsort(list, list_size, sizeof(list[0]), compare_stacktrace); - else - qsort(list, list_size, sizeof(list[0]), compare_txt); - - + qsort(list, list_size, sizeof(list[0]), compare_cull_condition); printf("culling\n"); - long offset = cull_st ? 
&list[0].stacktrace - &list[0].txt : 0; - for (i = count = 0; i < list_size; i++) { if (count == 0 || - strcmp(*(&list[count-1].txt+offset), *(&list[i].txt+offset)) != 0) { + compare_cull_condition((void *)(&list[count-1]), (void *)(&list[i])) != 0) { list[count++] = list[i]; } else { list[count-1].num += list[i].num; @@ -496,12 +591,25 @@ int main(int argc, char **argv) qsort(list, count, sizeof(list[0]), cmp); for (i = 0; i < count; i++) { - if (cull_st == 0) + if (cull == 0) fprintf(fout, "%d times, %d pages:\n%s\n", list[i].num, list[i].page_num, list[i].txt); - else - fprintf(fout, "%d times, %d pages:\n%s\n", - list[i].num, list[i].page_num, list[i].stacktrace); + else { + fprintf(fout, "%d times, %d pages", + list[i].num, list[i].page_num); + if (cull & CULL_PID || filter & FILTER_PID) + fprintf(fout, ", PID %d", list[i].pid); + if (cull & CULL_TGID || filter & FILTER_TGID) + fprintf(fout, ", TGID %d", list[i].pid); + if (cull & CULL_COMM || filter & FILTER_COMM) + fprintf(fout, ", task_comm_name: %s", list[i].comm); + if (cull & CULL_UNRELEASE) + fprintf(fout, " (%s)", + list[i].free_ts_nsec ? "UNRELEASED" : "RELEASED"); + if (cull & CULL_STACKTRACE) + fprintf(fout, ":\n%s", list[i].stacktrace); + fprintf(fout, "\n"); + } } regfree(&order_pattern); regfree(&pid_pattern); From 1a9762b2d7a56e9926a22a627972423840a1aabb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Mar 2022 18:09:40 -0700 Subject: [PATCH 022/113] mm: unexport page_init_poison page_init_poison is only used in core MM code, so unexport it. Link: https://lkml.kernel.org/r/20220207063446.1833404-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/debug.c b/mm/debug.c index eeb7ea3ca2924..bef329bf28f01 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -261,5 +261,4 @@ void page_init_poison(struct page *page, size_t size) if (page_init_poisoning) memset(page, PAGE_POISON_PATTERN, size); } -EXPORT_SYMBOL_GPL(page_init_poison); #endif /* CONFIG_DEBUG_VM */ From 90647d9d725068ad27d39f80d2b4a5150d041038 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 24 Mar 2022 18:09:43 -0700 Subject: [PATCH 023/113] selftest/vm: add util.h and and move helper functions there Avoid code duplication by adding util.h. No functional change in this patch. 
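As a usage illustration only (not part of this patch), the sketch below shows how a selftest could consume the shared allocate_transhuge() helper once it lives in util.h; the /proc/self/pagemap handling, the alignment step and the error paths are assumptions made for the example:

	#include <stdio.h>
	#include <stdint.h>
	#include <fcntl.h>
	#include <err.h>
	#include <sys/mman.h>
	#include "util.h"

	int main(void)
	{
		/* reading PFNs from pagemap needs CAP_SYS_ADMIN, so run as root */
		int pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
		void *map, *ptr;
		int64_t pfn;

		if (pagemap_fd < 0)
			err(2, "open pagemap");

		/* reserve enough address space to carve out one 2M-aligned chunk */
		map = mmap(NULL, 2 * HPAGE_SIZE, PROT_READ | PROT_WRITE,
			   MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0);
		if (map == MAP_FAILED)
			err(2, "mmap");
		ptr = (void *)(((uintptr_t)map + HPAGE_SIZE - 1) &
			       ~((uintptr_t)HPAGE_SIZE - 1));

		/* the shared helper replaces the per-test copies removed by this patch */
		pfn = allocate_transhuge(ptr, pagemap_fd);
		if (pfn < 0)
			printf("no THP mapped\n");
		else
			printf("THP at pfn %lld\n", (long long)pfn);
		return 0;
	}

Since util.h is header-only (static inline), nothing extra needs to be linked.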
Link: https://lkml.kernel.org/r/20220307054355.149820-1-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Shuah Khan Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/ksm_tests.c | 38 +-------------- tools/testing/selftests/vm/transhuge-stress.c | 41 ++-------------- tools/testing/selftests/vm/util.h | 48 +++++++++++++++++++ 3 files changed, 52 insertions(+), 75 deletions(-) create mode 100644 tools/testing/selftests/vm/util.h diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c index 1436e1a9a3d38..fd85f15869d13 100644 --- a/tools/testing/selftests/vm/ksm_tests.c +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -12,6 +12,7 @@ #include "../kselftest.h" #include "../../../../include/vdso/time64.h" +#include "util.h" #define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/" #define KSM_FP(s) (KSM_SYSFS_PATH s) @@ -22,15 +23,6 @@ #define KSM_MERGE_ACROSS_NODES_DEFAULT true #define MB (1ul << 20) -#define PAGE_SHIFT 12 -#define HPAGE_SHIFT 21 - -#define PAGE_SIZE (1 << PAGE_SHIFT) -#define HPAGE_SIZE (1 << HPAGE_SHIFT) - -#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0) -#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1)) - struct ksm_sysfs { unsigned long max_page_sharing; unsigned long merge_across_nodes; @@ -456,34 +448,6 @@ static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_a return KSFT_FAIL; } -int64_t allocate_transhuge(void *ptr, int pagemap_fd) -{ - uint64_t ent[2]; - - /* drop pmd */ - if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_FIXED | MAP_ANONYMOUS | - MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr) - errx(2, "mmap transhuge"); - - if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE)) - err(2, "MADV_HUGEPAGE"); - - /* allocate transparent huge page */ - *(volatile void **)ptr = ptr; - - if (pread(pagemap_fd, ent, sizeof(ent), - (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent)) - err(2, "read pagemap"); - - if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) && - PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) && - !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1))) - return PAGEMAP_PFN(ent[0]); - - return -1; -} - static int ksm_merge_hugepages_time(int mapping, int prot, int timeout, size_t map_size) { void *map_ptr, *map_ptr_orig; diff --git a/tools/testing/selftests/vm/transhuge-stress.c b/tools/testing/selftests/vm/transhuge-stress.c index a03cb3fce1f69..e3f00adb1b825 100644 --- a/tools/testing/selftests/vm/transhuge-stress.c +++ b/tools/testing/selftests/vm/transhuge-stress.c @@ -15,48 +15,12 @@ #include #include #include +#include "util.h" -#define PAGE_SHIFT 12 -#define HPAGE_SHIFT 21 - -#define PAGE_SIZE (1 << PAGE_SHIFT) -#define HPAGE_SIZE (1 << HPAGE_SHIFT) - -#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0) -#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1)) - -int pagemap_fd; int backing_fd = -1; int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE; #define PROT_RW (PROT_READ | PROT_WRITE) -int64_t allocate_transhuge(void *ptr) -{ - uint64_t ent[2]; - - /* drop pmd */ - if (mmap(ptr, HPAGE_SIZE, PROT_RW, MAP_FIXED | mmap_flags, - backing_fd, 0) != ptr) - errx(2, "mmap transhuge"); - - if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE)) - err(2, "MADV_HUGEPAGE"); - - /* allocate transparent huge page */ - *(volatile void **)ptr = ptr; - - if (pread(pagemap_fd, ent, sizeof(ent), - (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent)) - err(2, "read pagemap"); - - if (PAGEMAP_PRESENT(ent[0]) && 
PAGEMAP_PRESENT(ent[1]) && - PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) && - !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1))) - return PAGEMAP_PFN(ent[0]); - - return -1; -} - int main(int argc, char **argv) { size_t ram, len; @@ -67,6 +31,7 @@ int main(int argc, char **argv) double s; uint8_t *map; size_t map_len; + int pagemap_fd; ram = sysconf(_SC_PHYS_PAGES); if (ram > SIZE_MAX / sysconf(_SC_PAGESIZE) / 4) @@ -122,7 +87,7 @@ int main(int argc, char **argv) for (p = ptr; p < ptr + len; p += HPAGE_SIZE) { int64_t pfn; - pfn = allocate_transhuge(p); + pfn = allocate_transhuge(p, pagemap_fd); if (pfn < 0) { nr_failed++; diff --git a/tools/testing/selftests/vm/util.h b/tools/testing/selftests/vm/util.h new file mode 100644 index 0000000000000..0f0a0f345d768 --- /dev/null +++ b/tools/testing/selftests/vm/util.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __KSELFTEST_VM_UTIL_H +#define __KSELFTEST_VM_UTIL_H + +#include +#include +#include + +#define PAGE_SHIFT 12 +#define HPAGE_SHIFT 21 + +#define PAGE_SIZE (1 << PAGE_SHIFT) +#define HPAGE_SIZE (1 << HPAGE_SHIFT) + +#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0) +#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1)) + + +static inline int64_t allocate_transhuge(void *ptr, int pagemap_fd) +{ + uint64_t ent[2]; + + /* drop pmd */ + if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANONYMOUS | + MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr) + errx(2, "mmap transhuge"); + + if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE)) + err(2, "MADV_HUGEPAGE"); + + /* allocate transparent huge page */ + *(volatile void **)ptr = ptr; + + if (pread(pagemap_fd, ent, sizeof(ent), + (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent)) + err(2, "read pagemap"); + + if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) && + PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) && + !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1))) + return PAGEMAP_PFN(ent[0]); + + return -1; +} + +#endif From 6f6a841fb77df0bc6383133b22feb9b015380a55 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 24 Mar 2022 18:09:46 -0700 Subject: [PATCH 024/113] selftest/vm: add helpers to detect PAGE_SIZE and PAGE_SHIFT PAGE_SIZE is not 4096 in many configurations, particularly ppc64 uses 64K pages in majority of cases. Add helpers to detect PAGE_SIZE and PAGE_SHIFT dynamically. 
Without this tests are broken w.r.t reading /proc/self/pagemap if (pread(pagemap_fd, ent, sizeof(ent), (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent)) err(2, "read pagemap"); Link: https://lkml.kernel.org/r/20220307054355.149820-2-aneesh.kumar@linux.ibm.com Signed-off-by: Mike Rapoport Signed-off-by: Aneesh Kumar K.V Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/gup_test.c | 3 ++- tools/testing/selftests/vm/util.h | 27 ++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c index fe043f67798b0..cda837a147364 100644 --- a/tools/testing/selftests/vm/gup_test.c +++ b/tools/testing/selftests/vm/gup_test.c @@ -10,8 +10,9 @@ #include #include "../../../../mm/gup_test.h" +#include "util.h" + #define MB (1UL << 20) -#define PAGE_SIZE sysconf(_SC_PAGESIZE) /* Just the flags we need, copied from mm.h: */ #define FOLL_WRITE 0x01 /* check pte is writable */ diff --git a/tools/testing/selftests/vm/util.h b/tools/testing/selftests/vm/util.h index 0f0a0f345d768..b27d26199334f 100644 --- a/tools/testing/selftests/vm/util.h +++ b/tools/testing/selftests/vm/util.h @@ -6,11 +6,32 @@ #include #include #include +#include /* ffsl() */ +#include /* _SC_PAGESIZE */ -#define PAGE_SHIFT 12 -#define HPAGE_SHIFT 21 +static unsigned int __page_size; +static unsigned int __page_shift; -#define PAGE_SIZE (1 << PAGE_SHIFT) +static inline unsigned int page_size(void) +{ + if (!__page_size) + __page_size = sysconf(_SC_PAGESIZE); + return __page_size; +} + +static inline unsigned int page_shift(void) +{ + if (!__page_shift) + __page_shift = (ffsl(page_size()) - 1); + return __page_shift; +} + +#define PAGE_SHIFT (page_shift()) +#define PAGE_SIZE (page_size()) +/* + * On ppc64 this will only work with radix 2M hugepage size + */ +#define HPAGE_SHIFT 21 #define HPAGE_SIZE (1 << HPAGE_SHIFT) #define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0) From bb43b14b576228c580bdc7e1aeeded54d540b5ef Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 24 Mar 2022 18:09:49 -0700 Subject: [PATCH 025/113] mm: delete __ClearPageWaiters() The PG_waiters bit is not included in PAGE_FLAGS_CHECK_AT_FREE, and vmscan.c's free_unref_page_list() callers rely on that not to generate bad_page() alerts. So __page_cache_release(), put_pages_list() and release_pages() (and presumably copy-and-pasted free_zone_device_page()) are redundant and misleading to make a special point of clearing it (as the "__" implies, it could only safely be used on the freeing path). Delete __ClearPageWaiters(). Remark on this in one of the "possible" comments in folio_wake_bit(), and delete the superfluous comments. 
Link: https://lkml.kernel.org/r/3eafa969-5b1a-accf-88fe-318784c791a@google.com Signed-off-by: Hugh Dickins Tested-by: Yu Zhao Reviewed-by: Yang Shi Reviewed-by: David Hildenbrand Cc: Matthew Wilcox Cc: Nicholas Piggin Cc: Yu Zhao Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- mm/filemap.c | 23 ++++++++--------------- mm/memremap.c | 2 -- mm/swap.c | 4 ---- 4 files changed, 9 insertions(+), 22 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 88fe1d759cdd4..9d8eeaa67d05a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -481,7 +481,7 @@ static inline int TestClearPage##uname(struct page *page) { return 0; } TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname) __PAGEFLAG(Locked, locked, PF_NO_TAIL) -PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) +PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL) PAGEFLAG(Referenced, referenced, PF_HEAD) TESTCLEARFLAG(Referenced, referenced, PF_HEAD) diff --git a/mm/filemap.c b/mm/filemap.c index d2e6a79fe69d7..98dfeff82ef06 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1185,24 +1185,17 @@ static void folio_wake_bit(struct folio *folio, int bit_nr) } /* - * It is possible for other pages to have collided on the waitqueue - * hash, so in that case check for a page match. That prevents a long- - * term waiter + * It's possible to miss clearing waiters here, when we woke our page + * waiters, but the hashed waitqueue has waiters for other pages on it. + * That's okay, it's a rare case. The next waker will clear it. * - * It is still possible to miss a case here, when we woke page waiters - * and removed them from the waitqueue, but there are still other - * page waiters. + * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE, + * other), the flag may be cleared in the course of freeing the page; + * but that is not required for correctness. */ - if (!waitqueue_active(q) || !key.page_match) { + if (!waitqueue_active(q) || !key.page_match) folio_clear_waiters(folio); - /* - * It's possible to miss clearing Waiters here, when we woke - * our page waiters, but the hashed waitqueue has waiters for - * other pages on it. - * - * That's okay, it's a rare case. The next waker will clear it. 
- */ - } + spin_unlock_irqrestore(&q->lock, flags); } diff --git a/mm/memremap.c b/mm/memremap.c index c17eca4a48ca6..af0223605e697 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -456,8 +456,6 @@ void free_zone_device_page(struct page *page) if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free)) return; - __ClearPageWaiters(page); - mem_cgroup_uncharge(page_folio(page)); /* diff --git a/mm/swap.c b/mm/swap.c index 5b30045207e16..bceff0cb559c9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -97,7 +97,6 @@ static void __page_cache_release(struct page *page) mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); } - __ClearPageWaiters(page); } static void __put_single_page(struct page *page) @@ -152,7 +151,6 @@ void put_pages_list(struct list_head *pages) continue; } /* Cannot be PageLRU because it's passed to us using the lru */ - __ClearPageWaiters(page); } free_unref_page_list(pages); @@ -971,8 +969,6 @@ void release_pages(struct page **pages, int nr) count_vm_event(UNEVICTABLE_PGCLEARED); } - __ClearPageWaiters(page); - list_add(&page->lru, &pages_to_free); } if (lruvec) From 85207ad8ea21156387fd0273e5360189df163661 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 24 Mar 2022 18:09:52 -0700 Subject: [PATCH 026/113] mm: filemap_unaccount_folio() large skip mapcount fixup The page_mapcount_reset() when folio_mapped() while mapping_exiting() was devised long before there were huge or compound pages in the cache. It is still valid for small pages, but not at all clear what's right to check and reset on large pages. Just don't try when folio_test_large(). Link: https://lkml.kernel.org/r/879c4426-4122-da9c-1a86-697f2c9a083@google.com Signed-off-by: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 98dfeff82ef06..c5c169c16baed 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -152,25 +152,25 @@ static void filemap_unaccount_folio(struct address_space *mapping, VM_BUG_ON_FOLIO(folio_mapped(folio), folio); if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) { - int mapcount; - pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n", current->comm, folio_pfn(folio)); dump_page(&folio->page, "still mapped when deleted"); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); - mapcount = page_mapcount(&folio->page); - if (mapping_exiting(mapping) && - folio_ref_count(folio) >= mapcount + 2) { - /* - * All vmas have already been torn down, so it's - * a good bet that actually the folio is unmapped, - * and we'd prefer not to leak it: if we're wrong, - * some other bad page check should catch it later. - */ - page_mapcount_reset(&folio->page); - folio_ref_sub(folio, mapcount); + if (mapping_exiting(mapping) && !folio_test_large(folio)) { + int mapcount = page_mapcount(&folio->page); + + if (folio_ref_count(folio) >= mapcount + 2) { + /* + * All vmas have already been torn down, so it's + * a good bet that actually the page is unmapped + * and we'd rather not leak it: if we're wrong, + * another bad page check should catch it later. 
+ */ + page_mapcount_reset(&folio->page); + folio_ref_sub(folio, mapcount); + } } } From 5d543f13e2f5580828de885c751d68a35b6a493d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 24 Mar 2022 18:09:55 -0700 Subject: [PATCH 027/113] mm/thp: fix NR_FILE_MAPPED accounting in page_*_file_rmap() NR_FILE_MAPPED accounting in mm/rmap.c (for /proc/meminfo "Mapped" and /proc/vmstat "nr_mapped" and the memcg's memory.stat "mapped_file") is slightly flawed for file or shmem huge pages. It is well thought out, and looks convincing, but there's a racy case when the careful counting in page_remove_file_rmap() (without page lock) gets discarded. So that in a workload like two "make -j20" kernel builds under memory pressure, with cc1 on hugepage text, "Mapped" can easily grow by a spurious 5MB or more on each iteration, ending up implausibly bigger than most other numbers in /proc/meminfo. And, hypothetically, might grow to the point of seriously interfering in mm/vmscan.c's heuristics, which do take NR_FILE_MAPPED into some consideration. Fixed by moving the __mod_lruvec_page_state() down to where it will not be missed before return (and I've grown a bit tired of that oft-repeated but-not-everywhere comment on the __ness: it gets lost in the move here). Does page_add_file_rmap() need the same change? I suspect not, because page lock is held in all relevant cases, and its skipping case looks safe; but it's much easier to be sure, if we do make the same change. Link: https://lkml.kernel.org/r/e02e52a1-8550-a57c-ed29-f51191ea2375@google.com Fixes: dd78fedde4b9 ("rmap: support file thp") Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 615b5d323ee22..ee1f10df984da 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1236,14 +1236,14 @@ void page_add_new_anon_rmap(struct page *page, void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, bool compound) { - int i, nr = 1; + int i, nr = 0; VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); lock_page_memcg(page); if (compound && PageTransHuge(page)) { int nr_pages = thp_nr_pages(page); - for (i = 0, nr = 0; i < nr_pages; i++) { + for (i = 0; i < nr_pages; i++) { if (atomic_inc_and_test(&page[i]._mapcount)) nr++; } @@ -1271,11 +1271,12 @@ void page_add_file_rmap(struct page *page, VM_WARN_ON_ONCE(!PageLocked(page)); SetPageDoubleMap(compound_head(page)); } - if (!atomic_inc_and_test(&page->_mapcount)) - goto out; + if (atomic_inc_and_test(&page->_mapcount)) + nr++; } - __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); out: + if (nr) + __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); unlock_page_memcg(page); mlock_vma_page(page, vma, compound); @@ -1283,7 +1284,7 @@ void page_add_file_rmap(struct page *page, static void page_remove_file_rmap(struct page *page, bool compound) { - int i, nr = 1; + int i, nr = 0; VM_BUG_ON_PAGE(compound && !PageHead(page), page); @@ -1298,12 +1299,12 @@ static void page_remove_file_rmap(struct page *page, bool compound) if (compound && PageTransHuge(page)) { int nr_pages = thp_nr_pages(page); - for (i = 0, nr = 0; i < nr_pages; i++) { + for (i = 0; i < nr_pages; i++) { if (atomic_add_negative(-1, &page[i]._mapcount)) nr++; } if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) - return; + goto out; if (PageSwapBacked(page)) __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, -nr_pages); @@ 
-1311,16 +1312,12 @@ static void page_remove_file_rmap(struct page *page, bool compound) __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, -nr_pages); } else { - if (!atomic_add_negative(-1, &page->_mapcount)) - return; + if (atomic_add_negative(-1, &page->_mapcount)) + nr++; } - - /* - * We use the irq-unsafe __{inc|mod}_lruvec_page_state because - * these counters are not modified in interrupt context, and - * pte lock(a spinlock) is held, which implies preemption disabled. - */ - __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); +out: + if (nr) + __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); } static void page_remove_anon_compound_rmap(struct page *page) From 283fd6fe0528ae2fe0222a35ecfdfcbbaeee8159 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 24 Mar 2022 18:09:58 -0700 Subject: [PATCH 028/113] mm/migration: add trace events for THP migrations Patch series "mm/migration: Add trace events", v3. This adds trace events for all migration scenarios including base page, THP and HugeTLB. This patch (of 3): This adds two trace events for PMD based THP migration without split. These events closely follow the implementation details like setting and removing of PMD migration entries, which are essential operations for THP migration. This moves CREATE_TRACE_POINTS into generic THP from powerpc for these new trace events to be available on other platforms as well. Link: https://lkml.kernel.org/r/1643368182-9588-1-git-send-email-anshuman.khandual@arm.com Link: https://lkml.kernel.org/r/1643368182-9588-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Steven Rostedt Cc: Ingo Molnar Cc: Zi Yan Cc: Naoya Horiguchi Cc: John Hubbard Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/book3s64/trace.c | 1 - include/trace/events/thp.h | 27 +++++++++++++++++++++++++++ mm/huge_memory.c | 5 +++++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/book3s64/trace.c b/arch/powerpc/mm/book3s64/trace.c index b86e7b9062571..ccd64b5e6cac7 100644 --- a/arch/powerpc/mm/book3s64/trace.c +++ b/arch/powerpc/mm/book3s64/trace.c @@ -3,6 +3,5 @@ * This file is for defining trace points and trace related helpers. 
*/ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define CREATE_TRACE_POINTS #include #endif diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h index ca3f2767828a6..202b3e3e67ff2 100644 --- a/include/trace/events/thp.h +++ b/include/trace/events/thp.h @@ -48,6 +48,33 @@ TRACE_EVENT(hugepage_update, TP_printk("hugepage update at addr 0x%lx and pte = 0x%lx clr = 0x%lx, set = 0x%lx", __entry->addr, __entry->pte, __entry->clr, __entry->set) ); +DECLARE_EVENT_CLASS(migration_pmd, + + TP_PROTO(unsigned long addr, unsigned long pmd), + + TP_ARGS(addr, pmd), + + TP_STRUCT__entry( + __field(unsigned long, addr) + __field(unsigned long, pmd) + ), + + TP_fast_assign( + __entry->addr = addr; + __entry->pmd = pmd; + ), + TP_printk("addr=%lx, pmd=%lx", __entry->addr, __entry->pmd) +); + +DEFINE_EVENT(migration_pmd, set_migration_pmd, + TP_PROTO(unsigned long addr, unsigned long pmd), + TP_ARGS(addr, pmd) +); + +DEFINE_EVENT(migration_pmd, remove_migration_pmd, + TP_PROTO(unsigned long addr, unsigned long pmd), + TP_ARGS(addr, pmd) +); #endif /* _TRACE_THP_H */ /* This part must be outside protection */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 005fab2f3b73a..035aa204ab8d5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -40,6 +40,9 @@ #include #include "internal.h" +#define CREATE_TRACE_POINTS +#include + /* * By default, transparent hugepage support is disabled in order to avoid * risking an increased memory footprint for applications that are not @@ -3131,6 +3134,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, set_pmd_at(mm, address, pvmw->pmd, pmdswp); page_remove_rmap(page, vma, true); put_page(page); + trace_set_migration_pmd(address, pmd_val(pmdswp)); } void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) @@ -3163,5 +3167,6 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) /* No need to invalidate - it was non-present before */ update_mmu_cache_pmd(vma, address, pvmw->pmd); + trace_remove_migration_pmd(address, pmd_val(pmde)); } #endif From 4cc79b3303f224a920f3aff21f3d231749d73384 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 24 Mar 2022 18:10:01 -0700 Subject: [PATCH 029/113] mm/migration: add trace events for base page and HugeTLB migrations This adds two trace events for base page and HugeTLB page migrations. These events, closely follow the implementation details like setting and removing of PTE migration entries, which are essential operations for migration. The new CREATE_TRACE_POINTS in covers both and based trace events. Hence drop redundant CREATE_TRACE_POINTS from other places which could have otherwise conflicted during build. Link: https://lkml.kernel.org/r/1643368182-9588-3-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reported-by: kernel test robot Cc: Steven Rostedt Cc: Ingo Molnar Cc: Zi Yan Cc: Naoya Horiguchi Cc: John Hubbard Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init.c | 1 - include/trace/events/migrate.h | 31 +++++++++++++++++++++++++++++++ mm/migrate.c | 4 +++- mm/rmap.c | 6 ++++++ 4 files changed, 40 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 4ba024d5b63ae..d8cfce221275e 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -31,7 +31,6 @@ * We need to define the tracepoints somewhere, and tlb.c * is only compiled when SMP=y. 
*/ -#define CREATE_TRACE_POINTS #include #include "mm_internal.h" diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index 779f3fad9ecd5..061b5128f335a 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -105,6 +105,37 @@ TRACE_EVENT(mm_migrate_pages_start, __print_symbolic(__entry->reason, MIGRATE_REASON)) ); +DECLARE_EVENT_CLASS(migration_pte, + + TP_PROTO(unsigned long addr, unsigned long pte, int order), + + TP_ARGS(addr, pte, order), + + TP_STRUCT__entry( + __field(unsigned long, addr) + __field(unsigned long, pte) + __field(int, order) + ), + + TP_fast_assign( + __entry->addr = addr; + __entry->pte = pte; + __entry->order = order; + ), + + TP_printk("addr=%lx, pte=%lx order=%d", __entry->addr, __entry->pte, __entry->order) +); + +DEFINE_EVENT(migration_pte, set_migration_pte, + TP_PROTO(unsigned long addr, unsigned long pte, int order), + TP_ARGS(addr, pte, order) +); + +DEFINE_EVENT(migration_pte, remove_migration_pte, + TP_PROTO(unsigned long addr, unsigned long pte, int order), + TP_ARGS(addr, pte, order) +); + #endif /* _TRACE_MIGRATE_H */ /* This part must be outside protection */ diff --git a/mm/migrate.c b/mm/migrate.c index 4f30ed37856f0..3d60823afd2d3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -53,7 +53,6 @@ #include -#define CREATE_TRACE_POINTS #include #include "internal.h" @@ -249,6 +248,9 @@ static bool remove_migration_pte(struct folio *folio, if (vma->vm_flags & VM_LOCKED) mlock_page_drain(smp_processor_id()); + trace_remove_migration_pte(pvmw.address, pte_val(pte), + compound_order(new)); + /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); } diff --git a/mm/rmap.c b/mm/rmap.c index ee1f10df984da..bfcc8e3d412f7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -76,7 +76,9 @@ #include +#define CREATE_TRACE_POINTS #include +#include #include "internal.h" @@ -1849,6 +1851,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, if (pte_swp_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); + trace_set_migration_pte(pvmw.address, pte_val(swp_pte), + compound_order(&folio->page)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. @@ -1917,6 +1921,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); + trace_set_migration_pte(address, pte_val(swp_pte), + compound_order(&folio->page)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. From 94ae8b83fefcdaf281e0bcfb76a19f5ed5019c8d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:10:04 -0700 Subject: [PATCH 030/113] kasan, page_alloc: deduplicate should_skip_kasan_poison Patch series "kasan, vmalloc, arm64: add vmalloc tagging support for SW/HW_TAGS", v6. This patchset adds vmalloc tagging support for SW_TAGS and HW_TAGS KASAN modes. About half of patches are cleanups I went for along the way. None of them seem to be important enough to go through stable, so I decided not to split them out into separate patches/series. The patchset is partially based on an early version of the HW_TAGS patchset by Vincenzo that had vmalloc support. Thus, I added a Co-developed-by tag into a few patches. SW_TAGS vmalloc tagging support is straightforward. 
It reuses all of the generic KASAN machinery, but uses shadow memory to store tags instead of magic values. Naturally, vmalloc tagging requires adding a few kasan_reset_tag() annotations to the vmalloc code. HW_TAGS vmalloc tagging support stands out. HW_TAGS KASAN is based on Arm MTE, which can only assigns tags to physical memory. As a result, HW_TAGS KASAN only tags vmalloc() allocations, which are backed by page_alloc memory. It ignores vmap() and others. This patch (of 39): Currently, should_skip_kasan_poison() has two definitions: one for when CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, one for when it's not. Instead of duplicating the checks, add a deferred_pages_enabled() helper and use it in a single should_skip_kasan_poison() definition. Also move should_skip_kasan_poison() closer to its caller and clarify all conditions in the comment. Link: https://lkml.kernel.org/r/cover.1643047180.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/658b79f5fb305edaf7dc16bc52ea870d3220d4a8.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Vincenzo Frascino Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: Peter Collingbourne Cc: Evgenii Stepanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 55 +++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e0b4596cde9b..3e7bbb5dae41e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -378,25 +378,9 @@ int page_group_by_mobility_disabled __read_mostly; */ static DEFINE_STATIC_KEY_TRUE(deferred_pages); -/* - * Calling kasan_poison_pages() only after deferred memory initialization - * has completed. Poisoning pages during deferred memory init will greatly - * lengthen the process and cause problem in large memory systems as the - * deferred pages initialization is done with interrupt disabled. - * - * Assuming that there will be no reference to those newly initialized - * pages before they are ever allocated, this should have no effect on - * KASAN memory tracking as the poison will be properly inserted at page - * allocation time. The only corner case is when pages are allocated by - * on-demand allocation and then freed again before the deferred pages - * initialization is done, but this is not likely to happen. - */ -static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +static inline bool deferred_pages_enabled(void) { - return static_branch_unlikely(&deferred_pages) || - (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - (fpi_flags & FPI_SKIP_KASAN_POISON)) || - PageSkipKASanPoison(page); + return static_branch_unlikely(&deferred_pages); } /* Returns true if the struct page for the pfn is uninitialised */ @@ -447,11 +431,9 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } #else -static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +static inline bool deferred_pages_enabled(void) { - return (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - (fpi_flags & FPI_SKIP_KASAN_POISON)) || - PageSkipKASanPoison(page); + return false; } static inline bool early_page_uninitialised(unsigned long pfn) @@ -1267,6 +1249,35 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) return ret; } +/* + * Skip KASAN memory poisoning when either: + * + * 1. 
Deferred memory initialization has not yet completed, + * see the explanation below. + * 2. Skipping poisoning is requested via FPI_SKIP_KASAN_POISON, + * see the comment next to it. + * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON, + * see the comment next to it. + * + * Poisoning pages during deferred memory init will greatly lengthen the + * process and cause problem in large memory systems as the deferred pages + * initialization is done with interrupt disabled. + * + * Assuming that there will be no reference to those newly initialized + * pages before they are ever allocated, this should have no effect on + * KASAN memory tracking as the poison will be properly inserted at page + * allocation time. The only corner case is when pages are allocated by + * on-demand allocation and then freed again before the deferred pages + * initialization is done, but this is not likely to happen. + */ +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +{ + return deferred_pages_enabled() || + (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) || + PageSkipKASanPoison(page); +} + static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags) { int i; From 5b2c07138cbd8c0c415c6d3ff5b8040532024814 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:10:07 -0700 Subject: [PATCH 031/113] kasan, page_alloc: move tag_clear_highpage out of kernel_init_free_pages Currently, kernel_init_free_pages() serves two purposes: it either only zeroes memory or zeroes both memory and memory tags via a different code path. As this function has only two callers, each using only one code path, this behaviour is confusing. Pull the code that zeroes both memory and tags out of kernel_init_free_pages(). As a result of this change, the code in free_pages_prepare() starts to look complicated, but this is improved in the few following patches. Those improvements are not integrated into this patch to make diffs easier to read. This patch does no functional changes. Link: https://lkml.kernel.org/r/7719874e68b23902629c7cf19f966c4fd5f57979.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3e7bbb5dae41e..0721ff0c90be1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1278,16 +1278,10 @@ static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) PageSkipKASanPoison(page); } -static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags) +static void kernel_init_free_pages(struct page *page, int numpages) { int i; - if (zero_tags) { - for (i = 0; i < numpages; i++) - tag_clear_highpage(page + i); - return; - } - /* s390's use of memset() could override KASAN redzones. 
*/ kasan_disable_current(); for (i = 0; i < numpages; i++) { @@ -1383,7 +1377,7 @@ static __always_inline bool free_pages_prepare(struct page *page, bool init = want_init_on_free(); if (init) - kernel_init_free_pages(page, 1 << order, false); + kernel_init_free_pages(page, 1 << order); if (!skip_kasan_poison) kasan_poison_pages(page, order, init); } @@ -2378,9 +2372,17 @@ inline void post_alloc_hook(struct page *page, unsigned int order, bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); kasan_unpoison_pages(page, order, init); - if (init) - kernel_init_free_pages(page, 1 << order, - gfp_flags & __GFP_ZEROTAGS); + + if (init) { + if (gfp_flags & __GFP_ZEROTAGS) { + int i; + + for (i = 0; i < 1 << order; i++) + tag_clear_highpage(page + i); + } else { + kernel_init_free_pages(page, 1 << order); + } + } } set_page_owner(page, order, gfp_flags); From 7c13c163e036c646b77753deacfe2f5478b654bc Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:10:10 -0700 Subject: [PATCH 032/113] kasan, page_alloc: merge kasan_free_pages into free_pages_prepare Currently, the code responsible for initializing and poisoning memory in free_pages_prepare() is scattered across two locations: kasan_free_pages() for HW_TAGS KASAN and free_pages_prepare() itself. This is confusing. This and a few following patches combine the code from these two locations. Along the way, these patches also simplify the performed checks to make them easier to follow. Replaces the only caller of kasan_free_pages() with its implementation. As kasan_has_integrated_init() is only true when CONFIG_KASAN_HW_TAGS is enabled, moving the code does no functional changes. This patch is not useful by itself but makes the simplifications in the following patches easier to follow. Link: https://lkml.kernel.org/r/303498d15840bb71905852955c6e2390ecc87139.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 8 -------- mm/kasan/common.c | 2 +- mm/kasan/hw_tags.c | 11 ----------- mm/page_alloc.c | 6 ++++-- 4 files changed, 5 insertions(+), 22 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index b6a93261c92a0..dd2161af84e9f 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -85,7 +85,6 @@ static inline void kasan_disable_current(void) {} #ifdef CONFIG_KASAN_HW_TAGS void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags); -void kasan_free_pages(struct page *page, unsigned int order); #else /* CONFIG_KASAN_HW_TAGS */ @@ -96,13 +95,6 @@ static __always_inline void kasan_alloc_pages(struct page *page, BUILD_BUG(); } -static __always_inline void kasan_free_pages(struct page *page, - unsigned int order) -{ - /* Only available for integrated init. */ - BUILD_BUG(); -} - #endif /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_has_integrated_init(void) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 92196562687b6..a0082fad48b12 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -387,7 +387,7 @@ static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) } /* - * The object will be poisoned by kasan_free_pages() or + * The object will be poisoned by kasan_poison_pages() or * kasan_slab_free_mempool(). 
*/ diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 7355cb534e4f8..0b8225add2e48 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -213,17 +213,6 @@ void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags) } } -void kasan_free_pages(struct page *page, unsigned int order) -{ - /* - * This condition should match the one in free_pages_prepare() in - * page_alloc.c. - */ - bool init = want_init_on_free(); - - kasan_poison_pages(page, order, init); -} - #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_enable_tagging_sync(void) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0721ff0c90be1..d16df446d2ca0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1364,15 +1364,17 @@ static __always_inline bool free_pages_prepare(struct page *page, /* * As memory initialization might be integrated into KASAN, - * kasan_free_pages and kernel_init_free_pages must be + * KASAN poisoning and memory initialization code must be * kept together to avoid discrepancies in behavior. * * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ if (kasan_has_integrated_init()) { + bool init = want_init_on_free(); + if (!skip_kasan_poison) - kasan_free_pages(page, order); + kasan_poison_pages(page, order, init); } else { bool init = want_init_on_free(); From c3525330a04d0a47b4e11f5cf6d44e21a6520885 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:10:13 -0700 Subject: [PATCH 033/113] kasan, page_alloc: simplify kasan_poison_pages call site Simplify the code around calling kasan_poison_pages() in free_pages_prepare(). This patch does no functional changes. Link: https://lkml.kernel.org/r/ae4f9bcf071577258e786bcec4798c145d718c46.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d16df446d2ca0..4f1257e397601 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1298,6 +1298,7 @@ static __always_inline bool free_pages_prepare(struct page *page, { int bad = 0; bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); + bool init = want_init_on_free(); VM_BUG_ON_PAGE(PageTail(page), page); @@ -1370,19 +1371,10 @@ static __always_inline bool free_pages_prepare(struct page *page, * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - if (kasan_has_integrated_init()) { - bool init = want_init_on_free(); - - if (!skip_kasan_poison) - kasan_poison_pages(page, order, init); - } else { - bool init = want_init_on_free(); - - if (init) - kernel_init_free_pages(page, 1 << order); - if (!skip_kasan_poison) - kasan_poison_pages(page, order, init); - } + if (init && !kasan_has_integrated_init()) + kernel_init_free_pages(page, 1 << order); + if (!skip_kasan_poison) + kasan_poison_pages(page, order, init); /* * arch_free_page() can make the page's contents inaccessible. 
s390 From db8a04774a8195c529f1e87cd1df87f116559b52 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:10:16 -0700 Subject: [PATCH 034/113] kasan, page_alloc: init memory of skipped pages on free Since commit 7a3b83537188 ("kasan: use separate (un)poison implementation for integrated init"), when all init, kasan_has_integrated_init(), and skip_kasan_poison are true, free_pages_prepare() doesn't initialize the page. This is wrong. Fix it by remembering whether kasan_poison_pages() performed initialization, and call kernel_init_free_pages() if it didn't. Reordering kasan_poison_pages() and kernel_init_free_pages() is OK, since kernel_init_free_pages() can handle poisoned memory. Link: https://lkml.kernel.org/r/1d97df75955e52727a3dc1c4e33b3b50506fc3fd.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f1257e397601..3da959402db8a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1371,11 +1371,16 @@ static __always_inline bool free_pages_prepare(struct page *page, * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - if (init && !kasan_has_integrated_init()) - kernel_init_free_pages(page, 1 << order); - if (!skip_kasan_poison) + if (!skip_kasan_poison) { kasan_poison_pages(page, order, init); + /* Memory is already initialized if KASAN did it internally. */ + if (kasan_has_integrated_init()) + init = false; + } + if (init) + kernel_init_free_pages(page, 1 << order); + /* * arch_free_page() can make the page's contents inaccessible. s390 * does this. So nothing which can access the page's contents should From 487a32ec24be819e747af8c2ab0d5c515508086a Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:10:19 -0700 Subject: [PATCH 035/113] kasan: drop skip_kasan_poison variable in free_pages_prepare skip_kasan_poison is only used in a single place. Call should_skip_kasan_poison() directly for simplicity. Link: https://lkml.kernel.org/r/1d33212e79bc9ef0b4d3863f903875823e89046f.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Suggested-by: Marco Elver Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3da959402db8a..94a220d661b14 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1297,7 +1297,6 @@ static __always_inline bool free_pages_prepare(struct page *page, unsigned int order, bool check_free, fpi_t fpi_flags) { int bad = 0; - bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); bool init = want_init_on_free(); VM_BUG_ON_PAGE(PageTail(page), page); @@ -1371,7 +1370,7 @@ static __always_inline bool free_pages_prepare(struct page *page, * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. 
*/ - if (!skip_kasan_poison) { + if (!should_skip_kasan_poison(page, fpi_flags)) { kasan_poison_pages(page, order, init); /* Memory is already initialized if KASAN did it internally. */ From c82ce3195fd17e57a370e4dc8f36c195b8807b95 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:10:22 -0700 Subject: [PATCH 036/113] mm: clarify __GFP_ZEROTAGS comment __GFP_ZEROTAGS is intended as an optimization: if memory is zeroed during allocation, it's possible to set memory tags at the same time with little performance impact. Clarify this intention of __GFP_ZEROTAGS in the comment. Link: https://lkml.kernel.org/r/cdffde013973c5634a447513e10ec0d21e8eee29.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 20f6fbe129931..72dc4ca75cf3a 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -232,8 +232,10 @@ struct vm_area_struct; * * %__GFP_ZERO returns a zeroed page on success. * - * %__GFP_ZEROTAGS returns a page with zeroed memory tags on success, if - * __GFP_ZERO is set. + * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself + * is being zeroed (either via __GFP_ZERO or via init_on_alloc). This flag is + * intended for optimization: setting memory tags at the same time as zeroing + * memory has minimal additional performace impact. * * %__GFP_SKIP_KASAN_POISON returns a page which does not need to be poisoned * on deallocation. Typically used for userspace pages. Currently only has an From 1c0e5b24f11707e8c8300060bfdd9ae4ae76611b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:10:25 -0700 Subject: [PATCH 037/113] kasan: only apply __GFP_ZEROTAGS when memory is zeroed __GFP_ZEROTAGS should only be effective if memory is being zeroed. Currently, hardware tag-based KASAN violates this requirement. Fix by including an initialization check along with checking for __GFP_ZEROTAGS. Link: https://lkml.kernel.org/r/f4f4593f7f675262d29d07c1938db5bd0cd5e285.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/hw_tags.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 0b8225add2e48..c643740b85996 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -199,11 +199,12 @@ void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags) * page_alloc.c. 
*/ bool init = !want_init_on_free() && want_init_on_alloc(flags); + bool init_tags = init && (flags & __GFP_ZEROTAGS); if (flags & __GFP_SKIP_KASAN_POISON) SetPageSkipKASanPoison(page); - if (flags & __GFP_ZEROTAGS) { + if (init_tags) { int i; for (i = 0; i != 1 << order; ++i) From b8491b9052fef036aac0ca3afc18ef223aef6f61 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:10:28 -0700 Subject: [PATCH 038/113] kasan, page_alloc: refactor init checks in post_alloc_hook Separate code for zeroing memory from the code clearing tags in post_alloc_hook(). This patch is not useful by itself but makes the simplifications in the following patches easier to follow. This patch does no functional changes. Link: https://lkml.kernel.org/r/2283fde963adfd8a2b29a92066f106cc16661a3c.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 94a220d661b14..d2d6e48d2d836 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2368,19 +2368,21 @@ inline void post_alloc_hook(struct page *page, unsigned int order, kasan_alloc_pages(page, order, gfp_flags); } else { bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); + bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); kasan_unpoison_pages(page, order, init); - if (init) { - if (gfp_flags & __GFP_ZEROTAGS) { - int i; + if (init_tags) { + int i; - for (i = 0; i < 1 << order; i++) - tag_clear_highpage(page + i); - } else { - kernel_init_free_pages(page, 1 << order); - } + for (i = 0; i < 1 << order; i++) + tag_clear_highpage(page + i); + + init = false; } + + if (init) + kernel_init_free_pages(page, 1 << order); } set_page_owner(page, order, gfp_flags); From b42090ae6f3aa07b0a39403545d688489548a6a8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:10:31 -0700 Subject: [PATCH 039/113] kasan, page_alloc: merge kasan_alloc_pages into post_alloc_hook Currently, the code responsible for initializing and poisoning memory in post_alloc_hook() is scattered across two locations: kasan_alloc_pages() hook for HW_TAGS KASAN and post_alloc_hook() itself. This is confusing. This and a few following patches combine the code from these two locations. Along the way, these patches do a step-by-step restructure the many performed checks to make them easier to follow. Replace the only caller of kasan_alloc_pages() with its implementation. As kasan_has_integrated_init() is only true when CONFIG_KASAN_HW_TAGS is enabled, moving the code does no functional changes. Also move init and init_tags variables definitions out of kasan_has_integrated_init() clause in post_alloc_hook(), as they have the same values regardless of what the if condition evaluates to. This patch is not useful by itself but makes the simplifications in the following patches easier to follow. 
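To see where this restructuring is heading, the init / init_tags decision can be modeled in a few lines of ordinary C. The sketch below is not kernel code: GFP_ZEROTAGS, the two want_init_on_*_opt booleans and model_post_alloc_hook() are stand-ins invented for the example, and the printf() calls only mark the paths that the real code takes via tag_clear_highpage() and kernel_init_free_pages().

  #include <stdbool.h>
  #include <stdio.h>

  /* Illustrative stand-in, not the kernel's real GFP bit. */
  #define GFP_ZEROTAGS (1u << 0)

  static bool want_init_on_alloc_opt = true;	/* as if init_on_alloc=1 */
  static bool want_init_on_free_opt = false;	/* as if init_on_free=0 */

  /*
   * Model of the checks above: initialize at allocation time only when
   * init-on-alloc is wanted and init-on-free is not (free already
   * initialized the memory in that case), and zero tags only when memory
   * is initialized and __GFP_ZEROTAGS was passed.
   */
  static void model_post_alloc_hook(unsigned int gfp_flags)
  {
  	bool init = !want_init_on_free_opt && want_init_on_alloc_opt;
  	bool init_tags = init && (gfp_flags & GFP_ZEROTAGS);

  	if (init_tags) {
  		printf("zero memory and tags together\n");
  		init = false;	/* memory already initialized on this path */
  	}
  	if (init)
  		printf("zero memory only\n");
  }

  int main(void)
  {
  	model_post_alloc_hook(0);
  	model_post_alloc_hook(GFP_ZEROTAGS);
  	return 0;
  }

Running the model with and without the flag shows the two initialization paths that the next patches keep pulling apart.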
Link: https://lkml.kernel.org/r/5ac7e0b30f5cbb177ec363ddd7878a3141289592.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 9 --------- mm/kasan/common.c | 2 +- mm/kasan/hw_tags.c | 22 ---------------------- mm/page_alloc.c | 20 +++++++++++++++----- 4 files changed, 16 insertions(+), 37 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index dd2161af84e9f..4ed94616c8f07 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -84,17 +84,8 @@ static inline void kasan_disable_current(void) {} #ifdef CONFIG_KASAN_HW_TAGS -void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags); - #else /* CONFIG_KASAN_HW_TAGS */ -static __always_inline void kasan_alloc_pages(struct page *page, - unsigned int order, gfp_t flags) -{ - /* Only available for integrated init. */ - BUILD_BUG(); -} - #endif /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_has_integrated_init(void) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index a0082fad48b12..d9079ec11f313 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -538,7 +538,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, return NULL; /* - * The object has already been unpoisoned by kasan_alloc_pages() for + * The object has already been unpoisoned by kasan_unpoison_pages() for * alloc_pages() or by kasan_krealloc() for krealloc(). */ diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index c643740b85996..76cf2b6229c79 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -192,28 +192,6 @@ void __init kasan_init_hw_tags(void) kasan_stack_collection_enabled() ? "on" : "off"); } -void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags) -{ - /* - * This condition should match the one in post_alloc_hook() in - * page_alloc.c. - */ - bool init = !want_init_on_free() && want_init_on_alloc(flags); - bool init_tags = init && (flags & __GFP_ZEROTAGS); - - if (flags & __GFP_SKIP_KASAN_POISON) - SetPageSkipKASanPoison(page); - - if (init_tags) { - int i; - - for (i = 0; i != 1 << order; ++i) - tag_clear_highpage(page + i); - } else { - kasan_unpoison_pages(page, order, init); - } -} - #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_enable_tagging_sync(void) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d2d6e48d2d836..db82facf14e75 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2346,6 +2346,9 @@ static inline bool check_new_pcp(struct page *page, unsigned int order) inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { + bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); + bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); + set_page_private(page, 0); set_page_refcounted(page); @@ -2361,15 +2364,22 @@ inline void post_alloc_hook(struct page *page, unsigned int order, /* * As memory initialization might be integrated into KASAN, - * kasan_alloc_pages and kernel_init_free_pages must be + * KASAN unpoisoning and memory initializion code must be * kept together to avoid discrepancies in behavior. 
*/ if (kasan_has_integrated_init()) { - kasan_alloc_pages(page, order, gfp_flags); - } else { - bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); - bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); + if (gfp_flags & __GFP_SKIP_KASAN_POISON) + SetPageSkipKASanPoison(page); + + if (init_tags) { + int i; + for (i = 0; i != 1 << order; ++i) + tag_clear_highpage(page + i); + } else { + kasan_unpoison_pages(page, order, init); + } + } else { kasan_unpoison_pages(page, order, init); if (init_tags) { From 9294b1281d0a212ef775a175b98ce71e6ac27b90 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:10:34 -0700 Subject: [PATCH 040/113] kasan, page_alloc: combine tag_clear_highpage calls in post_alloc_hook Move tag_clear_highpage() loops out of the kasan_has_integrated_init() clause as a code simplification. This patch does no functional changes. Link: https://lkml.kernel.org/r/587e3fc36358b88049320a89cc8dc6deaecb0cda.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index db82facf14e75..9d14ec37fd12a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2367,30 +2367,30 @@ inline void post_alloc_hook(struct page *page, unsigned int order, * KASAN unpoisoning and memory initializion code must be * kept together to avoid discrepancies in behavior. */ + + /* + * If memory tags should be zeroed (which happens only when memory + * should be initialized as well). + */ + if (init_tags) { + int i; + + /* Initialize both memory and tags. */ + for (i = 0; i != 1 << order; ++i) + tag_clear_highpage(page + i); + + /* Note that memory is already initialized by the loop above. */ + init = false; + } if (kasan_has_integrated_init()) { if (gfp_flags & __GFP_SKIP_KASAN_POISON) SetPageSkipKASanPoison(page); - if (init_tags) { - int i; - - for (i = 0; i != 1 << order; ++i) - tag_clear_highpage(page + i); - } else { + if (!init_tags) kasan_unpoison_pages(page, order, init); - } } else { kasan_unpoison_pages(page, order, init); - if (init_tags) { - int i; - - for (i = 0; i < 1 << order; i++) - tag_clear_highpage(page + i); - - init = false; - } - if (init) kernel_init_free_pages(page, 1 << order); } From 89b2711633281b3d712b1df96c5065a82ccbfb9c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:10:37 -0700 Subject: [PATCH 041/113] kasan, page_alloc: move SetPageSkipKASanPoison in post_alloc_hook Pull the SetPageSkipKASanPoison() call in post_alloc_hook() out of the big if clause for better code readability. This also allows for more simplifications in the following patches. Also turn the kasan_has_integrated_init() check into the proper kasan_hw_tags_enabled() one. These checks evaluate to the same value, but logically skipping kasan poisoning has nothing to do with integrated init. 
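The same distinction can be spelled out in a small standalone model. Everything below is illustrative rather than kernel code: the two model_* predicates stand in for kasan_hw_tags_enabled() and kasan_has_integrated_init() (which, as noted above, currently evaluate to the same value), GFP_SKIP_KASAN_POISON is an arbitrary bit, and the boolean passed by pointer plays the role of the SkipKASanPoison page flag.

  #include <stdbool.h>
  #include <stdio.h>

  /* In current kernels both predicates are true exactly when hardware
   * tag-based KASAN is enabled, but they answer different questions. */
  static bool hw_tags_enabled = true;

  /* "Can poisoning be skipped for individual pages?" */
  static bool model_kasan_hw_tags_enabled(void)
  {
  	return hw_tags_enabled;
  }

  /* "Does KASAN initialize memory while (un)poisoning?" */
  static bool model_kasan_has_integrated_init(void)
  {
  	return hw_tags_enabled;
  }

  #define GFP_SKIP_KASAN_POISON	(1u << 0)	/* arbitrary bit for the model */

  /* Propagate the caller's request to skip poisoning into a per-page flag,
   * keyed on the hardware-tags predicate rather than integrated init. */
  static void model_propagate_skip_poison(unsigned int gfp_flags, bool *page_flag)
  {
  	if (model_kasan_hw_tags_enabled() && (gfp_flags & GFP_SKIP_KASAN_POISON))
  		*page_flag = true;
  }

  int main(void)
  {
  	bool skip = false;

  	model_propagate_skip_poison(GFP_SKIP_KASAN_POISON, &skip);
  	printf("hw tags: %d, integrated init: %d, skip poison on free: %d\n",
  	       model_kasan_hw_tags_enabled(), model_kasan_has_integrated_init(),
  	       skip);
  	return 0;
  }

Keeping the two predicates separate in the model mirrors the point of the change: whether a page may skip poisoning is a property of the HW_TAGS mode, not of integrated init.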
Link: https://lkml.kernel.org/r/7214c1698b754ccfaa44a792113c95cc1f807c48.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9d14ec37fd12a..7a5040d8e48eb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2383,9 +2383,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order, init = false; } if (kasan_has_integrated_init()) { - if (gfp_flags & __GFP_SKIP_KASAN_POISON) - SetPageSkipKASanPoison(page); - if (!init_tags) kasan_unpoison_pages(page, order, init); } else { @@ -2394,6 +2391,9 @@ inline void post_alloc_hook(struct page *page, unsigned int order, if (init) kernel_init_free_pages(page, 1 << order); } + /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */ + if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON)) + SetPageSkipKASanPoison(page); set_page_owner(page, order, gfp_flags); page_table_check_alloc(page, order); From 7e3cbba65de22f20ad18a2de09f65238bfe84c5b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:10:40 -0700 Subject: [PATCH 042/113] kasan, page_alloc: move kernel_init_free_pages in post_alloc_hook Pull the kernel_init_free_pages() call in post_alloc_hook() out of the big if clause for better code readability. This also allows for more simplifications in the following patch. This patch does no functional changes. Link: https://lkml.kernel.org/r/a7a76456501eb37ddf9fca6529cee9555e59cdb1.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7a5040d8e48eb..386898f1a7c39 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2383,14 +2383,18 @@ inline void post_alloc_hook(struct page *page, unsigned int order, init = false; } if (kasan_has_integrated_init()) { - if (!init_tags) + if (!init_tags) { kasan_unpoison_pages(page, order, init); + + /* Note that memory is already initialized by KASAN. */ + init = false; + } } else { kasan_unpoison_pages(page, order, init); - - if (init) - kernel_init_free_pages(page, 1 << order); } + /* If memory is still not initialized, do it now. */ + if (init) + kernel_init_free_pages(page, 1 << order); /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */ if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON)) SetPageSkipKASanPoison(page); From e9d0ca9228162f5442b751edf8c9721b15dcfa1e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:10:43 -0700 Subject: [PATCH 043/113] kasan, page_alloc: rework kasan_unpoison_pages call site Rework the checks around kasan_unpoison_pages() call in post_alloc_hook(). The logical condition for calling this function is: - If a software KASAN mode is enabled, we need to mark shadow memory. 
- Otherwise, HW_TAGS KASAN is enabled, and it only makes sense to set tags if they haven't already been cleared by tag_clear_highpage(), which is indicated by init_tags. This patch concludes the changes for post_alloc_hook(). Link: https://lkml.kernel.org/r/0ecebd0d7ccd79150e3620ea4185a32d3dfe912f.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 386898f1a7c39..350bb27dc4bb6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2382,15 +2382,20 @@ inline void post_alloc_hook(struct page *page, unsigned int order, /* Note that memory is already initialized by the loop above. */ init = false; } - if (kasan_has_integrated_init()) { - if (!init_tags) { - kasan_unpoison_pages(page, order, init); + /* + * If either a software KASAN mode is enabled, or, + * in the case of hardware tag-based KASAN, + * if memory tags have not been cleared via tag_clear_highpage(). + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC) || + IS_ENABLED(CONFIG_KASAN_SW_TAGS) || + kasan_hw_tags_enabled() && !init_tags) { + /* Mark shadow memory or set memory tags. */ + kasan_unpoison_pages(page, order, init); - /* Note that memory is already initialized by KASAN. */ + /* Note that memory is already initialized by KASAN. */ + if (kasan_has_integrated_init()) init = false; - } - } else { - kasan_unpoison_pages(page, order, init); } /* If memory is still not initialized, do it now. */ if (init) From fe1ac91edb9a6170255c5c6882639bd4f2b20eb8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:10:46 -0700 Subject: [PATCH 044/113] kasan: clean up metadata byte definitions Most of the metadata byte values are only used for Generic KASAN. Remove KASAN_KMALLOC_FREETRACK definition for !CONFIG_KASAN_GENERIC case, and put it along with other metadata values for the Generic mode under a corresponding ifdef. 
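As a side note on how such metadata bytes get used, the following standalone decoder maps a shadow byte to a short description. The byte values and their meanings are taken from the header as quoted here; shadow_byte_name() itself is invented for the example and is not a kernel interface.

  #include <stdio.h>

  /* Metadata byte values used by Generic KASAN, from mm/kasan/kasan.h. */
  #define KASAN_PAGE_REDZONE		0xFE
  #define KASAN_KMALLOC_REDZONE		0xFC
  #define KASAN_KMALLOC_FREE		0xFB
  #define KASAN_KMALLOC_FREETRACK	0xFA
  #define KASAN_GLOBAL_REDZONE		0xF9
  #define KASAN_VMALLOC_INVALID		0xF8

  static const char *shadow_byte_name(unsigned char b)
  {
  	switch (b) {
  	case KASAN_PAGE_REDZONE:	return "redzone for kmalloc_large allocation";
  	case KASAN_KMALLOC_REDZONE:	return "redzone inside slub object";
  	case KASAN_KMALLOC_FREE:	return "object was freed";
  	case KASAN_KMALLOC_FREETRACK:	return "object was freed, free track set";
  	case KASAN_GLOBAL_REDZONE:	return "redzone for global variable";
  	case KASAN_VMALLOC_INVALID:	return "unallocated space in vmapped page";
  	default:			return "no special meaning in this model";
  	}
  }

  int main(void)
  {
  	printf("0xFB -> %s\n", shadow_byte_name(0xFB));
  	printf("0xF8 -> %s\n", shadow_byte_name(0xF8));
  	return 0;
  }

The default case is only a placeholder; real reports distinguish more states than this model does.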
Link: https://lkml.kernel.org/r/ac11d6e9e007c95e472e8fdd22efb6074ef3c6d8.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index c17fa8d26ffe5..952cd6f9ca464 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -71,15 +71,16 @@ static inline bool kasan_sync_fault_possible(void) #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ -#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */ #else #define KASAN_FREE_PAGE KASAN_TAG_INVALID #define KASAN_PAGE_REDZONE KASAN_TAG_INVALID #define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID #define KASAN_KMALLOC_FREE KASAN_TAG_INVALID -#define KASAN_KMALLOC_FREETRACK KASAN_TAG_INVALID #endif +#ifdef CONFIG_KASAN_GENERIC + +#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */ #define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */ #define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */ @@ -110,6 +111,8 @@ static inline bool kasan_sync_fault_possible(void) #define KASAN_ABI_VERSION 1 #endif +#endif /* CONFIG_KASAN_GENERIC */ + /* Metadata layout customization. */ #define META_BYTES_PER_BLOCK 1 #define META_BLOCKS_PER_ROW 16 From 00a756133bb91388e63257a53e250865b94a0970 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:10:49 -0700 Subject: [PATCH 045/113] kasan: define KASAN_VMALLOC_INVALID for SW_TAGS In preparation for adding vmalloc support to SW_TAGS KASAN, provide a KASAN_VMALLOC_INVALID definition for it. HW_TAGS KASAN won't be using this value, as it falls back onto page_alloc for poisoning freed vmalloc() memory. 
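To make the role of this value concrete, here is a self-contained model that poisons the shadow of a freed vmalloc-style region and then checks accesses against it. Only the Generic-mode byte value 0xF8 and the rounding of the poisoned size up to a whole granule reflect this series; the flat one-byte-per-8-bytes shadow array, the model_* helpers and the sizes in main() are simplifications invented for the example.

  #include <stdio.h>
  #include <string.h>

  #define KASAN_GRANULE_SIZE	8
  #define KASAN_VMALLOC_INVALID	0xF8	/* Generic-mode value from above */
  #define REGION_GRANULES	16

  /* One shadow byte per granule; 0 means fully accessible. */
  static unsigned char shadow[REGION_GRANULES];

  /* Model of poisoning a freed vmalloc region. */
  static void model_poison_vmalloc(unsigned long offset, unsigned long size)
  {
  	/* Round up to whole granules before poisoning. */
  	size = (size + KASAN_GRANULE_SIZE - 1) & ~(KASAN_GRANULE_SIZE - 1UL);
  	memset(shadow + offset / KASAN_GRANULE_SIZE, KASAN_VMALLOC_INVALID,
  	       size / KASAN_GRANULE_SIZE);
  }

  /* Model of the shadow check an access would perform. */
  static int model_is_poisoned(unsigned long offset)
  {
  	return shadow[offset / KASAN_GRANULE_SIZE] == KASAN_VMALLOC_INVALID;
  }

  int main(void)
  {
  	model_poison_vmalloc(0, 40);	/* "free" the first 40 bytes */
  	printf("offset 16 poisoned: %d\n", model_is_poisoned(16));
  	printf("offset 64 poisoned: %d\n", model_is_poisoned(64));
  	return 0;
  }

An access that resolves to a granule still holding 0xF8 is how a use of unallocated vmapped space shows up to the Generic-mode shadow check.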
Link: https://lkml.kernel.org/r/1daaaafeb148a7ae8285265edc97d7ca07b6a07d.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 952cd6f9ca464..020f3e57a03f5 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -71,18 +71,19 @@ static inline bool kasan_sync_fault_possible(void) #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ +#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */ #else #define KASAN_FREE_PAGE KASAN_TAG_INVALID #define KASAN_PAGE_REDZONE KASAN_TAG_INVALID #define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID #define KASAN_KMALLOC_FREE KASAN_TAG_INVALID +#define KASAN_VMALLOC_INVALID KASAN_TAG_INVALID /* only for SW_TAGS */ #endif #ifdef CONFIG_KASAN_GENERIC #define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */ #define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */ -#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */ /* * Stack redzone shadow values From 63840de296472f3914bb933b11ba2b764590755e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:10:52 -0700 Subject: [PATCH 046/113] kasan, x86, arm64, s390: rename functions for modules shadow Rename kasan_free_shadow to kasan_free_module_shadow and kasan_module_alloc to kasan_alloc_module_shadow. These functions are used to allocate/free shadow memory for kernel modules when KASAN_VMALLOC is not enabled. The new names better reflect their purpose. Also reword the comment next to their declaration to improve clarity. 
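As background on what backing module allocations with shadow memory involves, the sketch below shows the usual address arithmetic: one shadow byte covers a fixed-size granule, so the shadow for a module mapping is the mapping's range shifted and scaled down. The 8-byte granule, the shadow offset and the addresses in main() are assumptions made up for the example rather than values taken from any architecture in this series.

  #include <stdio.h>

  /* Assumed parameters, for illustration only. */
  #define SHADOW_SCALE_SHIFT	3			/* 8 bytes of memory per shadow byte */
  #define SHADOW_OFFSET		0xd000000000000000ULL	/* arbitrary placement */

  /* Translate a memory address to the address of its shadow byte. */
  static unsigned long long mem_to_shadow(unsigned long long addr)
  {
  	return (addr >> SHADOW_SCALE_SHIFT) + SHADOW_OFFSET;
  }

  /* Shadow needed to back a module mapping of 'size' bytes. */
  static unsigned long long module_shadow_size(unsigned long long size)
  {
  	return size >> SHADOW_SCALE_SHIFT;
  }

  int main(void)
  {
  	unsigned long long module_base = 0xffffffffa0000000ULL;	/* hypothetical */
  	unsigned long long module_size = 1ULL << 20;			/* 1 MiB */

  	printf("shadow base: 0x%llx\n", mem_to_shadow(module_base));
  	printf("shadow size: 0x%llx bytes\n", module_shadow_size(module_size));
  	return 0;
  }

With KASAN_VMALLOC enabled this bookkeeping is handled by the generic vmalloc shadow code instead, which is why the renamed helpers are only built without it.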
Link: https://lkml.kernel.org/r/36db32bde765d5d0b856f77d2d806e838513fe84.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Catalin Marinas Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/kernel/module.c | 2 +- arch/s390/kernel/module.c | 2 +- arch/x86/kernel/module.c | 2 +- include/linux/kasan.h | 14 +++++++------- mm/kasan/shadow.c | 4 ++-- mm/vmalloc.c | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 309a27553c875..d3a1fa8183487 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -58,7 +58,7 @@ void *module_alloc(unsigned long size) PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index b032e556eeb71..a7aefc278909b 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -45,7 +45,7 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 96d7c27b7093d..504ea65987e87 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -78,7 +78,7 @@ void *module_alloc(unsigned long size) MODULES_END, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 4ed94616c8f07..b450c76531375 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -433,17 +433,17 @@ static inline void kasan_populate_early_vm_area_shadow(void *start, !defined(CONFIG_KASAN_VMALLOC) /* - * These functions provide a special case to support backing module - * allocations with real shadow memory. With KASAN vmalloc, the special - * case is unnecessary, as the work is handled in the generic case. + * These functions allocate and free shadow memory for kernel modules. + * They are only required when KASAN_VMALLOC is not supported, as otherwise + * shadow memory is allocated by the generic vmalloc handlers. 
*/ -int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask); -void kasan_free_shadow(const struct vm_struct *vm); +int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask); +void kasan_free_module_shadow(const struct vm_struct *vm); #else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ -static inline int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { return 0; } -static inline void kasan_free_shadow(const struct vm_struct *vm) {} +static inline int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) { return 0; } +static inline void kasan_free_module_shadow(const struct vm_struct *vm) {} #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 94136f84b4497..e5c4393eb861e 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -498,7 +498,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, #else /* CONFIG_KASAN_VMALLOC */ -int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) +int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) { void *ret; size_t scaled_size; @@ -534,7 +534,7 @@ int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) return -ENOMEM; } -void kasan_free_shadow(const struct vm_struct *vm) +void kasan_free_module_shadow(const struct vm_struct *vm) { if (vm->flags & VM_KASAN) vfree(kasan_mem_to_shadow(vm->addr)); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 99e0f3e8d1a53..feecd614bb773 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2547,7 +2547,7 @@ struct vm_struct *remove_vm_area(const void *addr) va->vm = NULL; spin_unlock(&vmap_area_lock); - kasan_free_shadow(vm); + kasan_free_module_shadow(vm); free_unmap_vmap_area(va); return vm; From 0b7ccc70ee1d5b499d1626c9d28f729507b1c036 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:10:55 -0700 Subject: [PATCH 047/113] kasan, vmalloc: drop outdated VM_KASAN comment The comment about VM_KASAN in include/linux/vmalloc.c is outdated. VM_KASAN is currently only used to mark vm_areas allocated for kernel modules when CONFIG_KASAN_VMALLOC is disabled. Drop the comment. Link: https://lkml.kernel.org/r/780395afea83a147b3b5acc36cf2e38f7f8479f9.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 5a0c3b5568484..2ca95c7db4638 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -35,17 +35,6 @@ struct notifier_block; /* in notifier.h */ #define VM_DEFER_KMEMLEAK 0 #endif -/* - * VM_KASAN is used slightly differently depending on CONFIG_KASAN_VMALLOC. - * - * If IS_ENABLED(CONFIG_KASAN_VMALLOC), VM_KASAN is set on a vm_struct after - * shadow memory has been mapped. It's used to handle allocation errors so that - * we don't try to poison shadow on free if it was never allocated. - * - * Otherwise, VM_KASAN is set for kasan_module_alloc() allocations and used to - * determine which allocations need the module shadow freed. 
- */ - /* bits [20..32] reserved for arch specific ioremap internals */ /* From 5bd9bae22a455321327982d0b595a7e76d425fd0 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:10:58 -0700 Subject: [PATCH 048/113] kasan: reorder vmalloc hooks Group functions that [de]populate shadow memory for vmalloc. Group functions that [un]poison memory for vmalloc. This patch does no functional changes but prepares KASAN code for adding vmalloc support to HW_TAGS KASAN. Link: https://lkml.kernel.org/r/aeef49eb249c206c4c9acce2437728068da74c28.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 20 +++++++++----------- mm/kasan/shadow.c | 43 ++++++++++++++++++++++--------------------- 2 files changed, 31 insertions(+), 32 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index b450c76531375..5f9ed1372eef5 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -397,34 +397,32 @@ static inline void kasan_init_hw_tags(void) { } #ifdef CONFIG_KASAN_VMALLOC +void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); int kasan_populate_vmalloc(unsigned long addr, unsigned long size); -void kasan_poison_vmalloc(const void *start, unsigned long size); -void kasan_unpoison_vmalloc(const void *start, unsigned long size); void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); -void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); +void kasan_unpoison_vmalloc(const void *start, unsigned long size); +void kasan_poison_vmalloc(const void *start, unsigned long size); #else /* CONFIG_KASAN_VMALLOC */ +static inline void kasan_populate_early_vm_area_shadow(void *start, + unsigned long size) { } static inline int kasan_populate_vmalloc(unsigned long start, unsigned long size) { return 0; } - -static inline void kasan_poison_vmalloc(const void *start, unsigned long size) -{ } -static inline void kasan_unpoison_vmalloc(const void *start, unsigned long size) -{ } static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, - unsigned long free_region_end) {} + unsigned long free_region_end) { } -static inline void kasan_populate_early_vm_area_shadow(void *start, - unsigned long size) +static inline void kasan_unpoison_vmalloc(const void *start, unsigned long size) +{ } +static inline void kasan_poison_vmalloc(const void *start, unsigned long size) { } #endif /* CONFIG_KASAN_VMALLOC */ diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index e5c4393eb861e..bf7ab62fbfb94 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -345,27 +345,6 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) return 0; } -/* - * Poison the shadow for a vmalloc region. Called as part of the - * freeing process at the time the region is freed. 
- */ -void kasan_poison_vmalloc(const void *start, unsigned long size) -{ - if (!is_vmalloc_or_module_addr(start)) - return; - - size = round_up(size, KASAN_GRANULE_SIZE); - kasan_poison(start, size, KASAN_VMALLOC_INVALID, false); -} - -void kasan_unpoison_vmalloc(const void *start, unsigned long size) -{ - if (!is_vmalloc_or_module_addr(start)) - return; - - kasan_unpoison(start, size, false); -} - static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, void *unused) { @@ -496,6 +475,28 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, } } + +void kasan_unpoison_vmalloc(const void *start, unsigned long size) +{ + if (!is_vmalloc_or_module_addr(start)) + return; + + kasan_unpoison(start, size, false); +} + +/* + * Poison the shadow for a vmalloc region. Called as part of the + * freeing process at the time the region is freed. + */ +void kasan_poison_vmalloc(const void *start, unsigned long size) +{ + if (!is_vmalloc_or_module_addr(start)) + return; + + size = round_up(size, KASAN_GRANULE_SIZE); + kasan_poison(start, size, KASAN_VMALLOC_INVALID, false); +} + #else /* CONFIG_KASAN_VMALLOC */ int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) From 579fb0ac085be2015529a5f7bd1061899a6374de Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:11:01 -0700 Subject: [PATCH 049/113] kasan: add wrappers for vmalloc hooks Add wrappers around functions that [un]poison memory for vmalloc allocations. These functions will be used by HW_TAGS KASAN and therefore need to be disabled when kasan=off command line argument is provided. This patch does no functional changes for software KASAN modes. Link: https://lkml.kernel.org/r/3b8728eac438c55389fb0f9a8a2145d71dd77487.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 17 +++++++++++++++-- mm/kasan/shadow.c | 5 ++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 5f9ed1372eef5..a6d6fdefb278d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -403,8 +403,21 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); -void kasan_unpoison_vmalloc(const void *start, unsigned long size); -void kasan_poison_vmalloc(const void *start, unsigned long size); +void __kasan_unpoison_vmalloc(const void *start, unsigned long size); +static __always_inline void kasan_unpoison_vmalloc(const void *start, + unsigned long size) +{ + if (kasan_enabled()) + __kasan_unpoison_vmalloc(start, size); +} + +void __kasan_poison_vmalloc(const void *start, unsigned long size); +static __always_inline void kasan_poison_vmalloc(const void *start, + unsigned long size) +{ + if (kasan_enabled()) + __kasan_poison_vmalloc(start, size); +} #else /* CONFIG_KASAN_VMALLOC */ diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index bf7ab62fbfb94..39d0b32ebf708 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -475,8 +475,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, } } - -void kasan_unpoison_vmalloc(const void *start, unsigned long size) +void __kasan_unpoison_vmalloc(const void *start, unsigned long size) { if 
(!is_vmalloc_or_module_addr(start)) return; @@ -488,7 +487,7 @@ void kasan_unpoison_vmalloc(const void *start, unsigned long size) * Poison the shadow for a vmalloc region. Called as part of the * freeing process at the time the region is freed. */ -void kasan_poison_vmalloc(const void *start, unsigned long size) +void __kasan_poison_vmalloc(const void *start, unsigned long size) { if (!is_vmalloc_or_module_addr(start)) return; From 4aff1dc4fb3a5a3be96b6ad8db8348d3e877d7d0 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:11:04 -0700 Subject: [PATCH 050/113] kasan, vmalloc: reset tags in vmalloc functions In preparation for adding vmalloc support to SW/HW_TAGS KASAN, reset pointer tags in functions that use pointer values in range checks. vread() is a special case here. Despite the untagging of the addr pointer in its prologue, the accesses performed by vread() are checked. Instead of accessing the virtual mappings though addr directly, vread() recovers the physical address via page_address(vmalloc_to_page()) and acceses that. And as page_address() recovers the pointer tag, the accesses get checked. Link: https://lkml.kernel.org/r/046003c5f683cacb0ba18e1079e9688bb3dca943.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index feecd614bb773..ab0a0002bfcae 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -74,7 +74,7 @@ static const bool vmap_allow_huge = false; bool is_vmalloc_addr(const void *x) { - unsigned long addr = (unsigned long)x; + unsigned long addr = (unsigned long)kasan_reset_tag(x); return addr >= VMALLOC_START && addr < VMALLOC_END; } @@ -631,7 +631,7 @@ int is_vmalloc_or_module_addr(const void *x) * just put it in the vmalloc space. 
*/ #if defined(CONFIG_MODULES) && defined(MODULES_VADDR) - unsigned long addr = (unsigned long)x; + unsigned long addr = (unsigned long)kasan_reset_tag(x); if (addr >= MODULES_VADDR && addr < MODULES_END) return 1; #endif @@ -795,6 +795,8 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) struct vmap_area *va = NULL; struct rb_node *n = vmap_area_root.rb_node; + addr = (unsigned long)kasan_reset_tag((void *)addr); + while (n) { struct vmap_area *tmp; @@ -816,6 +818,8 @@ static struct vmap_area *__find_vmap_area(unsigned long addr) { struct rb_node *n = vmap_area_root.rb_node; + addr = (unsigned long)kasan_reset_tag((void *)addr); + while (n) { struct vmap_area *va; @@ -2166,7 +2170,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases); void vm_unmap_ram(const void *mem, unsigned int count) { unsigned long size = (unsigned long)count << PAGE_SHIFT; - unsigned long addr = (unsigned long)mem; + unsigned long addr = (unsigned long)kasan_reset_tag(mem); struct vmap_area *va; might_sleep(); @@ -3424,6 +3428,8 @@ long vread(char *buf, char *addr, unsigned long count) unsigned long buflen = count; unsigned long n; + addr = kasan_reset_tag(addr); + /* Don't allow overflow */ if ((unsigned long) addr + count < count) count = -(unsigned long) addr; From c08e6a1206e6876c66e0528b3ec717f557b07dd4 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:11:07 -0700 Subject: [PATCH 051/113] kasan, fork: reset pointer tags of vmapped stacks Once tag-based KASAN modes start tagging vmalloc() allocations, kernel stacks start getting tagged if CONFIG_VMAP_STACK is enabled. Reset the tag of kernel stack pointers after allocation in alloc_thread_stack_node(). For SW_TAGS KASAN, when CONFIG_KASAN_STACK is enabled, the instrumentation can't handle the SP register being tagged. For HW_TAGS KASAN, there's no instrumentation-related issues. However, the impact of having a tagged SP register needs to be properly evaluated, so keep it non-tagged for now. Note, that the memory for the stack allocation still gets tagged to catch vmalloc-into-stack out-of-bounds accesses. Link: https://lkml.kernel.org/r/c6c96f012371ecd80e1936509ebcd3b07a5956f7.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/fork.c b/kernel/fork.c index 834af51ed3979..ce0ffe48937f0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -326,6 +326,8 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) * so cache the vm_struct. */ tsk->stack_vm_area = vm; + if (stack) + stack = kasan_reset_tag(stack); tsk->stack = stack; return 0; } From 51fb34de2a4c8fa0f221246313700bfe3b6c586d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:11:10 -0700 Subject: [PATCH 052/113] kasan, arm64: reset pointer tags of vmapped stacks Once tag-based KASAN modes start tagging vmalloc() allocations, kernel stacks start getting tagged if CONFIG_VMAP_STACK is enabled. Reset the tag of kernel stack pointers after allocation in arch_alloc_vmap_stack(). For SW_TAGS KASAN, when CONFIG_KASAN_STACK is enabled, the instrumentation can't handle the SP register being tagged. For HW_TAGS KASAN, there's no instrumentation-related issues. 
However, the impact of having a tagged SP register needs to be properly evaluated, so keep it non-tagged for now. Note, that the memory for the stack allocation still gets tagged to catch vmalloc-into-stack out-of-bounds accesses. [andreyknvl@google.com: fix case when a stack is retrieved from cached_stacks] Link: https://lkml.kernel.org/r/f50c5f96ef896d7936192c888b0c0a7674e33184.1644943792.git.andreyknvl@google.com [dan.carpenter@oracle.com: remove unnecessary check in alloc_thread_stack_node()] Link: https://lkml.kernel.org/r/20220301080706.GB17208@kili Link: https://lkml.kernel.org/r/698c5ab21743c796d46c15d075b9481825973e34.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Dan Carpenter Acked-by: Catalin Marinas Acked-by: Marco Elver Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/vmap_stack.h | 5 ++++- kernel/fork.c | 11 ++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/vmap_stack.h b/arch/arm64/include/asm/vmap_stack.h index 894e031b28d28..20873099c035c 100644 --- a/arch/arm64/include/asm/vmap_stack.h +++ b/arch/arm64/include/asm/vmap_stack.h @@ -17,10 +17,13 @@ */ static inline unsigned long *arch_alloc_vmap_stack(size_t stack_size, int node) { + void *p; + BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK)); - return __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node, + p = __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node, __builtin_return_address(0)); + return kasan_reset_tag(p); } #endif /* __ASM_VMAP_STACK_H */ diff --git a/kernel/fork.c b/kernel/fork.c index ce0ffe48937f0..9796897560ab1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -286,11 +286,13 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) if (!s) continue; - /* Mark stack accessible for KASAN. */ + /* Reset stack metadata. */ kasan_unpoison_range(s->addr, THREAD_SIZE); + stack = kasan_reset_tag(s->addr); + /* Clear stale pointers from reused stack. */ - memset(s->addr, 0, THREAD_SIZE); + memset(stack, 0, THREAD_SIZE); if (memcg_charge_kernel_stack(s)) { vfree(s->addr); @@ -298,7 +300,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) } tsk->stack_vm_area = s; - tsk->stack = s->addr; + tsk->stack = stack; return 0; } @@ -326,8 +328,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) * so cache the vm_struct. */ tsk->stack_vm_area = vm; - if (stack) - stack = kasan_reset_tag(stack); + stack = kasan_reset_tag(stack); tsk->stack = stack; return 0; } From 1d96320f8d5320423737da61e6e6937f6b475b5c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:11:13 -0700 Subject: [PATCH 053/113] kasan, vmalloc: add vmalloc tagging for SW_TAGS Add vmalloc tagging support to SW_TAGS KASAN. - __kasan_unpoison_vmalloc() now assigns a random pointer tag, poisons the virtual mapping accordingly, and embeds the tag into the returned pointer. - __get_vm_area_node() (used by vmalloc() and vmap()) and pcpu_get_vm_areas() save the tagged pointer into vm_struct->addr (note: not into vmap_area->addr). This requires putting kasan_unpoison_vmalloc() after setup_vmalloc_vm[_locked](); otherwise the latter will overwrite the tagged pointer. The tagged pointer then is naturally propagateed to vmalloc() and vmap(). 
- vm_map_ram() returns the tagged pointer directly. As a result of this change, vm_struct->addr is now tagged. Enabling KASAN_VMALLOC with SW_TAGS is not yet allowed. Link: https://lkml.kernel.org/r/4a78f3c064ce905e9070c29733aca1dd254a74f1.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 16 ++++++++++------ mm/kasan/shadow.c | 6 ++++-- mm/vmalloc.c | 14 ++++++++------ 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index a6d6fdefb278d..d3efb6cbd0f51 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -403,12 +403,13 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); -void __kasan_unpoison_vmalloc(const void *start, unsigned long size); -static __always_inline void kasan_unpoison_vmalloc(const void *start, - unsigned long size) +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size); +static __always_inline void *kasan_unpoison_vmalloc(const void *start, + unsigned long size) { if (kasan_enabled()) - __kasan_unpoison_vmalloc(start, size); + return __kasan_unpoison_vmalloc(start, size); + return (void *)start; } void __kasan_poison_vmalloc(const void *start, unsigned long size); @@ -433,8 +434,11 @@ static inline void kasan_release_vmalloc(unsigned long start, unsigned long free_region_start, unsigned long free_region_end) { } -static inline void kasan_unpoison_vmalloc(const void *start, unsigned long size) -{ } +static inline void *kasan_unpoison_vmalloc(const void *start, + unsigned long size) +{ + return (void *)start; +} static inline void kasan_poison_vmalloc(const void *start, unsigned long size) { } diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 39d0b32ebf708..5a866f6663fc0 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -475,12 +475,14 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, } } -void __kasan_unpoison_vmalloc(const void *start, unsigned long size) +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size) { if (!is_vmalloc_or_module_addr(start)) - return; + return (void *)start; + start = set_tag(start, kasan_random_tag()); kasan_unpoison(start, size, false); + return (void *)start; } /* diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ab0a0002bfcae..1ab1f1b2f5b73 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2231,7 +2231,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) mem = (void *)addr; } - kasan_unpoison_vmalloc(mem, size); + mem = kasan_unpoison_vmalloc(mem, size); if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, PAGE_SHIFT) < 0) { @@ -2464,10 +2464,10 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return NULL; } - kasan_unpoison_vmalloc((void *)va->va_start, requested_size); - setup_vmalloc_vm(area, va, flags, caller); + area->addr = kasan_unpoison_vmalloc(area->addr, requested_size); + return area; } @@ -3815,9 +3815,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, for (area = 0; area < nr_vms; area++) { if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) goto err_free_shadow; - - kasan_unpoison_vmalloc((void *)vas[area]->va_start, - sizes[area]); } /* 
insert all vm's */ @@ -3830,6 +3827,11 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, } spin_unlock(&vmap_area_lock); + /* mark allocated areas as accessible */ + for (area = 0; area < nr_vms; area++) + vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, + vms[area]->size); + kfree(vas); return vms; From 01d92c7f358ce892279ca830cf6ccf2862a17d1c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:11:16 -0700 Subject: [PATCH 054/113] kasan, vmalloc, arm64: mark vmalloc mappings as pgprot_tagged HW_TAGS KASAN relies on ARM Memory Tagging Extension (MTE). With MTE, a memory region must be mapped as MT_NORMAL_TAGGED to allow setting memory tags via MTE-specific instructions. Add proper protection bits to vmalloc() allocations. These allocations are always backed by page_alloc pages, so the tags will actually be getting set on the corresponding physical memory. Link: https://lkml.kernel.org/r/983fc33542db2f6b1e77b34ca23448d4640bbb9e.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Co-developed-by: Vincenzo Frascino Signed-off-by: Vincenzo Frascino Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/vmalloc.h | 6 ++++++ include/linux/vmalloc.h | 7 +++++++ mm/vmalloc.c | 9 +++++++++ 3 files changed, 22 insertions(+) diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h index b9185503feae2..38fafffe699f7 100644 --- a/arch/arm64/include/asm/vmalloc.h +++ b/arch/arm64/include/asm/vmalloc.h @@ -25,4 +25,10 @@ static inline bool arch_vmap_pmd_supported(pgprot_t prot) #endif +#define arch_vmap_pgprot_tagged arch_vmap_pgprot_tagged +static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot) +{ + return pgprot_tagged(prot); +} + #endif /* _ASM_ARM64_VMALLOC_H */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 2ca95c7db4638..3b1df7da402d6 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -115,6 +115,13 @@ static inline int arch_vmap_pte_supported_shift(unsigned long size) } #endif +#ifndef arch_vmap_pgprot_tagged +static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot) +{ + return prot; +} +#endif + /* * Highlevel APIs for driver use */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1ab1f1b2f5b73..8530d86c3e585 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3128,6 +3128,15 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, goto fail; } + /* + * Modify protection bits to allow tagging. + * This must be done before mapping by __vmalloc_area_node(). + */ + if (kasan_hw_tags_enabled() && + pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) + prot = arch_vmap_pgprot_tagged(prot); + + /* Allocate physical pages and map them into vmalloc space. */ addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!addr) goto fail; From 19f1c3acf8f4431982355b2e8a78e42b884dd788 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:11:20 -0700 Subject: [PATCH 055/113] kasan, vmalloc: unpoison VM_ALLOC pages after mapping Make KASAN unpoison vmalloc mappings after they have been mapped in when it's possible: for vmalloc() (indentified via VM_ALLOC) and vm_map_ram(). The reasons for this are: - For vmalloc() and vm_map_ram(): pages don't get unpoisoned in case mapping them fails. 
- For vmalloc(): HW_TAGS KASAN needs pages to be mapped to set tags via kasan_unpoison_vmalloc(). As a part of these changes, the return value of __vmalloc_node_range() is changed to area->addr. This is a non-functional change, as __vmalloc_area_node() returns area->addr anyway. Link: https://lkml.kernel.org/r/fcb98980e6fcd3c4be6acdcb5d6110898ef28548.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8530d86c3e585..8da8501db942f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2231,14 +2231,15 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) mem = (void *)addr; } - mem = kasan_unpoison_vmalloc(mem, size); - if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, PAGE_SHIFT) < 0) { vm_unmap_ram(mem, count); return NULL; } + /* Mark the pages as accessible, now that they are mapped. */ + mem = kasan_unpoison_vmalloc(mem, size); + return mem; } EXPORT_SYMBOL(vm_map_ram); @@ -2466,7 +2467,14 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, setup_vmalloc_vm(area, va, flags, caller); - area->addr = kasan_unpoison_vmalloc(area->addr, requested_size); + /* + * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a + * best-effort approach, as they can be mapped outside of vmalloc code. + * For VM_ALLOC mappings, the pages are marked as accessible after + * getting mapped in __vmalloc_node_range(). + */ + if (!(flags & VM_ALLOC)) + area->addr = kasan_unpoison_vmalloc(area->addr, requested_size); return area; } @@ -3075,7 +3083,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, const void *caller) { struct vm_struct *area; - void *addr; + void *ret; unsigned long real_size = size; unsigned long real_align = align; unsigned int shift = PAGE_SHIFT; @@ -3137,10 +3145,13 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, prot = arch_vmap_pgprot_tagged(prot); /* Allocate physical pages and map them into vmalloc space. */ - addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); - if (!addr) + ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node); + if (!ret) goto fail; + /* Mark the pages as accessible, now that they are mapped. */ + area->addr = kasan_unpoison_vmalloc(area->addr, real_size); + /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED * flag. It means that vm_struct is not fully initialized. @@ -3152,7 +3163,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!(vm_flags & VM_DEFER_KMEMLEAK)) kmemleak_vmalloc(area, size, gfp_mask); - return addr; + return area->addr; fail: if (shift > PAGE_SHIFT) { @@ -3836,7 +3847,10 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, } spin_unlock(&vmap_area_lock); - /* mark allocated areas as accessible */ + /* + * Mark allocated areas as accessible. Do it now as a best-effort + * approach, as they can be mapped outside of vmalloc code. 
+ */ for (area = 0; area < nr_vms; area++) vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, vms[area]->size); From f49d9c5bb15cc5c470097078ec1f26ff605ae1a2 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:11:23 -0700 Subject: [PATCH 056/113] kasan, mm: only define ___GFP_SKIP_KASAN_POISON with HW_TAGS Only define the ___GFP_SKIP_KASAN_POISON flag when CONFIG_KASAN_HW_TAGS is enabled. This patch it not useful by itself, but it prepares the code for additions of new KASAN-specific GFP patches. Link: https://lkml.kernel.org/r/44e5738a584c11801b2b8f1231898918efc8634a.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 8 +++++++- include/trace/events/mmflags.h | 12 +++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 72dc4ca75cf3a..75abf8d5bc9bc 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -54,7 +54,11 @@ struct vm_area_struct; #define ___GFP_THISNODE 0x200000u #define ___GFP_ACCOUNT 0x400000u #define ___GFP_ZEROTAGS 0x800000u +#ifdef CONFIG_KASAN_HW_TAGS #define ___GFP_SKIP_KASAN_POISON 0x1000000u +#else +#define ___GFP_SKIP_KASAN_POISON 0 +#endif #ifdef CONFIG_LOCKDEP #define ___GFP_NOLOCKDEP 0x2000000u #else @@ -251,7 +255,9 @@ struct vm_area_struct; #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP)) +#define __GFP_BITS_SHIFT (24 + \ + IS_ENABLED(CONFIG_KASAN_HW_TAGS) + \ + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /** diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 116ed4d5d0f88..cb4520374e2c8 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -49,12 +49,18 @@ {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\ - {(unsigned long)__GFP_ZEROTAGS, "__GFP_ZEROTAGS"}, \ - {(unsigned long)__GFP_SKIP_KASAN_POISON,"__GFP_SKIP_KASAN_POISON"}\ + {(unsigned long)__GFP_ZEROTAGS, "__GFP_ZEROTAGS"} \ + +#ifdef CONFIG_KASAN_HW_TAGS +#define __def_gfpflag_names_kasan \ + , {(unsigned long)__GFP_SKIP_KASAN_POISON, "__GFP_SKIP_KASAN_POISON"} +#else +#define __def_gfpflag_names_kasan +#endif #define show_gfp_flags(flags) \ (flags) ? __print_flags(flags, "|", \ - __def_gfpflag_names \ + __def_gfpflag_names __def_gfpflag_names_kasan \ ) : "none" #ifdef CONFIG_MMU From 53ae233c30a623ff44ff2f83854e92530c5d9fc2 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:11:26 -0700 Subject: [PATCH 057/113] kasan, page_alloc: allow skipping unpoisoning for HW_TAGS Add a new GFP flag __GFP_SKIP_KASAN_UNPOISON that allows skipping KASAN poisoning for page_alloc allocations. The flag is only effective with HW_TAGS KASAN. This flag will be used by vmalloc code for page_alloc allocations backing vmalloc() mappings in a following patch. The reason to skip KASAN poisoning for these pages in page_alloc is because vmalloc code will be poisoning them instead. Also reword the comment for __GFP_SKIP_KASAN_POISON. 
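As a rough usage sketch (not taken from this patch; my_unpoison() below is a
stand-in for whatever KASAN handling the caller does itself), a page_alloc
user that manages KASAN state on its own could request pages like this:

	/* Caller takes over KASAN unpoisoning of the returned pages. */
	struct page *page = alloc_pages(GFP_KERNEL | __GFP_SKIP_KASAN_UNPOISON,
					order);

	if (page)
		my_unpoison(page_address(page), PAGE_SIZE << order);

With CONFIG_KASAN_HW_TAGS disabled, the flag expands to 0, so the request
degrades to a plain GFP_KERNEL allocation.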
Link: https://lkml.kernel.org/r/35c97d77a704f6ff971dd3bfe4be95855744108e.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 21 +++++++++++++-------- include/trace/events/mmflags.h | 5 +++-- mm/page_alloc.c | 31 ++++++++++++++++++++++--------- 3 files changed, 38 insertions(+), 19 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 75abf8d5bc9bc..688a39fde43eb 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -55,12 +55,14 @@ struct vm_area_struct; #define ___GFP_ACCOUNT 0x400000u #define ___GFP_ZEROTAGS 0x800000u #ifdef CONFIG_KASAN_HW_TAGS -#define ___GFP_SKIP_KASAN_POISON 0x1000000u +#define ___GFP_SKIP_KASAN_UNPOISON 0x1000000u +#define ___GFP_SKIP_KASAN_POISON 0x2000000u #else +#define ___GFP_SKIP_KASAN_UNPOISON 0 #define ___GFP_SKIP_KASAN_POISON 0 #endif #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x2000000u +#define ___GFP_NOLOCKDEP 0x4000000u #else #define ___GFP_NOLOCKDEP 0 #endif @@ -241,22 +243,25 @@ struct vm_area_struct; * intended for optimization: setting memory tags at the same time as zeroing * memory has minimal additional performace impact. * - * %__GFP_SKIP_KASAN_POISON returns a page which does not need to be poisoned - * on deallocation. Typically used for userspace pages. Currently only has an - * effect in HW tags mode. + * %__GFP_SKIP_KASAN_UNPOISON makes KASAN skip unpoisoning on page allocation. + * Only effective in HW_TAGS mode. + * + * %__GFP_SKIP_KASAN_POISON makes KASAN skip poisoning on page deallocation. + * Typically, used for userspace pages. Only effective in HW_TAGS mode. 
*/ #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) #define __GFP_COMP ((__force gfp_t)___GFP_COMP) #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) #define __GFP_ZEROTAGS ((__force gfp_t)___GFP_ZEROTAGS) -#define __GFP_SKIP_KASAN_POISON ((__force gfp_t)___GFP_SKIP_KASAN_POISON) +#define __GFP_SKIP_KASAN_UNPOISON ((__force gfp_t)___GFP_SKIP_KASAN_UNPOISON) +#define __GFP_SKIP_KASAN_POISON ((__force gfp_t)___GFP_SKIP_KASAN_POISON) /* Disable lockdep for GFP context tracking */ #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (24 + \ - IS_ENABLED(CONFIG_KASAN_HW_TAGS) + \ +#define __GFP_BITS_SHIFT (24 + \ + 2 * IS_ENABLED(CONFIG_KASAN_HW_TAGS) + \ IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index cb4520374e2c8..134c45e62d918 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -52,8 +52,9 @@ {(unsigned long)__GFP_ZEROTAGS, "__GFP_ZEROTAGS"} \ #ifdef CONFIG_KASAN_HW_TAGS -#define __def_gfpflag_names_kasan \ - , {(unsigned long)__GFP_SKIP_KASAN_POISON, "__GFP_SKIP_KASAN_POISON"} +#define __def_gfpflag_names_kasan , \ + {(unsigned long)__GFP_SKIP_KASAN_POISON, "__GFP_SKIP_KASAN_POISON"}, \ + {(unsigned long)__GFP_SKIP_KASAN_UNPOISON, "__GFP_SKIP_KASAN_UNPOISON"} #else #define __def_gfpflag_names_kasan #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 350bb27dc4bb6..59a9408e1e7d4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2343,6 +2343,26 @@ static inline bool check_new_pcp(struct page *page, unsigned int order) } #endif /* CONFIG_DEBUG_VM */ +static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags) +{ + /* Don't skip if a software KASAN mode is enabled. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC) || + IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + return false; + + /* Skip, if hardware tag-based KASAN is not enabled. */ + if (!kasan_hw_tags_enabled()) + return true; + + /* + * With hardware tag-based KASAN enabled, skip if either: + * + * 1. Memory tags have already been cleared via tag_clear_highpage(). + * 2. Skipping has been requested via __GFP_SKIP_KASAN_UNPOISON. + */ + return init_tags || (flags & __GFP_SKIP_KASAN_UNPOISON); +} + inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { @@ -2382,15 +2402,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, /* Note that memory is already initialized by the loop above. */ init = false; } - /* - * If either a software KASAN mode is enabled, or, - * in the case of hardware tag-based KASAN, - * if memory tags have not been cleared via tag_clear_highpage(). - */ - if (IS_ENABLED(CONFIG_KASAN_GENERIC) || - IS_ENABLED(CONFIG_KASAN_SW_TAGS) || - kasan_hw_tags_enabled() && !init_tags) { - /* Mark shadow memory or set memory tags. */ + if (!should_skip_kasan_unpoison(gfp_flags, init_tags)) { + /* Unpoison shadow memory or set memory tags. */ kasan_unpoison_pages(page, order, init); /* Note that memory is already initialized by KASAN. */ From 9353ffa6e9e90d2b6348209cf2b95a8ffee18711 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:11:29 -0700 Subject: [PATCH 058/113] kasan, page_alloc: allow skipping memory init for HW_TAGS Add a new GFP flag __GFP_SKIP_ZERO that allows to skip memory initialization. The flag is only effective with HW_TAGS KASAN. 
This flag will be used by vmalloc code for page_alloc allocations backing vmalloc() mappings in a following patch. The reason to skip memory initialization for these pages in page_alloc is because vmalloc code will be initializing them instead. With the current implementation, when __GFP_SKIP_ZERO is provided, __GFP_ZEROTAGS is ignored. This doesn't matter, as these two flags are never provided at the same time. However, if this is changed in the future, this particular implementation detail can be changed as well. Link: https://lkml.kernel.org/r/0d53efeff345de7d708e0baa0d8829167772521e.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 18 +++++++++++------- include/trace/events/mmflags.h | 1 + mm/page_alloc.c | 13 ++++++++++++- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 688a39fde43eb..0fa17fb85de5d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -55,14 +55,16 @@ struct vm_area_struct; #define ___GFP_ACCOUNT 0x400000u #define ___GFP_ZEROTAGS 0x800000u #ifdef CONFIG_KASAN_HW_TAGS -#define ___GFP_SKIP_KASAN_UNPOISON 0x1000000u -#define ___GFP_SKIP_KASAN_POISON 0x2000000u +#define ___GFP_SKIP_ZERO 0x1000000u +#define ___GFP_SKIP_KASAN_UNPOISON 0x2000000u +#define ___GFP_SKIP_KASAN_POISON 0x4000000u #else +#define ___GFP_SKIP_ZERO 0 #define ___GFP_SKIP_KASAN_UNPOISON 0 #define ___GFP_SKIP_KASAN_POISON 0 #endif #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x4000000u +#define ___GFP_NOLOCKDEP 0x8000000u #else #define ___GFP_NOLOCKDEP 0 #endif @@ -239,9 +241,10 @@ struct vm_area_struct; * %__GFP_ZERO returns a zeroed page on success. * * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself - * is being zeroed (either via __GFP_ZERO or via init_on_alloc). This flag is - * intended for optimization: setting memory tags at the same time as zeroing - * memory has minimal additional performace impact. + * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that + * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting + * memory tags at the same time as zeroing memory has minimal additional + * performace impact. * * %__GFP_SKIP_KASAN_UNPOISON makes KASAN skip unpoisoning on page allocation. * Only effective in HW_TAGS mode. 
@@ -253,6 +256,7 @@ struct vm_area_struct; #define __GFP_COMP ((__force gfp_t)___GFP_COMP) #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) #define __GFP_ZEROTAGS ((__force gfp_t)___GFP_ZEROTAGS) +#define __GFP_SKIP_ZERO ((__force gfp_t)___GFP_SKIP_ZERO) #define __GFP_SKIP_KASAN_UNPOISON ((__force gfp_t)___GFP_SKIP_KASAN_UNPOISON) #define __GFP_SKIP_KASAN_POISON ((__force gfp_t)___GFP_SKIP_KASAN_POISON) @@ -261,7 +265,7 @@ struct vm_area_struct; /* Room for N __GFP_FOO bits */ #define __GFP_BITS_SHIFT (24 + \ - 2 * IS_ENABLED(CONFIG_KASAN_HW_TAGS) + \ + 3 * IS_ENABLED(CONFIG_KASAN_HW_TAGS) + \ IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 134c45e62d918..6532119a6bf1a 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -53,6 +53,7 @@ #ifdef CONFIG_KASAN_HW_TAGS #define __def_gfpflag_names_kasan , \ + {(unsigned long)__GFP_SKIP_ZERO, "__GFP_SKIP_ZERO"}, \ {(unsigned long)__GFP_SKIP_KASAN_POISON, "__GFP_SKIP_KASAN_POISON"}, \ {(unsigned long)__GFP_SKIP_KASAN_UNPOISON, "__GFP_SKIP_KASAN_UNPOISON"} #else diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 59a9408e1e7d4..bdc8f60ae4622 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2363,10 +2363,21 @@ static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags) return init_tags || (flags & __GFP_SKIP_KASAN_UNPOISON); } +static inline bool should_skip_init(gfp_t flags) +{ + /* Don't skip, if hardware tag-based KASAN is not enabled. */ + if (!kasan_hw_tags_enabled()) + return false; + + /* For hardware tag-based KASAN, skip if requested. */ + return (flags & __GFP_SKIP_ZERO); +} + inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { - bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); + bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && + !should_skip_init(gfp_flags); bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); set_page_private(page, 0); From 23689e91fb22c15b84ac6c22ad9942039792f3af Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:11:32 -0700 Subject: [PATCH 059/113] kasan, vmalloc: add vmalloc tagging for HW_TAGS Add vmalloc tagging support to HW_TAGS KASAN. The key difference between HW_TAGS and the other two KASAN modes when it comes to vmalloc: HW_TAGS KASAN can only assign tags to physical memory. The other two modes have shadow memory covering every mapped virtual memory region. Make __kasan_unpoison_vmalloc() for HW_TAGS KASAN: - Skip non-VM_ALLOC mappings as HW_TAGS KASAN can only tag a single mapping of normal physical memory; see the comment in the function. - Generate a random tag, tag the returned pointer and the allocation, and initialize the allocation at the same time. - Propagate the tag into the page stucts to allow accesses through page_address(vmalloc_to_page()). The rest of vmalloc-related KASAN hooks are not needed: - The shadow-related ones are fully skipped. - __kasan_poison_vmalloc() is kept as a no-op with a comment. Poisoning and zeroing of physical pages that are backing vmalloc() allocations are skipped via __GFP_SKIP_KASAN_UNPOISON and __GFP_SKIP_ZERO: __kasan_unpoison_vmalloc() does that instead. Enabling CONFIG_KASAN_VMALLOC with HW_TAGS is not yet allowed. 
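As a rough sketch of the intended behaviour (assuming CONFIG_KASAN_VMALLOC is
enabled together with HW_TAGS, which a later patch in this series allows),
both the vmalloc() pointer and the linear-map alias of its backing pages end
up carrying the same non-default tag:

	void *v = vmalloc(PAGE_SIZE);		/* top byte holds a random tag */
	struct page *pg = vmalloc_to_page(v);	/* page struct carries the same tag */
	char *lin = page_address(pg);

	*(char *)v = 0;		/* checked access through the vmalloc mapping */
	*lin = 0;		/* also valid, thanks to the propagated page tag */

	vfree(v);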
Link: https://lkml.kernel.org/r/d19b2e9e59a9abc59d05b72dea8429dcaea739c6.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Co-developed-by: Vincenzo Frascino Signed-off-by: Vincenzo Frascino Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 36 +++++++++++++++-- kernel/scs.c | 4 +- mm/kasan/hw_tags.c | 92 +++++++++++++++++++++++++++++++++++++++++++ mm/kasan/shadow.c | 10 ++++- mm/vmalloc.c | 51 ++++++++++++++++++------ 5 files changed, 175 insertions(+), 18 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d3efb6cbd0f51..65fc5285bb59c 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -26,6 +26,12 @@ struct kunit_kasan_expectation { #endif +typedef unsigned int __bitwise kasan_vmalloc_flags_t; + +#define KASAN_VMALLOC_NONE 0x00u +#define KASAN_VMALLOC_INIT 0x01u +#define KASAN_VMALLOC_VM_ALLOC 0x02u + #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #include @@ -397,18 +403,39 @@ static inline void kasan_init_hw_tags(void) { } #ifdef CONFIG_KASAN_VMALLOC +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); int kasan_populate_vmalloc(unsigned long addr, unsigned long size); void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); -void *__kasan_unpoison_vmalloc(const void *start, unsigned long size); +#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + +static inline void kasan_populate_early_vm_area_shadow(void *start, + unsigned long size) +{ } +static inline int kasan_populate_vmalloc(unsigned long start, + unsigned long size) +{ + return 0; +} +static inline void kasan_release_vmalloc(unsigned long start, + unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end) { } + +#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, + kasan_vmalloc_flags_t flags); static __always_inline void *kasan_unpoison_vmalloc(const void *start, - unsigned long size) + unsigned long size, + kasan_vmalloc_flags_t flags) { if (kasan_enabled()) - return __kasan_unpoison_vmalloc(start, size); + return __kasan_unpoison_vmalloc(start, size, flags); return (void *)start; } @@ -435,7 +462,8 @@ static inline void kasan_release_vmalloc(unsigned long start, unsigned long free_region_end) { } static inline void *kasan_unpoison_vmalloc(const void *start, - unsigned long size) + unsigned long size, + kasan_vmalloc_flags_t flags) { return (void *)start; } diff --git a/kernel/scs.c b/kernel/scs.c index 579841be88646..b83bc9251f996 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -32,7 +32,7 @@ static void *__scs_alloc(int node) for (i = 0; i < NR_CACHED_SCS; i++) { s = this_cpu_xchg(scs_cache[i], NULL); if (s) { - kasan_unpoison_vmalloc(s, SCS_SIZE); + kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_NONE); memset(s, 0, SCS_SIZE); return s; } @@ -78,7 +78,7 @@ void scs_free(void *s) if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL) return; - kasan_unpoison_vmalloc(s, SCS_SIZE); + kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_NONE); vfree_atomic(s); } diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 76cf2b6229c79..21104fd518727 100644 --- 
a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -192,6 +192,98 @@ void __init kasan_init_hw_tags(void) kasan_stack_collection_enabled() ? "on" : "off"); } +#ifdef CONFIG_KASAN_VMALLOC + +static void unpoison_vmalloc_pages(const void *addr, u8 tag) +{ + struct vm_struct *area; + int i; + + /* + * As hardware tag-based KASAN only tags VM_ALLOC vmalloc allocations + * (see the comment in __kasan_unpoison_vmalloc), all of the pages + * should belong to a single area. + */ + area = find_vm_area((void *)addr); + if (WARN_ON(!area)) + return; + + for (i = 0; i < area->nr_pages; i++) { + struct page *page = area->pages[i]; + + page_kasan_tag_set(page, tag); + } +} + +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, + kasan_vmalloc_flags_t flags) +{ + u8 tag; + unsigned long redzone_start, redzone_size; + + if (!is_vmalloc_or_module_addr(start)) + return (void *)start; + + /* + * Skip unpoisoning and assigning a pointer tag for non-VM_ALLOC + * mappings as: + * + * 1. Unlike the software KASAN modes, hardware tag-based KASAN only + * supports tagging physical memory. Therefore, it can only tag a + * single mapping of normal physical pages. + * 2. Hardware tag-based KASAN can only tag memory mapped with special + * mapping protection bits, see arch_vmalloc_pgprot_modify(). + * As non-VM_ALLOC mappings can be mapped outside of vmalloc code, + * providing these bits would require tracking all non-VM_ALLOC + * mappers. + * + * Thus, for VM_ALLOC mappings, hardware tag-based KASAN only tags + * the first virtual mapping, which is created by vmalloc(). + * Tagging the page_alloc memory backing that vmalloc() allocation is + * skipped, see ___GFP_SKIP_KASAN_UNPOISON. + * + * For non-VM_ALLOC allocations, page_alloc memory is tagged as usual. + */ + if (!(flags & KASAN_VMALLOC_VM_ALLOC)) + return (void *)start; + + tag = kasan_random_tag(); + start = set_tag(start, tag); + + /* Unpoison and initialize memory up to size. */ + kasan_unpoison(start, size, flags & KASAN_VMALLOC_INIT); + + /* + * Explicitly poison and initialize the in-page vmalloc() redzone. + * Unlike software KASAN modes, hardware tag-based KASAN doesn't + * unpoison memory when populating shadow for vmalloc() space. + */ + redzone_start = round_up((unsigned long)start + size, + KASAN_GRANULE_SIZE); + redzone_size = round_up(redzone_start, PAGE_SIZE) - redzone_start; + kasan_poison((void *)redzone_start, redzone_size, KASAN_TAG_INVALID, + flags & KASAN_VMALLOC_INIT); + + /* + * Set per-page tag flags to allow accessing physical memory for the + * vmalloc() mapping through page_address(vmalloc_to_page()). + */ + unpoison_vmalloc_pages(start, tag); + + return (void *)start; +} + +void __kasan_poison_vmalloc(const void *start, unsigned long size) +{ + /* + * No tagging here. + * The physical pages backing the vmalloc() allocation are poisoned + * through the usual page_alloc paths. + */ +} + +#endif + #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_enable_tagging_sync(void) diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 5a866f6663fc0..b958babc8feda 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -475,8 +475,16 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, } } -void *__kasan_unpoison_vmalloc(const void *start, unsigned long size) +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, + kasan_vmalloc_flags_t flags) { + /* + * Software KASAN modes unpoison both VM_ALLOC and non-VM_ALLOC + * mappings, so the KASAN_VMALLOC_VM_ALLOC flag is ignored. 
+ * Software KASAN modes can't optimize zeroing memory by combining it + * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored. + */ + if (!is_vmalloc_or_module_addr(start)) return (void *)start; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8da8501db942f..185ab3e27d130 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2237,8 +2237,12 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) return NULL; } - /* Mark the pages as accessible, now that they are mapped. */ - mem = kasan_unpoison_vmalloc(mem, size); + /* + * Mark the pages as accessible, now that they are mapped. + * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). + */ + mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_NONE); return mem; } @@ -2472,9 +2476,12 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, * best-effort approach, as they can be mapped outside of vmalloc code. * For VM_ALLOC mappings, the pages are marked as accessible after * getting mapped in __vmalloc_node_range(). + * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ if (!(flags & VM_ALLOC)) - area->addr = kasan_unpoison_vmalloc(area->addr, requested_size); + area->addr = kasan_unpoison_vmalloc(area->addr, requested_size, + KASAN_VMALLOC_NONE); return area; } @@ -3084,6 +3091,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, { struct vm_struct *area; void *ret; + kasan_vmalloc_flags_t kasan_flags; unsigned long real_size = size; unsigned long real_align = align; unsigned int shift = PAGE_SHIFT; @@ -3136,21 +3144,39 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, goto fail; } - /* - * Modify protection bits to allow tagging. - * This must be done before mapping by __vmalloc_area_node(). - */ + /* Prepare arguments for __vmalloc_area_node(). */ if (kasan_hw_tags_enabled() && - pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) + pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { + /* + * Modify protection bits to allow tagging. + * This must be done before mapping in __vmalloc_area_node(). + */ prot = arch_vmap_pgprot_tagged(prot); + /* + * Skip page_alloc poisoning and zeroing for physical pages + * backing VM_ALLOC mapping. Memory is instead poisoned and + * zeroed by kasan_unpoison_vmalloc(). + */ + gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO; + } + /* Allocate physical pages and map them into vmalloc space. */ ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!ret) goto fail; - /* Mark the pages as accessible, now that they are mapped. */ - area->addr = kasan_unpoison_vmalloc(area->addr, real_size); + /* + * Mark the pages as accessible, now that they are mapped. + * The init condition should match the one in post_alloc_hook() + * (except for the should_skip_init() check) to make sure that memory + * is initialized under the same conditions regardless of the enabled + * KASAN mode. + */ + kasan_flags = KASAN_VMALLOC_VM_ALLOC; + if (!want_init_on_free() && want_init_on_alloc(gfp_mask)) + kasan_flags |= KASAN_VMALLOC_INIT; + area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags); /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED @@ -3850,10 +3876,13 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, /* * Mark allocated areas as accessible. Do it now as a best-effort * approach, as they can be mapped outside of vmalloc code. 
+ * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ for (area = 0; area < nr_vms; area++) vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, - vms[area]->size); + vms[area]->size, + KASAN_VMALLOC_NONE); kfree(vas); return vms; From f6e39794f4b6da7ca9b77f2f9ad11fd6f0ac83e5 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:11:35 -0700 Subject: [PATCH 060/113] kasan, vmalloc: only tag normal vmalloc allocations The kernel can use to allocate executable memory. The only supported way to do that is via __vmalloc_node_range() with the executable bit set in the prot argument. (vmap() resets the bit via pgprot_nx()). Once tag-based KASAN modes start tagging vmalloc allocations, executing code from such allocations will lead to the PC register getting a tag, which is not tolerated by the kernel. Only tag the allocations for normal kernel pages. [andreyknvl@google.com: pass KASAN_VMALLOC_PROT_NORMAL to kasan_unpoison_vmalloc()] Link: https://lkml.kernel.org/r/9230ca3d3e40ffca041c133a524191fd71969a8d.1646233925.git.andreyknvl@google.com [andreyknvl@google.com: support tagged vmalloc mappings] Link: https://lkml.kernel.org/r/2f6605e3a358cf64d73a05710cb3da356886ad29.1646233925.git.andreyknvl@google.com [andreyknvl@google.com: don't unintentionally disabled poisoning] Link: https://lkml.kernel.org/r/de4587d6a719232e83c760113e46ed2d4d8da61e.1646757322.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/fbfd9939a4dc375923c9a5c6b9e7ab05c26b8c6b.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 7 ++++--- kernel/scs.c | 12 +++++++---- mm/kasan/hw_tags.c | 7 +++++++ mm/kasan/shadow.c | 8 +++++++ mm/vmalloc.c | 49 +++++++++++++++++++++++++------------------ 5 files changed, 56 insertions(+), 27 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 65fc5285bb59c..903b9453931dc 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -28,9 +28,10 @@ struct kunit_kasan_expectation { typedef unsigned int __bitwise kasan_vmalloc_flags_t; -#define KASAN_VMALLOC_NONE 0x00u -#define KASAN_VMALLOC_INIT 0x01u -#define KASAN_VMALLOC_VM_ALLOC 0x02u +#define KASAN_VMALLOC_NONE 0x00u +#define KASAN_VMALLOC_INIT 0x01u +#define KASAN_VMALLOC_VM_ALLOC 0x02u +#define KASAN_VMALLOC_PROT_NORMAL 0x04u #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) diff --git a/kernel/scs.c b/kernel/scs.c index b83bc9251f996..b7e1b096d9060 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -32,15 +32,19 @@ static void *__scs_alloc(int node) for (i = 0; i < NR_CACHED_SCS; i++) { s = this_cpu_xchg(scs_cache[i], NULL); if (s) { - kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_NONE); + s = kasan_unpoison_vmalloc(s, SCS_SIZE, + KASAN_VMALLOC_PROT_NORMAL); memset(s, 0, SCS_SIZE); - return s; + goto out; } } - return __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END, + s = __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END, GFP_SCS, PAGE_KERNEL, 0, node, __builtin_return_address(0)); + +out: + return kasan_reset_tag(s); } void *scs_alloc(int node) @@ -78,7 +82,7 @@ void scs_free(void *s) if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL) return; - kasan_unpoison_vmalloc(s, 
SCS_SIZE, KASAN_VMALLOC_NONE); + kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_PROT_NORMAL); vfree_atomic(s); } diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 21104fd518727..2e9378a4f07f1 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -247,6 +247,13 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, if (!(flags & KASAN_VMALLOC_VM_ALLOC)) return (void *)start; + /* + * Don't tag executable memory. + * The kernel doesn't tolerate having the PC register tagged. + */ + if (!(flags & KASAN_VMALLOC_PROT_NORMAL)) + return (void *)start; + tag = kasan_random_tag(); start = set_tag(start, tag); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index b958babc8feda..a4f07de217711 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -488,6 +488,14 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, if (!is_vmalloc_or_module_addr(start)) return (void *)start; + /* + * Don't tag executable memory with the tag-based mode. + * The kernel doesn't tolerate having the PC register tagged. + */ + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS) && + !(flags & KASAN_VMALLOC_PROT_NORMAL)) + return (void *)start; + start = set_tag(start, kasan_random_tag()); kasan_unpoison(start, size, false); return (void *)start; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 185ab3e27d130..e163372d39679 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2242,7 +2242,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ - mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_NONE); + mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL); return mem; } @@ -2481,7 +2481,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, */ if (!(flags & VM_ALLOC)) area->addr = kasan_unpoison_vmalloc(area->addr, requested_size, - KASAN_VMALLOC_NONE); + KASAN_VMALLOC_PROT_NORMAL); return area; } @@ -3091,7 +3091,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, { struct vm_struct *area; void *ret; - kasan_vmalloc_flags_t kasan_flags; + kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE; unsigned long real_size = size; unsigned long real_align = align; unsigned int shift = PAGE_SHIFT; @@ -3144,21 +3144,28 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, goto fail; } - /* Prepare arguments for __vmalloc_area_node(). */ - if (kasan_hw_tags_enabled() && - pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { - /* - * Modify protection bits to allow tagging. - * This must be done before mapping in __vmalloc_area_node(). - */ - prot = arch_vmap_pgprot_tagged(prot); + /* + * Prepare arguments for __vmalloc_area_node() and + * kasan_unpoison_vmalloc(). + */ + if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { + if (kasan_hw_tags_enabled()) { + /* + * Modify protection bits to allow tagging. + * This must be done before mapping. + */ + prot = arch_vmap_pgprot_tagged(prot); - /* - * Skip page_alloc poisoning and zeroing for physical pages - * backing VM_ALLOC mapping. Memory is instead poisoned and - * zeroed by kasan_unpoison_vmalloc(). - */ - gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO; + /* + * Skip page_alloc poisoning and zeroing for physical + * pages backing VM_ALLOC mapping. Memory is instead + * poisoned and zeroed by kasan_unpoison_vmalloc(). + */ + gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO; + } + + /* Take note that the mapping is PAGE_KERNEL. 
*/ + kasan_flags |= KASAN_VMALLOC_PROT_NORMAL; } /* Allocate physical pages and map them into vmalloc space. */ @@ -3172,10 +3179,13 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, * (except for the should_skip_init() check) to make sure that memory * is initialized under the same conditions regardless of the enabled * KASAN mode. + * Tag-based KASAN modes only assign tags to normal non-executable + * allocations, see __kasan_unpoison_vmalloc(). */ - kasan_flags = KASAN_VMALLOC_VM_ALLOC; + kasan_flags |= KASAN_VMALLOC_VM_ALLOC; if (!want_init_on_free() && want_init_on_alloc(gfp_mask)) kasan_flags |= KASAN_VMALLOC_INIT; + /* KASAN_VMALLOC_PROT_NORMAL already set if required. */ area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags); /* @@ -3881,8 +3891,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, */ for (area = 0; area < nr_vms; area++) vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, - vms[area]->size, - KASAN_VMALLOC_NONE); + vms[area]->size, KASAN_VMALLOC_PROT_NORMAL); kfree(vas); return vms; From 36c4a73bf8d24721222d9ee4080ac955973fd388 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:11:38 -0700 Subject: [PATCH 061/113] kasan, arm64: don't tag executable vmalloc allocations Besides asking vmalloc memory to be executable via the prot argument of __vmalloc_node_range() (see the previous patch), the kernel can skip that bit and instead mark memory as executable via set_memory_x(). Once tag-based KASAN modes start tagging vmalloc allocations, executing code from such allocations will lead to the PC register getting a tag, which is not tolerated by the kernel. Generic kernel code typically allocates memory via module_alloc() if it intends to mark memory as executable. (On arm64 module_alloc() uses __vmalloc_node_range() without setting the executable bit). Thus, reset pointer tags of pointers returned from module_alloc(). However, on arm64 there's an exception: the eBPF subsystem. Instead of using module_alloc(), it uses vmalloc() (via bpf_jit_alloc_exec()) to allocate its JIT region. Thus, reset pointer tags of pointers returned from bpf_jit_alloc_exec(). Resetting tags for these pointers results in untagged pointers being passed to set_memory_x(). This causes conflicts in arithmetic checks in change_memory_common(), as vm_struct->addr pointer returned by find_vm_area() is tagged. Reset pointer tag of find_vm_area(addr)->addr in change_memory_common(). Link: https://lkml.kernel.org/r/b7b2595423340cd7d76b770e5d519acf3b72f0ab.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Catalin Marinas Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/kernel/module.c | 3 ++- arch/arm64/mm/pageattr.c | 2 +- arch/arm64/net/bpf_jit_comp.c | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index d3a1fa8183487..f2d4bb14bfabe 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -63,7 +63,8 @@ void *module_alloc(unsigned long size) return NULL; } - return p; + /* Memory is intended to be executable, reset the pointer tag. 
*/ + return kasan_reset_tag(p); } enum aarch64_reloc_op { diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index a3bacd79507a4..64e985eaa52d8 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -85,7 +85,7 @@ static int change_memory_common(unsigned long addr, int numpages, */ area = find_vm_area((void *)addr); if (!area || - end > (unsigned long)area->addr + area->size || + end > (unsigned long)kasan_reset_tag(area->addr) + area->size || !(area->flags & VM_ALLOC)) return -EINVAL; diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index e850c69e128c1..fcc675aa1670d 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1304,7 +1304,8 @@ u64 bpf_jit_alloc_exec_limit(void) void *bpf_jit_alloc_exec(unsigned long size) { - return vmalloc(size); + /* Memory is intended to be executable, reset the pointer tag. */ + return kasan_reset_tag(vmalloc(size)); } void bpf_jit_free_exec(void *addr) From 1eeac51e6201c1c0980fcdae73dbbe34811f8503 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:11:41 -0700 Subject: [PATCH 062/113] kasan: mark kasan_arg_stacktrace as __initdata As kasan_arg_stacktrace is only used in __init functions, mark it as __initdata instead of __ro_after_init to allow it be freed after boot. The other enums for KASAN args are used in kasan_init_hw_tags_cpu(), which is not marked as __init as a CPU can be hot-plugged after boot. Clarify this in a comment. Link: https://lkml.kernel.org/r/7fa090865614f8e0c6c1265508efb1d429afaa50.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Suggested-by: Marco Elver Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/hw_tags.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 2e9378a4f07f1..6509809dd5d8c 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -40,7 +40,7 @@ enum kasan_arg_stacktrace { static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; -static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init; +static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; /* Whether KASAN is enabled at all. */ DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); @@ -116,7 +116,10 @@ static inline const char *kasan_mode_info(void) return "sync"; } -/* kasan_init_hw_tags_cpu() is called for each CPU. */ +/* + * kasan_init_hw_tags_cpu() is called for each CPU. + * Not marked as __init as a CPU can be hot-plugged after boot. + */ void kasan_init_hw_tags_cpu(void) { /* From 241944d1628e77f578d309ec57528b01cdef78d3 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:11:44 -0700 Subject: [PATCH 063/113] kasan: clean up feature flags for HW_TAGS mode - Untie kasan_init_hw_tags() code from the default values of kasan_arg_mode and kasan_arg_stacktrace. - Move static_branch_enable(&kasan_flag_enabled) to the end of kasan_init_hw_tags_cpu(). - Remove excessive comments in kasan_arg_mode switch. - Add new comments. 
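In other words, after this change the defaults are carried by the definitions
themselves rather than by the init code, roughly:

	/* Defaults when no kasan.mode= / kasan.stacktrace= argument is given. */
	enum kasan_mode kasan_mode __ro_after_init;	/* zero-initialized: KASAN_MODE_SYNC */
	DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace);	/* stack trace collection on */

and the switch statements in kasan_init_hw_tags() only act when a boot
argument was actually supplied.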
Link: https://lkml.kernel.org/r/76ebb340265be57a218564a497e1f52ff36a3879.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/hw_tags.c | 38 +++++++++++++++++++++----------------- mm/kasan/kasan.h | 2 +- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 6509809dd5d8c..6a3146d1ccc55 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -42,16 +42,22 @@ static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; -/* Whether KASAN is enabled at all. */ +/* + * Whether KASAN is enabled at all. + * The value remains false until KASAN is initialized by kasan_init_hw_tags(). + */ DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); EXPORT_SYMBOL(kasan_flag_enabled); -/* Whether the selected mode is synchronous/asynchronous/asymmetric.*/ +/* + * Whether the selected mode is synchronous, asynchronous, or asymmetric. + * Defaults to KASAN_MODE_SYNC. + */ enum kasan_mode kasan_mode __ro_after_init; EXPORT_SYMBOL_GPL(kasan_mode); /* Whether to collect alloc/free stack traces. */ -DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace); +DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace); /* kasan=off/on */ static int __init early_kasan_flag(char *arg) @@ -127,7 +133,11 @@ void kasan_init_hw_tags_cpu(void) * as this function is only called for MTE-capable hardware. */ - /* If KASAN is disabled via command line, don't initialize it. */ + /* + * If KASAN is disabled via command line, don't initialize it. + * When this function is called, kasan_flag_enabled is not yet + * set by kasan_init_hw_tags(). Thus, check kasan_arg instead. + */ if (kasan_arg == KASAN_ARG_OFF) return; @@ -154,42 +164,36 @@ void __init kasan_init_hw_tags(void) if (kasan_arg == KASAN_ARG_OFF) return; - /* Enable KASAN. */ - static_branch_enable(&kasan_flag_enabled); - switch (kasan_arg_mode) { case KASAN_ARG_MODE_DEFAULT: - /* - * Default to sync mode. - */ - fallthrough; + /* Default is specified by kasan_mode definition. */ + break; case KASAN_ARG_MODE_SYNC: - /* Sync mode enabled. */ kasan_mode = KASAN_MODE_SYNC; break; case KASAN_ARG_MODE_ASYNC: - /* Async mode enabled. */ kasan_mode = KASAN_MODE_ASYNC; break; case KASAN_ARG_MODE_ASYMM: - /* Asymm mode enabled. */ kasan_mode = KASAN_MODE_ASYMM; break; } switch (kasan_arg_stacktrace) { case KASAN_ARG_STACKTRACE_DEFAULT: - /* Default to enabling stack trace collection. */ - static_branch_enable(&kasan_flag_stacktrace); + /* Default is specified by kasan_flag_stacktrace definition. */ break; case KASAN_ARG_STACKTRACE_OFF: - /* Do nothing, kasan_flag_stacktrace keeps its default value. */ + static_branch_disable(&kasan_flag_stacktrace); break; case KASAN_ARG_STACKTRACE_ON: static_branch_enable(&kasan_flag_stacktrace); break; } + /* KASAN is now initialized, enable it. */ + static_branch_enable(&kasan_flag_enabled); + pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, stacktrace=%s)\n", kasan_mode_info(), kasan_stack_collection_enabled() ? 
"on" : "off"); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 020f3e57a03f5..efda13a9ce6ad 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -12,7 +12,7 @@ #include #include "../slab.h" -DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace); +DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace); enum kasan_mode { KASAN_MODE_SYNC, From 551b2bcb7e7a9219f176a5abc56c6d981d8068cf Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:11:47 -0700 Subject: [PATCH 064/113] kasan: add kasan.vmalloc command line flag Allow disabling vmalloc() tagging for HW_TAGS KASAN via a kasan.vmalloc command line switch. This is a fail-safe switch intended for production systems that enable HW_TAGS KASAN. In case vmalloc() tagging ends up having an issue not detected during testing but that manifests in production, kasan.vmalloc allows to turn vmalloc() tagging off while leaving page_alloc/slab tagging on. Link: https://lkml.kernel.org/r/904f6d4dfa94870cc5fc2660809e093fd0d27c3b.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/hw_tags.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- mm/kasan/kasan.h | 6 ++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 6a3146d1ccc55..fad1887e54c05 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -32,6 +32,12 @@ enum kasan_arg_mode { KASAN_ARG_MODE_ASYMM, }; +enum kasan_arg_vmalloc { + KASAN_ARG_VMALLOC_DEFAULT, + KASAN_ARG_VMALLOC_OFF, + KASAN_ARG_VMALLOC_ON, +}; + enum kasan_arg_stacktrace { KASAN_ARG_STACKTRACE_DEFAULT, KASAN_ARG_STACKTRACE_OFF, @@ -40,6 +46,7 @@ enum kasan_arg_stacktrace { static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; +static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata; static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; /* @@ -56,6 +63,9 @@ EXPORT_SYMBOL(kasan_flag_enabled); enum kasan_mode kasan_mode __ro_after_init; EXPORT_SYMBOL_GPL(kasan_mode); +/* Whether to enable vmalloc tagging. */ +DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc); + /* Whether to collect alloc/free stack traces. */ DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace); @@ -95,6 +105,23 @@ static int __init early_kasan_mode(char *arg) } early_param("kasan.mode", early_kasan_mode); +/* kasan.vmalloc=off/on */ +static int __init early_kasan_flag_vmalloc(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "off")) + kasan_arg_vmalloc = KASAN_ARG_VMALLOC_OFF; + else if (!strcmp(arg, "on")) + kasan_arg_vmalloc = KASAN_ARG_VMALLOC_ON; + else + return -EINVAL; + + return 0; +} +early_param("kasan.vmalloc", early_kasan_flag_vmalloc); + /* kasan.stacktrace=off/on */ static int __init early_kasan_flag_stacktrace(char *arg) { @@ -179,6 +206,18 @@ void __init kasan_init_hw_tags(void) break; } + switch (kasan_arg_vmalloc) { + case KASAN_ARG_VMALLOC_DEFAULT: + /* Default is specified by kasan_flag_vmalloc definition. */ + break; + case KASAN_ARG_VMALLOC_OFF: + static_branch_disable(&kasan_flag_vmalloc); + break; + case KASAN_ARG_VMALLOC_ON: + static_branch_enable(&kasan_flag_vmalloc); + break; + } + switch (kasan_arg_stacktrace) { case KASAN_ARG_STACKTRACE_DEFAULT: /* Default is specified by kasan_flag_stacktrace definition. 
*/ @@ -194,8 +233,9 @@ void __init kasan_init_hw_tags(void) /* KASAN is now initialized, enable it. */ static_branch_enable(&kasan_flag_enabled); - pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, stacktrace=%s)\n", + pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n", kasan_mode_info(), + kasan_vmalloc_enabled() ? "on" : "off", kasan_stack_collection_enabled() ? "on" : "off"); } @@ -228,6 +268,9 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, u8 tag; unsigned long redzone_start, redzone_size; + if (!kasan_vmalloc_enabled()) + return (void *)start; + if (!is_vmalloc_or_module_addr(start)) return (void *)start; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index efda13a9ce6ad..4d67408e84076 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -12,6 +12,7 @@ #include #include "../slab.h" +DECLARE_STATIC_KEY_TRUE(kasan_flag_vmalloc); DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace); enum kasan_mode { @@ -22,6 +23,11 @@ enum kasan_mode { extern enum kasan_mode kasan_mode __ro_after_init; +static inline bool kasan_vmalloc_enabled(void) +{ + return static_branch_likely(&kasan_flag_vmalloc); +} + static inline bool kasan_stack_collection_enabled(void) { return static_branch_unlikely(&kasan_flag_stacktrace); From fbefb423f873a8713a2ad42b5dcb4585b3e58e01 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:11:50 -0700 Subject: [PATCH 065/113] kasan: allow enabling KASAN_VMALLOC and SW/HW_TAGS Allow enabling CONFIG_KASAN_VMALLOC with SW_TAGS and HW_TAGS KASAN modes. Also adjust CONFIG_KASAN_VMALLOC description: - Mention HW_TAGS support. - Remove unneeded internal details: they have no place in Kconfig description and are already explained in the documentation. Link: https://lkml.kernel.org/r/bfa0fdedfe25f65e5caa4e410f074ddbac7a0b59.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.kasan | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 879757b6dd149..1f3e620188a24 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -178,17 +178,17 @@ config KASAN_TAGS_IDENTIFY memory consumption. config KASAN_VMALLOC - bool "Back mappings in vmalloc space with real shadow memory" - depends on KASAN_GENERIC && HAVE_ARCH_KASAN_VMALLOC + bool "Check accesses to vmalloc allocations" + depends on HAVE_ARCH_KASAN_VMALLOC help - By default, the shadow region for vmalloc space is the read-only - zero page. This means that KASAN cannot detect errors involving - vmalloc space. - - Enabling this option will hook in to vmap/vmalloc and back those - mappings with real shadow memory allocated on demand. This allows - for KASAN to detect more sorts of errors (and to support vmapped - stacks), but at the cost of higher memory usage. + This mode makes KASAN check accesses to vmalloc allocations for + validity. + + With software KASAN modes, checking is done for all types of vmalloc + allocations. Enabling this option leads to higher memory usage. + + With hardware tag-based KASAN, only VM_ALLOC mappings are checked. + There is no additional memory usage. 
config KASAN_KUNIT_TEST tristate "KUnit-compatible tests of KASAN bug detection capabilities" if !KUNIT_ALL_TESTS From f6f37d9320a11e9059f11a99fc59dfb8e307c07f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:11:53 -0700 Subject: [PATCH 066/113] arm64: select KASAN_VMALLOC for SW/HW_TAGS modes Generic KASAN already selects KASAN_VMALLOC to allow VMAP_STACK to be selected unconditionally, see commit acc3042d62cb9 ("arm64: Kconfig: select KASAN_VMALLOC if KANSAN_GENERIC is enabled"). The same change is needed for SW_TAGS KASAN. HW_TAGS KASAN does not require enabling KASAN_VMALLOC for VMAP_STACK, they already work together as is. Still, selecting KASAN_VMALLOC still makes sense to make vmalloc() always protected. In case any bugs in KASAN's vmalloc() support are discovered, the command line kasan.vmalloc flag can be used to disable vmalloc() checking. Select KASAN_VMALLOC for all KASAN modes for arm64. Link: https://lkml.kernel.org/r/99d6b3ebf57fc1930ff71f9a4a71eea19881b270.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Catalin Marinas Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f8680892401b2..23048be0333bc 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -208,7 +208,7 @@ config ARM64 select IOMMU_DMA if IOMMU_SUPPORT select IRQ_DOMAIN select IRQ_FORCED_THREADING - select KASAN_VMALLOC if KASAN_GENERIC + select KASAN_VMALLOC if KASAN select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE select NEED_SG_DMA_LENGTH From 8479d7b5be2fecaed2b78331c3bf443cc19dfcf5 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:11:56 -0700 Subject: [PATCH 067/113] kasan: documentation updates Update KASAN documentation: - Bump Clang version requirement for HW_TAGS as ARM64_MTE depends on AS_HAS_LSE_ATOMICS as of commit 2decad92f4731 ("arm64: mte: Ensure TIF_MTE_ASYNC_FAULT is set atomically"), which requires Clang 12. - Add description of the new kasan.vmalloc command line flag. - Mention that SW_TAGS and HW_TAGS modes now support vmalloc tagging. - Explicitly say that the "Shadow memory" section is only applicable to software KASAN modes. - Mention that shadow-based KASAN_VMALLOC is supported on arm64. Link: https://lkml.kernel.org/r/a61189128fa3f9fbcfd9884ff653d401864b8e74.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 8089c559d339c..7614a1fc30fac 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -30,7 +30,7 @@ Software tag-based KASAN mode is only supported in Clang. The hardware KASAN mode (#3) relies on hardware to perform the checks but still requires a compiler version that supports memory tagging instructions. -This mode is supported in GCC 10+ and Clang 11+. 
+This mode is supported in GCC 10+ and Clang 12+. Both software KASAN modes work with SLUB and SLAB memory allocators, while the hardware tag-based KASAN currently only supports SLUB. @@ -206,6 +206,9 @@ additional boot parameters that allow disabling KASAN or controlling features: Asymmetric mode: a bad access is detected synchronously on reads and asynchronously on writes. +- ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc + allocations (default: ``on``). + - ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack traces collection (default: ``on``). @@ -279,8 +282,8 @@ Software tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently reserved to tag freed memory regions. -Software tag-based KASAN currently only supports tagging of slab and page_alloc -memory. +Software tag-based KASAN currently only supports tagging of slab, page_alloc, +and vmalloc memory. Hardware tag-based KASAN ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -303,8 +306,8 @@ Hardware tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently reserved to tag freed memory regions. -Hardware tag-based KASAN currently only supports tagging of slab and page_alloc -memory. +Hardware tag-based KASAN currently only supports tagging of slab, page_alloc, +and VM_ALLOC-based vmalloc memory. If the hardware does not support MTE (pre ARMv8.5), hardware tag-based KASAN will not be enabled. In this case, all KASAN boot parameters are ignored. @@ -319,6 +322,8 @@ checking gets disabled. Shadow memory ------------- +The contents of this section are only applicable to software KASAN modes. + The kernel maps memory in several different parts of the address space. The range of kernel virtual addresses is large: there is not enough real memory to support a real shadow region for every address that could be @@ -349,7 +354,7 @@ CONFIG_KASAN_VMALLOC With ``CONFIG_KASAN_VMALLOC``, KASAN can cover vmalloc space at the cost of greater memory usage. Currently, this is supported on x86, -riscv, s390, and powerpc. +arm64, riscv, s390, and powerpc. This works by hooking into vmalloc and vmap and dynamically allocating real shadow memory to back the mappings. From 1a2473f0cbc059f345952654e393df317f7e270c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:11:59 -0700 Subject: [PATCH 068/113] kasan: improve vmalloc tests Update the existing vmalloc_oob() test to account for the specifics of the tag-based modes. Also add a few new checks and comments. Add new vmalloc-related tests: - vmalloc_helpers_tags() to check that exported vmalloc helpers can handle tagged pointers. - vmap_tags() to check that SW_TAGS mode properly tags vmap() mappings. - vm_map_ram_tags() to check that SW_TAGS mode properly tags vm_map_ram() mappings. - vmalloc_percpu() to check that SW_TAGS mode tags regions allocated for __alloc_percpu(). The tagging of per-cpu mappings is best-effort; proper tagging is tracked in [1]. 
[1] https://bugzilla.kernel.org/show_bug.cgi?id=215019 [sfr@canb.auug.org.au: similar to "kasan: test: fix compatibility with FORTIFY_SOURCE"] Link: https://lkml.kernel.org/r/20220128144801.73f5ced0@canb.auug.org.au Link: https://lkml.kernel.org/r/865c91ba49b90623ab50c7526b79ccb955f544f0.1644950160.git.andreyknvl@google.com [andreyknvl@google.com: set_memory_rw/ro() are not exported to modules] Link: https://lkml.kernel.org/r/019ac41602e0c4a7dfe96dc8158a95097c2b2ebd.1645554036.git.andreyknvl@google.com [akpm@linux-foundation.org: fix build] Cc: Andrey Konovalov [andreyknvl@google.com: vmap_tags() and vm_map_ram_tags() pass invalid page array size] Link: https://lkml.kernel.org/r/bbdc1c0501c5275e7f26fdb8e2a7b14a40a9f36b.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Stephen Rothwell Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 194 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 188 insertions(+), 6 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 3b413f8c8a715..2addd0c66acfc 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -1057,21 +1058,186 @@ static void kmalloc_double_kzfree(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, kfree_sensitive(ptr)); } +static void vmalloc_helpers_tags(struct kunit *test) +{ + void *ptr; + + /* This test is intended for tag-based modes. */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + + ptr = vmalloc(PAGE_SIZE); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + /* Check that the returned pointer is tagged. */ + KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure exported vmalloc helpers handle tagged pointers. */ + KUNIT_ASSERT_TRUE(test, is_vmalloc_addr(ptr)); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, vmalloc_to_page(ptr)); + +#if !IS_MODULE(CONFIG_KASAN_KUNIT_TEST) + { + int rv; + + /* Make sure vmalloc'ed memory permissions can be changed. */ + rv = set_memory_ro((unsigned long)ptr, 1); + KUNIT_ASSERT_GE(test, rv, 0); + rv = set_memory_rw((unsigned long)ptr, 1); + KUNIT_ASSERT_GE(test, rv, 0); + } +#endif + + vfree(ptr); +} + static void vmalloc_oob(struct kunit *test) { - void *area; + char *v_ptr, *p_ptr; + struct page *page; + size_t size = PAGE_SIZE / 2 - KASAN_GRANULE_SIZE - 5; KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + v_ptr = vmalloc(size); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + OPTIMIZER_HIDE_VAR(v_ptr); + /* - * We have to be careful not to hit the guard page. + * We have to be careful not to hit the guard page in vmalloc tests. * The MMU will catch that and crash us. */ - area = vmalloc(3000); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, area); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)area)[3100]); - vfree(area); + /* Make sure in-bounds accesses are valid. */ + v_ptr[0] = 0; + v_ptr[size - 1] = 0; + + /* + * An unaligned access past the requested vmalloc size. + * Only generic KASAN can precisely detect these. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]); + + /* An aligned access into the first out-of-bounds granule. 
*/ + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size + 5]); + + /* Check that in-bounds accesses to the physical page are valid. */ + page = vmalloc_to_page(v_ptr); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page); + p_ptr = page_address(page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + p_ptr[0] = 0; + + vfree(v_ptr); + + /* + * We can't check for use-after-unmap bugs in this nor in the following + * vmalloc tests, as the page might be fully unmapped and accessing it + * will crash the kernel. + */ +} + +static void vmap_tags(struct kunit *test) +{ + char *p_ptr, *v_ptr; + struct page *p_page, *v_page; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons vmap mappings. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + + p_page = alloc_pages(GFP_KERNEL, 1); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_page); + p_ptr = page_address(p_page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + + v_ptr = vmap(&p_page, 1, VM_MAP, PAGE_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + /* + * We can't check for out-of-bounds bugs in this nor in the following + * vmalloc tests, as allocations have page granularity and accessing + * the guard page will crash the kernel. + */ + + KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses through both pointers work. */ + *p_ptr = 0; + *v_ptr = 0; + + /* Make sure vmalloc_to_page() correctly recovers the page pointer. */ + v_page = vmalloc_to_page(v_ptr); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_page); + KUNIT_EXPECT_PTR_EQ(test, p_page, v_page); + + vunmap(v_ptr); + free_pages((unsigned long)p_ptr, 1); +} + +static void vm_map_ram_tags(struct kunit *test) +{ + char *p_ptr, *v_ptr; + struct page *page; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons vm_map_ram mappings. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + page = alloc_pages(GFP_KERNEL, 1); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page); + p_ptr = page_address(page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + + v_ptr = vm_map_ram(&page, 1, -1); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses through both pointers work. */ + *p_ptr = 0; + *v_ptr = 0; + + vm_unmap_ram(v_ptr, 1); + free_pages((unsigned long)p_ptr, 1); +} + +static void vmalloc_percpu(struct kunit *test) +{ + char __percpu *ptr; + int cpu; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons percpu mappings. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + ptr = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); + + for_each_possible_cpu(cpu) { + char *c_ptr = per_cpu_ptr(ptr, cpu); + + KUNIT_EXPECT_GE(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses don't crash the kernel. 
*/ + *c_ptr = 0; + } + + free_percpu(ptr); } /* @@ -1105,6 +1271,18 @@ static void match_all_not_assigned(struct kunit *test) KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); free_pages((unsigned long)ptr, order); } + + if (!IS_ENABLED(CONFIG_KASAN_VMALLOC)) + return; + + for (i = 0; i < 256; i++) { + size = (get_random_int() % 1024) + 1; + ptr = vmalloc(size); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + vfree(ptr); + } } /* Check that 0xff works as a match-all pointer tag for tag-based modes. */ @@ -1210,7 +1388,11 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kasan_bitops_generic), KUNIT_CASE(kasan_bitops_tags), KUNIT_CASE(kmalloc_double_kzfree), + KUNIT_CASE(vmalloc_helpers_tags), KUNIT_CASE(vmalloc_oob), + KUNIT_CASE(vmap_tags), + KUNIT_CASE(vm_map_ram_tags), + KUNIT_CASE(vmalloc_percpu), KUNIT_CASE(match_all_not_assigned), KUNIT_CASE(match_all_ptr_tag), KUNIT_CASE(match_all_mem_tag), From ed6d74446cbfb88c747d4b32477a9d46132694db Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:12:02 -0700 Subject: [PATCH 069/113] kasan: test: support async (again) and asymm modes for HW_TAGS Async mode support has already been implemented in commit e80a76aa1a91 ("kasan, arm64: tests supports for HW_TAGS async mode") but then got accidentally broken in commit 99734b535d9b ("kasan: detect false-positives in tests"). Restore the changes removed by the latter patch and adapt them for asymm mode: add a sync_fault flag to kunit_kasan_expectation that only get set if the MTE fault was synchronous, and reenable MTE on such faults in tests. Also rename kunit_kasan_expectation to kunit_kasan_status and move its definition to mm/kasan/kasan.h from include/linux/kasan.h, as this structure is only internally used by KASAN. Also put the structure definition under IS_ENABLED(CONFIG_KUNIT). 
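As an illustration of the intended flow (a condensed sketch only, not part of the patch: the wrapper function name is hypothetical, while the struct and the KASAN helpers mirror the diff that follows), a test harness consumes the two status fields roughly like this:

/*
 * Condensed sketch of the post-access handling in KUNIT_EXPECT_KASAN_FAIL
 * for HW_TAGS: force any pending async/asymm fault to be reported, then
 * re-enable tag checking only if the report came from a synchronous fault.
 */
struct kunit_kasan_status {
	bool report_found;	/* a KASAN report was produced */
	bool sync_fault;	/* the underlying MTE fault was synchronous */
};

static void kasan_test_check_fault(struct kunit_kasan_status *status)
{
	/* Async and asymm modes may have a fault pending: surface it now. */
	if (kasan_async_fault_possible())
		kasan_force_async_fault();

	/*
	 * Only a synchronous fault auto-disables tag checking, so only then
	 * does it need to be re-enabled before the next test case runs.
	 */
	if (READ_ONCE(status->report_found) && READ_ONCE(status->sync_fault))
		kasan_enable_tagging();

	WRITE_ONCE(status->report_found, false);
}
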
Link: https://lkml.kernel.org/r/133970562ccacc93ba19d754012c562351d4a8c8.1645033139.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Vincenzo Frascino Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 5 ----- lib/test_kasan.c | 39 ++++++++++++++++++++++----------------- mm/kasan/hw_tags.c | 18 +++++++++--------- mm/kasan/kasan.h | 14 ++++++++++++-- mm/kasan/report.c | 17 +++++++++-------- 5 files changed, 52 insertions(+), 41 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 903b9453931dc..fe36215807f7a 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -19,11 +19,6 @@ struct task_struct; #include #include -/* kasan_data struct is used in KUnit tests for KASAN expected failures */ -struct kunit_kasan_expectation { - bool report_found; -}; - #endif typedef unsigned int __bitwise kasan_vmalloc_flags_t; diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 2addd0c66acfc..72391f9926894 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -37,7 +37,7 @@ void *kasan_ptr_result; int kasan_int_result; static struct kunit_resource resource; -static struct kunit_kasan_expectation fail_data; +static struct kunit_kasan_status test_status; static bool multishot; /* @@ -54,58 +54,63 @@ static int kasan_test_init(struct kunit *test) } multishot = kasan_save_enable_multi_shot(); - fail_data.report_found = false; + test_status.report_found = false; + test_status.sync_fault = false; kunit_add_named_resource(test, NULL, NULL, &resource, - "kasan_data", &fail_data); + "kasan_status", &test_status); return 0; } static void kasan_test_exit(struct kunit *test) { kasan_restore_multi_shot(multishot); - KUNIT_EXPECT_FALSE(test, fail_data.report_found); + KUNIT_EXPECT_FALSE(test, test_status.report_found); } /** * KUNIT_EXPECT_KASAN_FAIL() - check that the executed expression produces a * KASAN report; causes a test failure otherwise. This relies on a KUnit - * resource named "kasan_data". Do not use this name for KUnit resources + * resource named "kasan_status". Do not use this name for KUnit resources * outside of KASAN tests. * - * For hardware tag-based KASAN in sync mode, when a tag fault happens, tag + * For hardware tag-based KASAN, when a synchronous tag fault happens, tag * checking is auto-disabled. When this happens, this test handler reenables * tag checking. As tag checking can be only disabled or enabled per CPU, * this handler disables migration (preemption). * - * Since the compiler doesn't see that the expression can change the fail_data + * Since the compiler doesn't see that the expression can change the test_status * fields, it can reorder or optimize away the accesses to those fields. * Use READ/WRITE_ONCE() for the accesses and compiler barriers around the * expression to prevent that. * - * In between KUNIT_EXPECT_KASAN_FAIL checks, fail_data.report_found is kept as - * false. This allows detecting KASAN reports that happen outside of the checks - * by asserting !fail_data.report_found at the start of KUNIT_EXPECT_KASAN_FAIL - * and in kasan_test_exit. + * In between KUNIT_EXPECT_KASAN_FAIL checks, test_status.report_found is kept + * as false. This allows detecting KASAN reports that happen outside of the + * checks by asserting !test_status.report_found at the start of + * KUNIT_EXPECT_KASAN_FAIL and in kasan_test_exit. 
*/ #define KUNIT_EXPECT_KASAN_FAIL(test, expression) do { \ if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \ kasan_sync_fault_possible()) \ migrate_disable(); \ - KUNIT_EXPECT_FALSE(test, READ_ONCE(fail_data.report_found)); \ + KUNIT_EXPECT_FALSE(test, READ_ONCE(test_status.report_found)); \ barrier(); \ expression; \ barrier(); \ - if (!READ_ONCE(fail_data.report_found)) { \ + if (kasan_async_fault_possible()) \ + kasan_force_async_fault(); \ + if (!READ_ONCE(test_status.report_found)) { \ KUNIT_FAIL(test, KUNIT_SUBTEST_INDENT "KASAN failure " \ "expected in \"" #expression \ "\", but none occurred"); \ } \ - if (IS_ENABLED(CONFIG_KASAN_HW_TAGS)) { \ - if (READ_ONCE(fail_data.report_found)) \ - kasan_enable_tagging_sync(); \ + if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \ + kasan_sync_fault_possible()) { \ + if (READ_ONCE(test_status.report_found) && \ + READ_ONCE(test_status.sync_fault)) \ + kasan_enable_tagging(); \ migrate_enable(); \ } \ - WRITE_ONCE(fail_data.report_found, false); \ + WRITE_ONCE(test_status.report_found, false); \ } while (0) #define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \ diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index fad1887e54c05..07a76c46daa5a 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -172,12 +172,7 @@ void kasan_init_hw_tags_cpu(void) * Enable async or asymm modes only when explicitly requested * through the command line. */ - if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC) - hw_enable_tagging_async(); - else if (kasan_arg_mode == KASAN_ARG_MODE_ASYMM) - hw_enable_tagging_asymm(); - else - hw_enable_tagging_sync(); + kasan_enable_tagging(); } /* kasan_init_hw_tags() is called once on boot CPU. */ @@ -343,11 +338,16 @@ void __kasan_poison_vmalloc(const void *start, unsigned long size) #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) -void kasan_enable_tagging_sync(void) +void kasan_enable_tagging(void) { - hw_enable_tagging_sync(); + if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC) + hw_enable_tagging_async(); + else if (kasan_arg_mode == KASAN_ARG_MODE_ASYMM) + hw_enable_tagging_asymm(); + else + hw_enable_tagging_sync(); } -EXPORT_SYMBOL_GPL(kasan_enable_tagging_sync); +EXPORT_SYMBOL_GPL(kasan_enable_tagging); void kasan_force_async_fault(void) { diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 4d67408e84076..d1e111b7d5d81 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -7,6 +7,16 @@ #include #include +#if IS_ENABLED(CONFIG_KUNIT) + +/* Used in KUnit-compatible KASAN tests. 
*/ +struct kunit_kasan_status { + bool report_found; + bool sync_fault; +}; + +#endif + #ifdef CONFIG_KASAN_HW_TAGS #include @@ -350,12 +360,12 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) -void kasan_enable_tagging_sync(void); +void kasan_enable_tagging(void); void kasan_force_async_fault(void); #else /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */ -static inline void kasan_enable_tagging_sync(void) { } +static inline void kasan_enable_tagging(void) { } static inline void kasan_force_async_fault(void) { } #endif /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */ diff --git a/mm/kasan/report.c b/mm/kasan/report.c index f14146563d412..137c2c0b09db6 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -336,20 +336,21 @@ static bool report_enabled(void) } #if IS_ENABLED(CONFIG_KUNIT) -static void kasan_update_kunit_status(struct kunit *cur_test) +static void kasan_update_kunit_status(struct kunit *cur_test, bool sync) { struct kunit_resource *resource; - struct kunit_kasan_expectation *kasan_data; + struct kunit_kasan_status *status; - resource = kunit_find_named_resource(cur_test, "kasan_data"); + resource = kunit_find_named_resource(cur_test, "kasan_status"); if (!resource) { kunit_set_failure(cur_test); return; } - kasan_data = (struct kunit_kasan_expectation *)resource->data; - WRITE_ONCE(kasan_data->report_found, true); + status = (struct kunit_kasan_status *)resource->data; + WRITE_ONCE(status->report_found, true); + WRITE_ONCE(status->sync_fault, sync); kunit_put_resource(resource); } #endif /* IS_ENABLED(CONFIG_KUNIT) */ @@ -363,7 +364,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip) #if IS_ENABLED(CONFIG_KUNIT) if (current->kunit_test) - kasan_update_kunit_status(current->kunit_test); + kasan_update_kunit_status(current->kunit_test, true); #endif /* IS_ENABLED(CONFIG_KUNIT) */ start_report(&flags); @@ -383,7 +384,7 @@ void kasan_report_async(void) #if IS_ENABLED(CONFIG_KUNIT) if (current->kunit_test) - kasan_update_kunit_status(current->kunit_test); + kasan_update_kunit_status(current->kunit_test, false); #endif /* IS_ENABLED(CONFIG_KUNIT) */ start_report(&flags); @@ -405,7 +406,7 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write, #if IS_ENABLED(CONFIG_KUNIT) if (current->kunit_test) - kasan_update_kunit_status(current->kunit_test); + kasan_update_kunit_status(current->kunit_test, true); #endif /* IS_ENABLED(CONFIG_KUNIT) */ disable_trace_on_warning(); From 09eb911d934561238d45fe298bf5d920759fa572 Mon Sep 17 00:00:00 2001 From: tangmeng Date: Thu, 24 Mar 2022 18:12:05 -0700 Subject: [PATCH 070/113] mm/kasan: remove unnecessary CONFIG_KASAN option In mm/Makefile has: obj-$(CONFIG_KASAN) += kasan/ So that we don't need 'obj-$(CONFIG_KASAN) :=' in mm/kasan/Makefile, delete it from mm/kasan/Makefile. 
Link: https://lkml.kernel.org/r/20220221065421.20689-1-tangmeng@uniontech.com Signed-off-by: tangmeng Reviewed-by: Marco Elver Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitriy Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index adcd9acaef615..1f84df9c302e7 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -35,7 +35,7 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) -obj-$(CONFIG_KASAN) := common.o report.o +obj-y := common.o report.o obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o tags.o report_tags.o obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o tags.o report_tags.o From 2dfd1bd992322f7ab8db82d7862cb758c3be63fa Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 24 Mar 2022 18:12:08 -0700 Subject: [PATCH 071/113] kasan: update function name in comments The function kasan_global_oob was renamed to kasan_global_oob_right, but the comments referring to it were not updated. Do so. Link: https://linux-review.googlesource.com/id/I20faa90126937bbee77d9d44709556c3dd4b40be Link: https://lkml.kernel.org/r/20220219012433.890941-1-pcc@google.com Signed-off-by: Peter Collingbourne Reviewed-by: Miaohe Lin Reviewed-by: Marco Elver Reviewed-by: Andrey Konovalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 72391f9926894..ad880231dfa84 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -786,7 +786,7 @@ static void ksize_uaf(struct kunit *test) static void kasan_stack_oob(struct kunit *test) { char stack_array[10]; - /* See comment in kasan_global_oob. */ + /* See comment in kasan_global_oob_right. */ char *volatile array = stack_array; char *p = &array[ARRAY_SIZE(stack_array) + OOB_TAG_OFF]; @@ -799,7 +799,7 @@ static void kasan_alloca_oob_left(struct kunit *test) { volatile int i = 10; char alloca_array[i]; - /* See comment in kasan_global_oob. */ + /* See comment in kasan_global_oob_right. */ char *volatile array = alloca_array; char *p = array - 1; @@ -814,7 +814,7 @@ static void kasan_alloca_oob_right(struct kunit *test) { volatile int i = 10; char alloca_array[i]; - /* See comment in kasan_global_oob. */ + /* See comment in kasan_global_oob_right. */ char *volatile array = alloca_array; char *p = array + i; From c056a364e9546bd513d1f5205f0ee316d8acb910 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:12:11 -0700 Subject: [PATCH 072/113] kasan: print virtual mapping info in reports Print virtual mapping range and its creator in reports affecting virtual mappings. Also get physical page pointer for such mappings, so page information gets printed as well. 
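For illustration, a condensed sketch of how the report path can resolve a vmalloc address (the full change is in the diff below; the wrapper function name here is hypothetical, the vmalloc helpers and printed fields mirror the patch):

/*
 * Sketch: recover the vmalloc area and its backing page for a buggy
 * address, so both the virtual mapping and the physical page can be
 * described in the report.
 */
static struct page *describe_vmalloc_mapping(const void *addr)
{
	struct vm_struct *va;

	if (!is_vmalloc_addr(addr))
		return NULL;

	va = find_vm_area(addr);
	if (!va)
		return NULL;

	pr_err("The buggy address belongs to the virtual mapping at\n"
	       " [%px, %px) created by:\n"
	       " %pS\n",
	       va->addr, va->addr + va->size, va->caller);

	/* Let the caller dump_page() the physical page behind the mapping. */
	return vmalloc_to_page(addr);
}
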
Link: https://lkml.kernel.org/r/6ebb11210ae21253198e264d4bb0752c1fad67d7.1645548178.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Mark Rutland Cc: Marco Elver Cc: Alexander Potapenko Cc: Dmitriy Vyukov Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 137c2c0b09db6..f64352008bb85 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -260,8 +260,21 @@ static void print_address_description(void *addr, u8 tag) pr_err(" %pS\n", addr); } + if (is_vmalloc_addr(addr)) { + struct vm_struct *va = find_vm_area(addr); + + if (va) { + pr_err("The buggy address belongs to the virtual mapping at\n" + " [%px, %px) created by:\n" + " %pS\n", + va->addr, va->addr + va->size, va->caller); + + page = vmalloc_to_page(page); + } + } + if (page) { - pr_err("The buggy address belongs to the page:\n"); + pr_err("The buggy address belongs to the physical page:\n"); dump_page(page, "kasan: bad access detected"); } From 7131c883f995df9288f70879e501016119d4d971 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:12:14 -0700 Subject: [PATCH 073/113] kasan: drop addr check from describe_object_addr Patch series "kasan: report clean-ups and improvements". A number of clean-up patches for KASAN reporting code. Most are non-functional and only improve readability. This patch (of 22): describe_object_addr() used to be called with NULL addr in the early days of KASAN. This no longer happens, so drop the check. Link: https://lkml.kernel.org/r/cover.1646237226.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/761f8e5a6ee040d665934d916a90afe9f322f745.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Marco Elver Cc: Dmitry Vyukov Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index f64352008bb85..607a8c2e46743 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -162,9 +162,6 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, " which belongs to the cache %s of size %d\n", object, cache->name, cache->object_size); - if (!addr) - return; - if (access_addr < object_addr) { rel_type = "to the left"; rel_bytes = object_addr - access_addr; From 038fd2b4cb42c64afa3cacad53f9e0f514b8d02a Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:12:17 -0700 Subject: [PATCH 074/113] kasan: more line breaks in reports Add a line break after each part that describes the buggy address. Improves readability of reports. 
Link: https://lkml.kernel.org/r/8682c4558e533cd0f99bdb964ce2fe741f2a9212.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 607a8c2e46743..ded648c0a0e4e 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -250,11 +250,13 @@ static void print_address_description(void *addr, u8 tag) void *object = nearest_obj(cache, slab, addr); describe_object(cache, object, addr, tag); + pr_err("\n"); } if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) { pr_err("The buggy address belongs to the variable:\n"); pr_err(" %pS\n", addr); + pr_err("\n"); } if (is_vmalloc_addr(addr)) { @@ -265,6 +267,7 @@ static void print_address_description(void *addr, u8 tag) " [%px, %px) created by:\n" " %pS\n", va->addr, va->addr + va->size, va->caller); + pr_err("\n"); page = vmalloc_to_page(page); } @@ -273,9 +276,11 @@ static void print_address_description(void *addr, u8 tag) if (page) { pr_err("The buggy address belongs to the physical page:\n"); dump_page(page, "kasan: bad access detected"); + pr_err("\n"); } kasan_print_address_stack_frame(addr); + pr_err("\n"); } static bool meta_row_is_guilty(const void *row, const void *addr) @@ -382,7 +387,6 @@ void kasan_report_invalid_free(void *object, unsigned long ip) kasan_print_tags(tag, object); pr_err("\n"); print_address_description(object, tag); - pr_err("\n"); print_memory_metadata(object); end_report(&flags, (unsigned long)object); } @@ -443,7 +447,6 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write, if (addr_has_metadata(untagged_addr)) { print_address_description(untagged_addr, get_tag(tagged_addr)); - pr_err("\n"); print_memory_metadata(info.first_bad_addr); } else { dump_stack_lvl(KERN_ERR); From 0f9b35f38318d21470975bec61e0e93b4aab5c3c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:12:20 -0700 Subject: [PATCH 075/113] kasan: rearrange stack frame info in reports - Move printing stack frame info before printing page info. - Add object_is_on_stack() check to print_address_description() and add a corresponding WARNING to kasan_print_address_stack_frame(). This looks more in line with the rest of the checks in this function and also allows to avoid complicating code logic wrt line breaks. - Clean up comments related to get_address_stack_frame_info(). Link: https://lkml.kernel.org/r/1ee113a4c111df97d168c820b527cda77a3cac40.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 12 +++++++++--- mm/kasan/report_generic.c | 15 ++++----------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ded648c0a0e4e..d60ee8b81e2b4 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -259,6 +259,15 @@ static void print_address_description(void *addr, u8 tag) pr_err("\n"); } + if (object_is_on_stack(addr)) { + /* + * Currently, KASAN supports printing frame information only + * for accesses to the task's own stack. 
+ */ + kasan_print_address_stack_frame(addr); + pr_err("\n"); + } + if (is_vmalloc_addr(addr)) { struct vm_struct *va = find_vm_area(addr); @@ -278,9 +287,6 @@ static void print_address_description(void *addr, u8 tag) dump_page(page, "kasan: bad access detected"); pr_err("\n"); } - - kasan_print_address_stack_frame(addr); - pr_err("\n"); } static bool meta_row_is_guilty(const void *row, const void *addr) diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 139615ef326b9..3751391ff11a4 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -211,6 +211,7 @@ static void print_decoded_frame_descr(const char *frame_descr) } } +/* Returns true only if the address is on the current task's stack. */ static bool __must_check get_address_stack_frame_info(const void *addr, unsigned long *offset, const char **frame_descr, @@ -224,13 +225,6 @@ static bool __must_check get_address_stack_frame_info(const void *addr, BUILD_BUG_ON(IS_ENABLED(CONFIG_STACK_GROWSUP)); - /* - * NOTE: We currently only support printing frame information for - * accesses to the task's own stack. - */ - if (!object_is_on_stack(addr)) - return false; - aligned_addr = round_down((unsigned long)addr, sizeof(long)); mem_ptr = round_down(aligned_addr, KASAN_GRANULE_SIZE); shadow_ptr = kasan_mem_to_shadow((void *)aligned_addr); @@ -269,14 +263,13 @@ void kasan_print_address_stack_frame(const void *addr) const char *frame_descr; const void *frame_pc; + if (WARN_ON(!object_is_on_stack(addr))) + return; + if (!get_address_stack_frame_info(addr, &offset, &frame_descr, &frame_pc)) return; - /* - * get_address_stack_frame_info only returns true if the given addr is - * on the current task's stack. - */ pr_err("\n"); pr_err("addr %px is located in stack of task %s/%d at offset %lu in frame:\n", addr, current->comm, task_pid_nr(current), offset); From 16347c31890375d2a7eb9efb01d2baac28742de3 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:12:23 -0700 Subject: [PATCH 076/113] kasan: improve stack frame info in reports - Print at least task name and id for reports affecting allocas (get_address_stack_frame_info() does not support them). - Capitalize first letter of each sentence. Link: https://lkml.kernel.org/r/aa613f097c12f7b75efb17f2618ae00480fb4bc3.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report_generic.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 3751391ff11a4..7e03cca569a7e 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -180,7 +180,7 @@ static void print_decoded_frame_descr(const char *frame_descr) return; pr_err("\n"); - pr_err("this frame has %lu %s:\n", num_objects, + pr_err("This frame has %lu %s:\n", num_objects, num_objects == 1 ? 
"object" : "objects"); while (num_objects--) { @@ -266,13 +266,14 @@ void kasan_print_address_stack_frame(const void *addr) if (WARN_ON(!object_is_on_stack(addr))) return; + pr_err("The buggy address belongs to stack of task %s/%d\n", + current->comm, task_pid_nr(current)); + if (!get_address_stack_frame_info(addr, &offset, &frame_descr, &frame_pc)) return; - pr_err("\n"); - pr_err("addr %px is located in stack of task %s/%d at offset %lu in frame:\n", - addr, current->comm, task_pid_nr(current), offset); + pr_err(" and is located at offset %lu in frame:\n", offset); pr_err(" %pS\n", frame_pc); if (!frame_descr) From 1e0f611fab9c5743719a95e09a2846e4052fd644 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:12:25 -0700 Subject: [PATCH 077/113] kasan: print basic stack frame info for SW_TAGS Software Tag-Based mode tags stack allocations when CONFIG_KASAN_STACK is enabled. Print task name and id in reports for stack-related bugs. [andreyknvl@google.com: include linux/sched/task_stack.h] Link: https://lkml.kernel.org/r/d7598f11a34ed96e508f7640fa038662ed2305ec.1647099922.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/029aaa87ceadde0702f3312a34697c9139c9fb53.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.h | 2 +- mm/kasan/report_sw_tags.c | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index d1e111b7d5d81..4447df0d7343f 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -274,7 +274,7 @@ void *kasan_find_first_bad_addr(void *addr, size_t size); const char *kasan_get_bug_type(struct kasan_access_info *info); void kasan_metadata_fetch_row(char *buffer, void *row); -#if defined(CONFIG_KASAN_GENERIC) && defined(CONFIG_KASAN_STACK) +#if defined(CONFIG_KASAN_STACK) void kasan_print_address_stack_frame(const void *addr); #else static inline void kasan_print_address_stack_frame(const void *addr) { } diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c index d2298c3578349..7271f0988fc0d 100644 --- a/mm/kasan/report_sw_tags.c +++ b/mm/kasan/report_sw_tags.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -51,3 +52,14 @@ void kasan_print_tags(u8 addr_tag, const void *addr) pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", addr_tag, *shadow); } + +#ifdef CONFIG_KASAN_STACK +void kasan_print_address_stack_frame(const void *addr) +{ + if (WARN_ON(!object_is_on_stack(addr))) + return; + + pr_err("The buggy address belongs to stack of task %s/%d\n", + current->comm, task_pid_nr(current)); +} +#endif From 476b1dc2bc34febc620b8a6cd459f0b3d9950c34 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:12:28 -0700 Subject: [PATCH 078/113] kasan: simplify async check in end_report() Currently, end_report() does not call trace_error_report_end() for bugs detected in either async or asymm mode (when kasan_async_fault_possible() returns true), as the address of the bad access might be unknown. However, for asymm mode, the address is known for faults triggered by read operations. Instead of using kasan_async_fault_possible(), simply check that the addr is not NULL when calling trace_error_report_end(). 
Link: https://lkml.kernel.org/r/1c8ce43f97300300e62c941181afa2eb738965c5.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index d60ee8b81e2b4..2d892ec050beb 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -112,7 +112,7 @@ static void start_report(unsigned long *flags) static void end_report(unsigned long *flags, unsigned long addr) { - if (!kasan_async_fault_possible()) + if (addr) trace_error_report_end(ERROR_DETECTOR_KASAN, addr); pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); From 3784c299eafc6dd1ca5a23f4020f18ca565e07af Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:12:31 -0700 Subject: [PATCH 079/113] kasan: simplify kasan_update_kunit_status() and call sites - Rename kasan_update_kunit_status() to update_kunit_status() (the function is static). - Move the IS_ENABLED(CONFIG_KUNIT) to the function's definition instead of duplicating it at call sites. - Obtain and check current->kunit_test within the function. Link: https://lkml.kernel.org/r/dac26d811ae31856c3d7666de0b108a3735d962d.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 2d892ec050beb..59db81211b8a7 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -357,24 +357,31 @@ static bool report_enabled(void) } #if IS_ENABLED(CONFIG_KUNIT) -static void kasan_update_kunit_status(struct kunit *cur_test, bool sync) +static void update_kunit_status(bool sync) { + struct kunit *test; struct kunit_resource *resource; struct kunit_kasan_status *status; - resource = kunit_find_named_resource(cur_test, "kasan_status"); + test = current->kunit_test; + if (!test) + return; + resource = kunit_find_named_resource(test, "kasan_status"); if (!resource) { - kunit_set_failure(cur_test); + kunit_set_failure(test); return; } status = (struct kunit_kasan_status *)resource->data; WRITE_ONCE(status->report_found, true); WRITE_ONCE(status->sync_fault, sync); + kunit_put_resource(resource); } -#endif /* IS_ENABLED(CONFIG_KUNIT) */ +#else +static void update_kunit_status(bool sync) { } +#endif void kasan_report_invalid_free(void *object, unsigned long ip) { @@ -383,10 +390,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip) object = kasan_reset_tag(object); -#if IS_ENABLED(CONFIG_KUNIT) - if (current->kunit_test) - kasan_update_kunit_status(current->kunit_test, true); -#endif /* IS_ENABLED(CONFIG_KUNIT) */ + update_kunit_status(true); start_report(&flags); pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); @@ -402,10 +406,7 @@ void kasan_report_async(void) { unsigned long flags; -#if IS_ENABLED(CONFIG_KUNIT) - if (current->kunit_test) - kasan_update_kunit_status(current->kunit_test, false); -#endif /* IS_ENABLED(CONFIG_KUNIT) */ + update_kunit_status(false); start_report(&flags); pr_err("BUG: KASAN: invalid-access\n"); @@ -424,10 +425,7 @@ static void __kasan_report(unsigned long addr, size_t 
size, bool is_write, void *untagged_addr; unsigned long flags; -#if IS_ENABLED(CONFIG_KUNIT) - if (current->kunit_test) - kasan_update_kunit_status(current->kunit_test, true); -#endif /* IS_ENABLED(CONFIG_KUNIT) */ + update_kunit_status(true); disable_trace_on_warning(); From 49d9977ac909a98a9bbd689a64b1d872ac10215b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:12:34 -0700 Subject: [PATCH 080/113] kasan: check CONFIG_KASAN_KUNIT_TEST instead of CONFIG_KUNIT Check the more specific CONFIG_KASAN_KUNIT_TEST config option when defining things related to KUnit-compatible KASAN tests instead of CONFIG_KUNIT. Also put the kunit_kasan_status definition next to the definitons of other KASAN-related structs. Link: https://lkml.kernel.org/r/223592d38d2a601a160a3b2b3d5a9f9090350e62.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.h | 18 ++++++++---------- mm/kasan/report.c | 2 +- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 4447df0d7343f..cc7162a9f3043 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -7,16 +7,6 @@ #include #include -#if IS_ENABLED(CONFIG_KUNIT) - -/* Used in KUnit-compatible KASAN tests. */ -struct kunit_kasan_status { - bool report_found; - bool sync_fault; -}; - -#endif - #ifdef CONFIG_KASAN_HW_TAGS #include @@ -224,6 +214,14 @@ struct kasan_free_meta { #endif }; +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) +/* Used in KUnit-compatible KASAN tests. */ +struct kunit_kasan_status { + bool report_found; + bool sync_fault; +}; +#endif + struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, const void *object); #ifdef CONFIG_KASAN_GENERIC diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 59db81211b8a7..93543157d3e16 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -356,7 +356,7 @@ static bool report_enabled(void) return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); } -#if IS_ENABLED(CONFIG_KUNIT) +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) static void update_kunit_status(bool sync) { struct kunit *test; From a260d2814e6d294e878840d9c6fdeca516205096 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:12:37 -0700 Subject: [PATCH 081/113] kasan: move update_kunit_status to start_report Instead of duplicating calls to update_kunit_status() in every error report routine, call it once in start_report(). Pass the sync flag as an additional argument to start_report(). 
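Condensed sketch of the resulting shape (the actual change is in the diff below; only the banner string is abbreviated here):

static void start_report(unsigned long *flags, bool sync)
{
	/* Update status of the currently running KASAN test, if any. */
	update_kunit_status(sync);
	/* Make sure reporting code does not end up in a loop. */
	kasan_disable_current();
	spin_lock_irqsave(&report_lock, *flags);
	pr_err("====================\n");
}

/*
 * Callers now only say whether the fault was synchronous:
 * kasan_report_invalid_free() and kasan_report() pass true,
 * kasan_report_async() passes false.
 */
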
Link: https://lkml.kernel.org/r/cae5c845a0b6f3c867014e53737cdac56b11edc7.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 75 +++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 41 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 93543157d3e16..0b6c8a14f0ea0 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -98,13 +98,40 @@ static void print_error_description(struct kasan_access_info *info) info->access_addr, current->comm, task_pid_nr(current)); } +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) +static void update_kunit_status(bool sync) +{ + struct kunit *test; + struct kunit_resource *resource; + struct kunit_kasan_status *status; + + test = current->kunit_test; + if (!test) + return; + + resource = kunit_find_named_resource(test, "kasan_status"); + if (!resource) { + kunit_set_failure(test); + return; + } + + status = (struct kunit_kasan_status *)resource->data; + WRITE_ONCE(status->report_found, true); + WRITE_ONCE(status->sync_fault, sync); + + kunit_put_resource(resource); +} +#else +static void update_kunit_status(bool sync) { } +#endif + static DEFINE_SPINLOCK(report_lock); -static void start_report(unsigned long *flags) +static void start_report(unsigned long *flags, bool sync) { - /* - * Make sure we don't end up in loop. - */ + /* Update status of the currently running KASAN test. */ + update_kunit_status(sync); + /* Make sure we don't end up in loop. */ kasan_disable_current(); spin_lock_irqsave(&report_lock, *flags); pr_err("==================================================================\n"); @@ -356,33 +383,6 @@ static bool report_enabled(void) return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); } -#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) -static void update_kunit_status(bool sync) -{ - struct kunit *test; - struct kunit_resource *resource; - struct kunit_kasan_status *status; - - test = current->kunit_test; - if (!test) - return; - - resource = kunit_find_named_resource(test, "kasan_status"); - if (!resource) { - kunit_set_failure(test); - return; - } - - status = (struct kunit_kasan_status *)resource->data; - WRITE_ONCE(status->report_found, true); - WRITE_ONCE(status->sync_fault, sync); - - kunit_put_resource(resource); -} -#else -static void update_kunit_status(bool sync) { } -#endif - void kasan_report_invalid_free(void *object, unsigned long ip) { unsigned long flags; @@ -390,9 +390,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip) object = kasan_reset_tag(object); - update_kunit_status(true); - - start_report(&flags); + start_report(&flags, true); pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); kasan_print_tags(tag, object); pr_err("\n"); @@ -406,9 +404,7 @@ void kasan_report_async(void) { unsigned long flags; - update_kunit_status(false); - - start_report(&flags); + start_report(&flags, false); pr_err("BUG: KASAN: invalid-access\n"); pr_err("Asynchronous mode enabled: no access details available\n"); pr_err("\n"); @@ -425,9 +421,8 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write, void *untagged_addr; unsigned long flags; - update_kunit_status(true); - disable_trace_on_warning(); + start_report(&flags, true); tagged_addr = (void *)addr; untagged_addr = kasan_reset_tag(tagged_addr); @@ -442,8 +437,6 @@ static void __kasan_report(unsigned long addr, size_t size, 
bool is_write, info.is_write = is_write; info.ip = ip; - start_report(&flags); - print_error_description(&info); if (addr_has_metadata(untagged_addr)) kasan_print_tags(get_tag(tagged_addr), info.first_bad_addr); From 0a6e8a07dec75f7b013b70c2df3e69c64edf312f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:12:40 -0700 Subject: [PATCH 082/113] kasan: move disable_trace_on_warning to start_report Move the disable_trace_on_warning() call, which enables the /proc/sys/kernel/traceoff_on_warning interface for KASAN bugs, to start_report(), so that it functions for all types of KASAN reports. Link: https://lkml.kernel.org/r/7c066c5de26234ad2cebdd931adfe437f8a95d58.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 0b6c8a14f0ea0..9286ff6ae1a7b 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -129,6 +129,8 @@ static DEFINE_SPINLOCK(report_lock); static void start_report(unsigned long *flags, bool sync) { + /* Respect the /proc/sys/kernel/traceoff_on_warning interface. */ + disable_trace_on_warning(); /* Update status of the currently running KASAN test. */ update_kunit_status(sync); /* Make sure we don't end up in loop. */ @@ -421,7 +423,6 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write, void *untagged_addr; unsigned long flags; - disable_trace_on_warning(); start_report(&flags, true); tagged_addr = (void *)addr; From 9d7b7dd946924de43021f57a8bee122ff0744d93 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:12:43 -0700 Subject: [PATCH 083/113] kasan: split out print_report from __kasan_report Split out the part of __kasan_report() that prints things into print_report(). One of the subsequent patches makes another error handler use print_report() as well. Includes lower-level changes: - Allow addr_has_metadata() accepting a tagged address. - Drop the const qualifier from the fields of kasan_access_info to avoid excessive type casts. - Change the type of the address argument of __kasan_report() and end_report() to void * to reduce the number of type casts. 
Link: https://lkml.kernel.org/r/9be3ed99dd24b9c4e1c4a848b69a0c6ecefd845e.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.h | 7 +++--- mm/kasan/report.c | 58 +++++++++++++++++++++++++---------------------- 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index cc7162a9f3043..40b863e289ec3 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -128,8 +128,8 @@ static inline bool kasan_sync_fault_possible(void) #define META_ROWS_AROUND_ADDR 2 struct kasan_access_info { - const void *access_addr; - const void *first_bad_addr; + void *access_addr; + void *first_bad_addr; size_t access_size; bool is_write; unsigned long ip; @@ -239,7 +239,8 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) static inline bool addr_has_metadata(const void *addr) { - return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); + return (kasan_reset_tag(addr) >= + kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); } /** diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 9286ff6ae1a7b..bb4c29b439b1c 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -139,10 +139,11 @@ static void start_report(unsigned long *flags, bool sync) pr_err("==================================================================\n"); } -static void end_report(unsigned long *flags, unsigned long addr) +static void end_report(unsigned long *flags, void *addr) { if (addr) - trace_error_report_end(ERROR_DETECTOR_KASAN, addr); + trace_error_report_end(ERROR_DETECTOR_KASAN, + (unsigned long)addr); pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); @@ -398,7 +399,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip) pr_err("\n"); print_address_description(object, tag); print_memory_metadata(object); - end_report(&flags, (unsigned long)object); + end_report(&flags, object); } #ifdef CONFIG_KASAN_HW_TAGS @@ -411,44 +412,47 @@ void kasan_report_async(void) pr_err("Asynchronous mode enabled: no access details available\n"); pr_err("\n"); dump_stack_lvl(KERN_ERR); - end_report(&flags, 0); + end_report(&flags, NULL); } #endif /* CONFIG_KASAN_HW_TAGS */ -static void __kasan_report(unsigned long addr, size_t size, bool is_write, +static void print_report(struct kasan_access_info *info) +{ + void *tagged_addr = info->access_addr; + void *untagged_addr = kasan_reset_tag(tagged_addr); + u8 tag = get_tag(tagged_addr); + + print_error_description(info); + if (addr_has_metadata(untagged_addr)) + kasan_print_tags(tag, info->first_bad_addr); + pr_err("\n"); + + if (addr_has_metadata(untagged_addr)) { + print_address_description(untagged_addr, tag); + print_memory_metadata(info->first_bad_addr); + } else { + dump_stack_lvl(KERN_ERR); + } +} + +static void __kasan_report(void *addr, size_t size, bool is_write, unsigned long ip) { struct kasan_access_info info; - void *tagged_addr; - void *untagged_addr; unsigned long flags; start_report(&flags, true); - tagged_addr = (void *)addr; - untagged_addr = kasan_reset_tag(tagged_addr); - - info.access_addr = tagged_addr; - if (addr_has_metadata(untagged_addr)) - info.first_bad_addr = - kasan_find_first_bad_addr(tagged_addr, size); + info.access_addr = addr; + if (addr_has_metadata(addr)) + info.first_bad_addr = 
kasan_find_first_bad_addr(addr, size); else - info.first_bad_addr = untagged_addr; + info.first_bad_addr = addr; info.access_size = size; info.is_write = is_write; info.ip = ip; - print_error_description(&info); - if (addr_has_metadata(untagged_addr)) - kasan_print_tags(get_tag(tagged_addr), info.first_bad_addr); - pr_err("\n"); - - if (addr_has_metadata(untagged_addr)) { - print_address_description(untagged_addr, get_tag(tagged_addr)); - print_memory_metadata(info.first_bad_addr); - } else { - dump_stack_lvl(KERN_ERR); - } + print_report(&info); end_report(&flags, addr); } @@ -460,7 +464,7 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, bool ret = false; if (likely(report_enabled())) { - __kasan_report(addr, size, is_write, ip); + __kasan_report((void *)addr, size, is_write, ip); ret = true; } From b91328002d266da3f53703f753b3a2af0dc9b9c1 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:12:46 -0700 Subject: [PATCH 084/113] kasan: simplify kasan_find_first_bad_addr call sites Move the addr_has_metadata() check into kasan_find_first_bad_addr(). Link: https://lkml.kernel.org/r/a49576f7a23283d786ba61579cb0c5057e8f0b9b.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 5 +---- mm/kasan/report_generic.c | 4 ++++ mm/kasan/report_hw_tags.c | 1 + mm/kasan/report_sw_tags.c | 4 ++++ 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index bb4c29b439b1c..a0d4a9d3f9333 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -444,10 +444,7 @@ static void __kasan_report(void *addr, size_t size, bool is_write, start_report(&flags, true); info.access_addr = addr; - if (addr_has_metadata(addr)) - info.first_bad_addr = kasan_find_first_bad_addr(addr, size); - else - info.first_bad_addr = addr; + info.first_bad_addr = kasan_find_first_bad_addr(addr, size); info.access_size = size; info.is_write = is_write; info.ip = ip; diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 7e03cca569a7e..182239ca184cd 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -34,8 +34,12 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) { void *p = addr; + if (!addr_has_metadata(p)) + return p; + while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p))) p += KASAN_GRANULE_SIZE; + return p; } diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c index 5dbbbb930e7a7..f3d3be614e4b0 100644 --- a/mm/kasan/report_hw_tags.c +++ b/mm/kasan/report_hw_tags.c @@ -17,6 +17,7 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) { + /* Return the same value regardless of whether addr_has_metadata(). 
*/ return kasan_reset_tag(addr); } diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c index 7271f0988fc0d..7a26397297edb 100644 --- a/mm/kasan/report_sw_tags.c +++ b/mm/kasan/report_sw_tags.c @@ -36,8 +36,12 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) void *p = kasan_reset_tag(addr); void *end = p + size; + if (!addr_has_metadata(p)) + return p; + while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p)) p += KASAN_GRANULE_SIZE; + return p; } From b3bb1d700e51128e9ff818f76b930403b1ed2c7d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:12:49 -0700 Subject: [PATCH 085/113] kasan: restructure kasan_report Restructure kasan_report() to make reviewing the subsequent patches easier. Link: https://lkml.kernel.org/r/ca28042889858b8cc4724d3d4378387f90d7a59d.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index a0d4a9d3f9333..41c7966451e3c 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -457,15 +457,18 @@ static void __kasan_report(void *addr, size_t size, bool is_write, bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) { - unsigned long flags = user_access_save(); - bool ret = false; + unsigned long ua_flags = user_access_save(); + bool ret = true; - if (likely(report_enabled())) { - __kasan_report((void *)addr, size, is_write, ip); - ret = true; + if (unlikely(!report_enabled())) { + ret = false; + goto out; } - user_access_restore(flags); + __kasan_report((void *)addr, size, is_write, ip); + +out: + user_access_restore(ua_flags); return ret; } From be8631a17620ccf3d4fca74a6d6a218737e5b9cc Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:12:52 -0700 Subject: [PATCH 086/113] kasan: merge __kasan_report into kasan_report Merge __kasan_report() into kasan_report(). The code is simple enough to be readable without the __kasan_report() helper. 
Link: https://lkml.kernel.org/r/c8a125497ef82f7042b3795918dffb81a85a878e.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 41c7966451e3c..56d5ba2355422 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -435,37 +435,31 @@ static void print_report(struct kasan_access_info *info) } } -static void __kasan_report(void *addr, size_t size, bool is_write, - unsigned long ip) -{ - struct kasan_access_info info; - unsigned long flags; - - start_report(&flags, true); - - info.access_addr = addr; - info.first_bad_addr = kasan_find_first_bad_addr(addr, size); - info.access_size = size; - info.is_write = is_write; - info.ip = ip; - - print_report(&info); - - end_report(&flags, addr); -} - bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) { - unsigned long ua_flags = user_access_save(); bool ret = true; + void *ptr = (void *)addr; + unsigned long ua_flags = user_access_save(); + unsigned long irq_flags; + struct kasan_access_info info; if (unlikely(!report_enabled())) { ret = false; goto out; } - __kasan_report((void *)addr, size, is_write, ip); + start_report(&irq_flags, true); + + info.access_addr = ptr; + info.first_bad_addr = kasan_find_first_bad_addr(ptr, size); + info.access_size = size; + info.is_write = is_write; + info.ip = ip; + + print_report(&info); + + end_report(&irq_flags, ptr); out: user_access_restore(ua_flags); From 31c65110b90c76153b41b9b7e22c361ff2fb4525 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:12:55 -0700 Subject: [PATCH 087/113] kasan: call print_report from kasan_report_invalid_free Call print_report() in kasan_report_invalid_free() instead of calling printing functions directly. Compared to the existing implementation of kasan_report_invalid_free(), print_report() makes sure that the buggy address has metadata before printing it. The change requires adding a report type field into kasan_access_info and using it accordingly. kasan_report_async() is left as is, as using print_report() will only complicate the code. 
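As a condensed sketch of the dispatch this introduces (the complete change is in the diff below), the report type is carried in the info struct and checked first when printing the description:

enum kasan_report_type {
	KASAN_REPORT_ACCESS,
	KASAN_REPORT_INVALID_FREE,
};

static void print_error_description(struct kasan_access_info *info)
{
	if (info->type == KASAN_REPORT_INVALID_FREE) {
		pr_err("BUG: KASAN: double-free or invalid-free in %pS\n",
		       (void *)info->ip);
		return;
	}

	/* Access reports keep printing the bug type, size and task as before. */
	pr_err("BUG: KASAN: %s in %pS\n",
	       kasan_get_bug_type(info), (void *)info->ip);
}
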
Link: https://lkml.kernel.org/r/9ea6f0604c5d2e1fb28d93dc6c44232c1f8017fe.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.h | 6 ++++++ mm/kasan/report.c | 42 ++++++++++++++++++++++++++---------------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 40b863e289ec3..8c9a855152c28 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -127,7 +127,13 @@ static inline bool kasan_sync_fault_possible(void) #define META_MEM_BYTES_PER_ROW (META_BYTES_PER_ROW * KASAN_GRANULE_SIZE) #define META_ROWS_AROUND_ADDR 2 +enum kasan_report_type { + KASAN_REPORT_ACCESS, + KASAN_REPORT_INVALID_FREE, +}; + struct kasan_access_info { + enum kasan_report_type type; void *access_addr; void *first_bad_addr; size_t access_size; diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 56d5ba2355422..73348f83b8130 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -86,6 +86,12 @@ __setup("kasan_multi_shot", kasan_set_multi_shot); static void print_error_description(struct kasan_access_info *info) { + if (info->type == KASAN_REPORT_INVALID_FREE) { + pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", + (void *)info->ip); + return; + } + pr_err("BUG: KASAN: %s in %pS\n", kasan_get_bug_type(info), (void *)info->ip); if (info->access_size) @@ -386,22 +392,6 @@ static bool report_enabled(void) return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); } -void kasan_report_invalid_free(void *object, unsigned long ip) -{ - unsigned long flags; - u8 tag = get_tag(object); - - object = kasan_reset_tag(object); - - start_report(&flags, true); - pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); - kasan_print_tags(tag, object); - pr_err("\n"); - print_address_description(object, tag); - print_memory_metadata(object); - end_report(&flags, object); -} - #ifdef CONFIG_KASAN_HW_TAGS void kasan_report_async(void) { @@ -435,6 +425,25 @@ static void print_report(struct kasan_access_info *info) } } +void kasan_report_invalid_free(void *ptr, unsigned long ip) +{ + unsigned long flags; + struct kasan_access_info info; + + start_report(&flags, true); + + info.type = KASAN_REPORT_INVALID_FREE; + info.access_addr = ptr; + info.first_bad_addr = kasan_reset_tag(ptr); + info.access_size = 0; + info.is_write = false; + info.ip = ip; + + print_report(&info); + + end_report(&flags, ptr); +} + bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) { @@ -451,6 +460,7 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, start_report(&irq_flags, true); + info.type = KASAN_REPORT_ACCESS; info.access_addr = ptr; info.first_bad_addr = kasan_find_first_bad_addr(ptr, size); info.access_size = size; From bb2f967ce2130b57b49d21d4cbcd06e67f14a562 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:12:57 -0700 Subject: [PATCH 088/113] kasan: move and simplify kasan_report_async Place kasan_report_async() next to the other main reporting routines. Also simplify printed information. 
Link: https://lkml.kernel.org/r/52d942ef3ffd29bdfa225bbe8e327bc5bda7ab09.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 73348f83b8130..162fd2d6209e7 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -392,20 +392,6 @@ static bool report_enabled(void) return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); } -#ifdef CONFIG_KASAN_HW_TAGS -void kasan_report_async(void) -{ - unsigned long flags; - - start_report(&flags, false); - pr_err("BUG: KASAN: invalid-access\n"); - pr_err("Asynchronous mode enabled: no access details available\n"); - pr_err("\n"); - dump_stack_lvl(KERN_ERR); - end_report(&flags, NULL); -} -#endif /* CONFIG_KASAN_HW_TAGS */ - static void print_report(struct kasan_access_info *info) { void *tagged_addr = info->access_addr; @@ -477,6 +463,20 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, return ret; } +#ifdef CONFIG_KASAN_HW_TAGS +void kasan_report_async(void) +{ + unsigned long flags; + + start_report(&flags, false); + pr_err("BUG: KASAN: invalid-access\n"); + pr_err("Asynchronous fault: no details available\n"); + pr_err("\n"); + dump_stack_lvl(KERN_ERR); + end_report(&flags, NULL); +} +#endif /* CONFIG_KASAN_HW_TAGS */ + #ifdef CONFIG_KASAN_INLINE /* * With CONFIG_KASAN_INLINE, accesses to bogus pointers (outside the high From c965cdd67540ae8a6ac3da05cce4b7da3e0be94c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:13:00 -0700 Subject: [PATCH 089/113] kasan: rename kasan_access_info to kasan_report_info Rename kasan_access_info to kasan_report_info, as the latter name better reflects the struct's purpose. 
Link: https://lkml.kernel.org/r/158a4219a5d356901d017352558c989533a0782c.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.h | 4 ++-- mm/kasan/report.c | 8 ++++---- mm/kasan/report_generic.c | 6 +++--- mm/kasan/report_tags.c | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 8c9a855152c28..9d2e128eb623f 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -132,7 +132,7 @@ enum kasan_report_type { KASAN_REPORT_INVALID_FREE, }; -struct kasan_access_info { +struct kasan_report_info { enum kasan_report_type type; void *access_addr; void *first_bad_addr; @@ -276,7 +276,7 @@ static inline void kasan_print_tags(u8 addr_tag, const void *addr) { } #endif void *kasan_find_first_bad_addr(void *addr, size_t size); -const char *kasan_get_bug_type(struct kasan_access_info *info); +const char *kasan_get_bug_type(struct kasan_report_info *info); void kasan_metadata_fetch_row(char *buffer, void *row); #if defined(CONFIG_KASAN_STACK) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 162fd2d6209e7..7915af8108150 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -84,7 +84,7 @@ static int __init kasan_set_multi_shot(char *str) } __setup("kasan_multi_shot", kasan_set_multi_shot); -static void print_error_description(struct kasan_access_info *info) +static void print_error_description(struct kasan_report_info *info) { if (info->type == KASAN_REPORT_INVALID_FREE) { pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", @@ -392,7 +392,7 @@ static bool report_enabled(void) return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); } -static void print_report(struct kasan_access_info *info) +static void print_report(struct kasan_report_info *info) { void *tagged_addr = info->access_addr; void *untagged_addr = kasan_reset_tag(tagged_addr); @@ -414,7 +414,7 @@ static void print_report(struct kasan_access_info *info) void kasan_report_invalid_free(void *ptr, unsigned long ip) { unsigned long flags; - struct kasan_access_info info; + struct kasan_report_info info; start_report(&flags, true); @@ -437,7 +437,7 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, void *ptr = (void *)addr; unsigned long ua_flags = user_access_save(); unsigned long irq_flags; - struct kasan_access_info info; + struct kasan_report_info info; if (unlikely(!report_enabled())) { ret = false; diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 182239ca184cd..efc5e79a103fa 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -43,7 +43,7 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) return p; } -static const char *get_shadow_bug_type(struct kasan_access_info *info) +static const char *get_shadow_bug_type(struct kasan_report_info *info) { const char *bug_type = "unknown-crash"; u8 *shadow_addr; @@ -95,7 +95,7 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info) return bug_type; } -static const char *get_wild_bug_type(struct kasan_access_info *info) +static const char *get_wild_bug_type(struct kasan_report_info *info) { const char *bug_type = "unknown-crash"; @@ -109,7 +109,7 @@ static const char *get_wild_bug_type(struct kasan_access_info *info) return bug_type; } -const char *kasan_get_bug_type(struct kasan_access_info *info) +const char *kasan_get_bug_type(struct kasan_report_info *info) { /* * If 
access_size is a negative number, then it has reason to be diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 1b41de88c53eb..e25d2166e813d 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -7,7 +7,7 @@ #include "kasan.h" #include "../slab.h" -const char *kasan_get_bug_type(struct kasan_access_info *info) +const char *kasan_get_bug_type(struct kasan_report_info *info) { #ifdef CONFIG_KASAN_TAGS_IDENTIFY struct kasan_alloc_meta *alloc_meta; From 795b760fe741c0b82508947e7b76803a2cdabd13 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:13:03 -0700 Subject: [PATCH 090/113] kasan: add comment about UACCESS regions to kasan_report Add a comment explaining why kasan_report() is the only reporting function that uses user_access_save/restore(). Link: https://lkml.kernel.org/r/1201ca3c2be42c7bd077c53d2e46f4a51dd1476a.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 7915af8108150..08631d8732040 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -430,6 +430,11 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip) end_report(&flags, ptr); } +/* + * kasan_report() is the only reporting function that uses + * user_access_save/restore(): kasan_report_invalid_free() cannot be called + * from a UACCESS region, and kasan_report_async() is not used on x86. + */ bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) { From c068664c97c7cffa9df706e247046aa5c796efc9 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:13:06 -0700 Subject: [PATCH 091/113] kasan: respect KASAN_BIT_REPORTED in all reporting routines Currently, only kasan_report() checks the KASAN_BIT_REPORTED and KASAN_BIT_MULTI_SHOT flags. Make other reporting routines check these flags as well. Also add explanatory comments. Note that the current->kasan_depth check is split out into report_suppressed() and only called for kasan_report(). Link: https://lkml.kernel.org/r/715e346b10b398e29ba1b425299dcd79e29d58ce.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 08631d8732040..ef649f5cee29c 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -381,12 +381,26 @@ static void print_memory_metadata(const void *addr) } } -static bool report_enabled(void) +/* + * Used to suppress reports within kasan_disable/enable_current() critical + * sections, which are used for marking accesses to slab metadata. + */ +static bool report_suppressed(void) { #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) if (current->kasan_depth) - return false; + return true; #endif + return false; +} + +/* + * Used to avoid reporting more than one KASAN bug unless kasan_multi_shot + * is enabled. Note that KASAN tests effectively enable kasan_multi_shot + * for their duration. 
+ */ +static bool report_enabled(void) +{ if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) return true; return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); @@ -416,6 +430,14 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip) unsigned long flags; struct kasan_report_info info; + /* + * Do not check report_suppressed(), as an invalid-free cannot be + * caused by accessing slab metadata and thus should not be + * suppressed by kasan_disable/enable_current() critical sections. + */ + if (unlikely(!report_enabled())) + return; + start_report(&flags, true); info.type = KASAN_REPORT_INVALID_FREE; @@ -444,7 +466,7 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long irq_flags; struct kasan_report_info info; - if (unlikely(!report_enabled())) { + if (unlikely(report_suppressed()) || unlikely(!report_enabled())) { ret = false; goto out; } @@ -473,6 +495,13 @@ void kasan_report_async(void) { unsigned long flags; + /* + * Do not check report_suppressed(), as kasan_disable/enable_current() + * critical sections do not affect Hardware Tag-Based KASAN. + */ + if (unlikely(!report_enabled())) + return; + start_report(&flags, false); pr_err("BUG: KASAN: invalid-access\n"); pr_err("Asynchronous fault: no details available\n"); From 865bfa28eda6893b391cd0c00b99a57cd617d1d6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:13:09 -0700 Subject: [PATCH 092/113] kasan: reorder reporting functions Move print_error_description()'s, report_suppressed()'s, and report_enabled()'s definitions to improve the logical order of function definitions in report.c. No functional changes. Link: https://lkml.kernel.org/r/82aa926c411e00e76e97e645a551ede9ed0c5e79.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 82 +++++++++++++++++++++++------------------------ 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ef649f5cee29c..7ef3b04556036 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -84,24 +84,29 @@ static int __init kasan_set_multi_shot(char *str) } __setup("kasan_multi_shot", kasan_set_multi_shot); -static void print_error_description(struct kasan_report_info *info) +/* + * Used to suppress reports within kasan_disable/enable_current() critical + * sections, which are used for marking accesses to slab metadata. + */ +static bool report_suppressed(void) { - if (info->type == KASAN_REPORT_INVALID_FREE) { - pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", - (void *)info->ip); - return; - } +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + if (current->kasan_depth) + return true; +#endif + return false; +} - pr_err("BUG: KASAN: %s in %pS\n", - kasan_get_bug_type(info), (void *)info->ip); - if (info->access_size) - pr_err("%s of size %zu at addr %px by task %s/%d\n", - info->is_write ? "Write" : "Read", info->access_size, - info->access_addr, current->comm, task_pid_nr(current)); - else - pr_err("%s at addr %px by task %s/%d\n", - info->is_write ? "Write" : "Read", - info->access_addr, current->comm, task_pid_nr(current)); +/* + * Used to avoid reporting more than one KASAN bug unless kasan_multi_shot + * is enabled. Note that KASAN tests effectively enable kasan_multi_shot + * for their duration. 
+ */ +static bool report_enabled(void) +{ + if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) + return true; + return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); } #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) @@ -160,6 +165,26 @@ static void end_report(unsigned long *flags, void *addr) kasan_enable_current(); } +static void print_error_description(struct kasan_report_info *info) +{ + if (info->type == KASAN_REPORT_INVALID_FREE) { + pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", + (void *)info->ip); + return; + } + + pr_err("BUG: KASAN: %s in %pS\n", + kasan_get_bug_type(info), (void *)info->ip); + if (info->access_size) + pr_err("%s of size %zu at addr %px by task %s/%d\n", + info->is_write ? "Write" : "Read", info->access_size, + info->access_addr, current->comm, task_pid_nr(current)); + else + pr_err("%s at addr %px by task %s/%d\n", + info->is_write ? "Write" : "Read", + info->access_addr, current->comm, task_pid_nr(current)); +} + static void print_track(struct kasan_track *track, const char *prefix) { pr_err("%s by task %u:\n", prefix, track->pid); @@ -381,31 +406,6 @@ static void print_memory_metadata(const void *addr) } } -/* - * Used to suppress reports within kasan_disable/enable_current() critical - * sections, which are used for marking accesses to slab metadata. - */ -static bool report_suppressed(void) -{ -#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) - if (current->kasan_depth) - return true; -#endif - return false; -} - -/* - * Used to avoid reporting more than one KASAN bug unless kasan_multi_shot - * is enabled. Note that KASAN tests effectively enable kasan_multi_shot - * for their duration. - */ -static bool report_enabled(void) -{ - if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) - return true; - return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); -} - static void print_report(struct kasan_report_info *info) { void *tagged_addr = info->access_addr; From 80207910cd71b4e0e87140d165d82b5d3ff69e53 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:13:12 -0700 Subject: [PATCH 093/113] kasan: move and hide kasan_save_enable/restore_multi_shot - Move kasan_save_enable/restore_multi_shot() declarations to mm/kasan/kasan.h, as there is no need for them to be visible outside of KASAN implementation. - Only define and export these functions when KASAN tests are enabled. - Move their definitions closer to other test-related code in report.c. 
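As background for why only the test suites need these helpers: KASAN normally prints just one report (the KASAN_BIT_REPORTED gate), and the tests temporarily switch on multi-shot mode so that every expected report is actually printed. The gating can be modeled in userspace with C11 atomics as below; the bit numbers and helpers mirror the kernel ones in spirit but are simplified stand-ins, not the real implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Userspace model of the reporting flags; bit numbers are illustrative. */
enum { BIT_REPORTED = 0, BIT_MULTI_SHOT = 1 };
static atomic_ulong flags;

static bool test_and_set_bit(int bit)
{
	unsigned long mask = 1UL << bit;

	return atomic_fetch_or(&flags, mask) & mask;
}

static bool test_bit(int bit)
{
	return atomic_load(&flags) & (1UL << bit);
}

static void clear_bit(int bit)
{
	atomic_fetch_and(&flags, ~(1UL << bit));
}

/* Allow a report if multi-shot is on, or if this is the first one. */
static bool report_enabled(void)
{
	if (test_bit(BIT_MULTI_SHOT))
		return true;
	return !test_and_set_bit(BIT_REPORTED);
}

/* Test helpers: force multi-shot on, remembering the previous state. */
static bool save_enable_multi_shot(void)
{
	return test_and_set_bit(BIT_MULTI_SHOT);
}

static void restore_multi_shot(bool was_enabled)
{
	if (!was_enabled)
		clear_bit(BIT_MULTI_SHOT);
}

int main(void)
{
	bool old = save_enable_multi_shot();

	printf("first report allowed:  %d\n", report_enabled());
	printf("second report allowed: %d\n", report_enabled());
	restore_multi_shot(old);
	return 0;
}

A test brackets its expected-failure section with the save/restore pair, which is why the real helpers only need to exist when CONFIG_KASAN_KUNIT_TEST or CONFIG_KASAN_MODULE_TEST is enabled.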
Link: https://lkml.kernel.org/r/6ba637333b78447f027d775f2d55ab1a40f63c99.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 4 ---- mm/kasan/kasan.h | 7 +++++++ mm/kasan/report.c | 30 +++++++++++++++++------------- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index fe36215807f7a..ceebcb9de7bf8 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -267,10 +267,6 @@ static __always_inline bool kasan_check_byte(const void *addr) return true; } - -bool kasan_save_enable_multi_shot(void); -void kasan_restore_multi_shot(bool enabled); - #else /* CONFIG_KASAN */ static inline slab_flags_t kasan_never_merge(void) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 9d2e128eb623f..d79b83d673b14 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -492,6 +492,13 @@ static inline bool kasan_arch_is_ready(void) { return true; } #error kasan_arch_is_ready only works in KASAN generic outline mode! #endif +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST) + +bool kasan_save_enable_multi_shot(void); +void kasan_restore_multi_shot(bool enabled); + +#endif + /* * Exported functions for interfaces called from assembly or from generated * code. Declarations here to avoid warning about missing declarations. diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 7ef3b04556036..c9bfffe931b46 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -64,19 +64,6 @@ static int __init early_kasan_fault(char *arg) } early_param("kasan.fault", early_kasan_fault); -bool kasan_save_enable_multi_shot(void) -{ - return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); -} -EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot); - -void kasan_restore_multi_shot(bool enabled) -{ - if (!enabled) - clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); -} -EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); - static int __init kasan_set_multi_shot(char *str) { set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); @@ -109,6 +96,23 @@ static bool report_enabled(void) return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); } +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST) + +bool kasan_save_enable_multi_shot(void) +{ + return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); +} +EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot); + +void kasan_restore_multi_shot(bool enabled) +{ + if (!enabled) + clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); +} +EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); + +#endif + #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) static void update_kunit_status(bool sync) { From c32caa267b927b744610f4214bfde7ce7d55df1c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 24 Mar 2022 18:13:15 -0700 Subject: [PATCH 094/113] kasan: disable LOCKDEP when printing reports If LOCKDEP detects a bug while KASAN is printing a report and if panic_on_warn is set, KASAN will not be able to finish. Disable LOCKDEP while KASAN is printing a report. See https://bugzilla.kernel.org/show_bug.cgi?id=202115 for an example of the issue. 
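The change can be pictured as tightening the bracket around report printing: before anything is printed, suppress whatever could interrupt or recurse into the reporting path (LOCKDEP, KASAN itself), and only re-enable it after the taint and panic decisions have been made. The userspace sketch below models just that ordering with a mutex and per-thread counters; the names are made up and this is not the kernel's start_report()/end_report().

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t report_lock = PTHREAD_MUTEX_INITIALIZER;
static __thread int lockdep_disabled;	/* models lockdep_off()/lockdep_on() */
static __thread int checker_disabled;	/* models kasan_disable_current() */

static void start_report(void)
{
	lockdep_disabled++;	/* no lock-checker splats while we report */
	checker_disabled++;	/* make sure we don't recurse into ourselves */
	pthread_mutex_lock(&report_lock);
	puts("==================================================================");
}

static void end_report(void)
{
	puts("==================================================================");
	pthread_mutex_unlock(&report_lock);
	/* taint/panic decisions would go here, before re-enabling anything */
	lockdep_disabled--;
	checker_disabled--;
}

int main(void)
{
	start_report();
	puts("BUG: example report body");
	end_report();
	return 0;
}

The point of the ordering is simply that nothing between the two banners can trigger a competing LOCKDEP report that would keep KASAN from finishing when panic_on_warn is set.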
Link: https://lkml.kernel.org/r/c48a2a3288200b07e1788b77365c2f02784cfeb4.1646237226.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index c9bfffe931b46..199d77cce21a3 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -148,6 +149,8 @@ static void start_report(unsigned long *flags, bool sync) disable_trace_on_warning(); /* Update status of the currently running KASAN test. */ update_kunit_status(sync); + /* Do not allow LOCKDEP mangling KASAN reports. */ + lockdep_off(); /* Make sure we don't end up in loop. */ kasan_disable_current(); spin_lock_irqsave(&report_lock, *flags); @@ -160,12 +163,13 @@ static void end_report(unsigned long *flags, void *addr) trace_error_report_end(ERROR_DETECTOR_KASAN, (unsigned long)addr); pr_err("==================================================================\n"); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) panic("panic_on_warn set ...\n"); if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC) panic("kasan.fault=panic set ...\n"); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + lockdep_on(); kasan_enable_current(); } From 90e7e7f5ef3f712da992d505aee2d921797c3f96 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 24 Mar 2022 18:13:18 -0700 Subject: [PATCH 095/113] mm: enable MADV_DONTNEED for hugetlb mappings Patch series "Add hugetlb MADV_DONTNEED support", v3. Userfaultfd selftests for hugetlb does not perform UFFD_EVENT_REMAP testing. However, mremap support was recently added in commit 550a7d60bd5e ("mm, hugepages: add mremap() support for hugepage backed vma"). While attempting to enable mremap support in the test, it was discovered that the mremap test indirectly depends on MADV_DONTNEED. madvise does not allow MADV_DONTNEED for hugetlb mappings. However, that is primarily due to the check in can_madv_lru_vma(). By simply removing the check and adding huge page alignment, MADV_DONTNEED can be made to work for hugetlb mappings. Do note that there is no compelling use case for adding this support. This was discussed in the RFC [1]. However, adding support makes sense as it is fairly trivial and brings hugetlb functionality more in line with 'normal' memory. After enabling support, add selftest for MADV_DONTNEED as well as MADV_REMOVE. Then update userfaultfd selftest. If new functionality is accepted, then madvise man page will be updated to indicate hugetlb is supported. It will also be updated to clarify what happens to the passed length argument. This patch (of 3): MADV_DONTNEED is currently disabled for hugetlb mappings. This certainly makes sense in shared file mappings as the pagecache maintains a reference to the page and it will never be freed. However, it could be useful to unmap and free pages in private mappings. In addition, userfaultfd minor fault users may be able to simplify code by using MADV_DONTNEED. The primary thing preventing MADV_DONTNEED from working on hugetlb mappings is a check in can_madv_lru_vma(). 
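Since the paragraphs above and below describe the new user-visible behavior, a minimal usage example may help. It assumes a 2 MiB default huge page size and enough pre-allocated free huge pages (for example via vm.nr_hugepages); on kernels without this series the madvise() call fails with EINVAL instead of releasing the pages.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define HUGE_SZ (2UL * 1024 * 1024)	/* assumed default huge page size */
#define NR_HUGE 4UL

int main(void)
{
	size_t len = NR_HUGE * HUGE_SZ;
	char *p;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");	/* needs free huge pages */
		return 1;
	}
	memset(p, 0x5a, len);			/* fault in NR_HUGE huge pages */

	/* mmap returns a huge-page aligned address, as MADV_DONTNEED requires */
	if (madvise(p, len, MADV_DONTNEED)) {
		perror("madvise(MADV_DONTNEED)");
		return 1;
	}
	printf("released %lu huge pages back to the hugetlb pool\n", NR_HUGE);

	munmap(p, len);
	return 0;
}

Shared hugetlb file mappings behave differently: the page cache keeps its reference, so MADV_DONTNEED does not free those pages, which is exactly what the selftest added later in this series checks.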
To allow support for hugetlb mappings create and use a new routine madvise_dontneed_free_valid_vma() that allows hugetlb mappings in this specific case. For normal mappings, madvise requires the start address be PAGE aligned and rounds up length to the next multiple of PAGE_SIZE. Do similarly for hugetlb mappings: require start address be huge page size aligned and round up length to the next multiple of huge page size. Use the new madvise_dontneed_free_valid_vma routine to check alignment and round up length/end. zap_page_range requires this alignment for hugetlb vmas otherwise we will hit BUGs. Link: https://lkml.kernel.org/r/20220215002348.128823-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20220215002348.128823-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Cc: Naoya Horiguchi Cc: David Hildenbrand Cc: Axel Rasmussen Cc: Mina Almasry Cc: Michal Hocko Cc: Peter Xu Cc: Andrea Arcangeli Cc: Shuah Khan Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 39b712fd83000..2e6213664c523 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -502,9 +502,14 @@ static void madvise_cold_page_range(struct mmu_gather *tlb, tlb_end_vma(tlb, vma); } +static inline bool can_madv_lru_non_huge_vma(struct vm_area_struct *vma) +{ + return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP)); +} + static inline bool can_madv_lru_vma(struct vm_area_struct *vma) { - return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); + return can_madv_lru_non_huge_vma(vma) && !is_vm_hugetlb_page(vma); } static long madvise_cold(struct vm_area_struct *vma, @@ -777,6 +782,23 @@ static long madvise_dontneed_single_vma(struct vm_area_struct *vma, return 0; } +static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, + unsigned long start, + unsigned long *end, + int behavior) +{ + if (!is_vm_hugetlb_page(vma)) + return can_madv_lru_non_huge_vma(vma); + + if (behavior != MADV_DONTNEED) + return false; + if (start & ~huge_page_mask(hstate_vma(vma))) + return false; + + *end = ALIGN(*end, huge_page_size(hstate_vma(vma))); + return true; +} + static long madvise_dontneed_free(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, @@ -785,7 +807,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; *prev = vma; - if (!can_madv_lru_vma(vma)) + if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return -EINVAL; if (!userfaultfd_remove(vma, start, end)) { @@ -807,7 +829,12 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, */ return -ENOMEM; } - if (!can_madv_lru_vma(vma)) + /* + * Potential end adjustment for hugetlb vma is OK as + * the check below keeps end within vma. + */ + if (!madvise_dontneed_free_valid_vma(vma, start, &end, + behavior)) return -EINVAL; if (end > vma->vm_end) { /* From c4b6cb884011a5063722c15d14789d1ac5aad28a Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 24 Mar 2022 18:13:21 -0700 Subject: [PATCH 096/113] selftests/vm: add hugetlb madvise MADV_DONTNEED MADV_REMOVE test Now that MADV_DONTNEED support for hugetlb is enabled, add corresponding tests. MADV_REMOVE has been enabled for some time, but no tests exist so add them as well. 
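The selftest that follows starts by probing the address and length rules introduced by the previous patch. Stripped of the VMA plumbing, the rule enforced by madvise_dontneed_free_valid_vma() is: the start address must be huge-page aligned, and the end of the range is rounded up to the next huge page boundary. A userspace restatement of just that rule (the helper name and parameters are made up for illustration):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative restatement of the hugetlb MADV_DONTNEED range check. */
static bool hugetlb_range_ok(unsigned long start, unsigned long *end,
			     unsigned long huge_sz)
{
	if (start & (huge_sz - 1))
		return false;				/* kernel returns -EINVAL */
	*end = (*end + huge_sz - 1) & ~(huge_sz - 1);	/* ALIGN(end, huge_sz) */
	return true;
}

int main(void)
{
	unsigned long huge_sz = 2UL << 20;		/* assume 2 MiB huge pages */
	unsigned long start = 0x40000000UL;		/* huge-page aligned */
	unsigned long end = start + 3 * huge_sz + 4096;	/* partial last page */

	if (hugetlb_range_ok(start, &end, huge_sz))
		printf("end rounded up, range now spans %lu huge pages\n",
		       (end - start) / huge_sz);
	if (!hugetlb_range_ok(start + 4096, &end, huge_sz))
		printf("unaligned start rejected\n");
	return 0;
}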
Link: https://lkml.kernel.org/r/20220215002348.128823-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Shuah Khan Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Michal Hocko Cc: Mike Rapoport Cc: Mina Almasry Cc: Naoya Horiguchi Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/hugetlb-madvise.c | 410 +++++++++++++++++++ tools/testing/selftests/vm/run_vmtests.sh | 12 + 4 files changed, 424 insertions(+) create mode 100644 tools/testing/selftests/vm/hugetlb-madvise.c diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 3b5faec3c04f4..d7507f3c7c76a 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -3,6 +3,7 @@ hugepage-mmap hugepage-mremap hugepage-shm hugepage-vmemmap +hugetlb-madvise khugepaged map_hugetlb map_populate diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index fbf390b51c2f4..04a49e876a46c 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -30,6 +30,7 @@ LDLIBS = -lrt -lpthread TEST_GEN_FILES = compaction_test TEST_GEN_FILES += gup_test TEST_GEN_FILES += hmm-tests +TEST_GEN_FILES += hugetlb-madvise TEST_GEN_FILES += hugepage-mmap TEST_GEN_FILES += hugepage-mremap TEST_GEN_FILES += hugepage-shm diff --git a/tools/testing/selftests/vm/hugetlb-madvise.c b/tools/testing/selftests/vm/hugetlb-madvise.c new file mode 100644 index 0000000000000..6c6af40f57478 --- /dev/null +++ b/tools/testing/selftests/vm/hugetlb-madvise.c @@ -0,0 +1,410 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * hugepage-madvise: + * + * Basic functional testing of madvise MADV_DONTNEED and MADV_REMOVE + * on hugetlb mappings. + * + * Before running this test, make sure the administrator has pre-allocated + * at least MIN_FREE_PAGES hugetlb pages and they are free. In addition, + * the test takes an argument that is the path to a file in a hugetlbfs + * filesystem. Therefore, a hugetlbfs filesystem must be mounted on some + * directory. 
+ */ + +#include +#include +#include +#include +#define __USE_GNU +#include + +#define USAGE "USAGE: %s \n" +#define MIN_FREE_PAGES 20 +#define NR_HUGE_PAGES 10 /* common number of pages to map/allocate */ + +#define validate_free_pages(exp_free) \ + do { \ + int fhp = get_free_hugepages(); \ + if (fhp != (exp_free)) { \ + printf("Unexpected number of free huge " \ + "pages line %d\n", __LINE__); \ + exit(1); \ + } \ + } while (0) + +unsigned long huge_page_size; +unsigned long base_page_size; + +/* + * default_huge_page_size copied from mlock2-tests.c + */ +unsigned long default_huge_page_size(void) +{ + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return 0; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + + free(line); + fclose(f); + return hps; +} + +unsigned long get_free_hugepages(void) +{ + unsigned long fhp = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return fhp; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "HugePages_Free: %lu", &fhp) == 1) + break; + } + + free(line); + fclose(f); + return fhp; +} + +void write_fault_pages(void *addr, unsigned long nr_pages) +{ + unsigned long i; + + for (i = 0; i < nr_pages; i++) + *((unsigned long *)(addr + (i * huge_page_size))) = i; +} + +void read_fault_pages(void *addr, unsigned long nr_pages) +{ + unsigned long i, tmp; + + for (i = 0; i < nr_pages; i++) + tmp += *((unsigned long *)(addr + (i * huge_page_size))); +} + +int main(int argc, char **argv) +{ + unsigned long free_hugepages; + void *addr, *addr2; + int fd; + int ret; + + if (argc != 2) { + printf(USAGE, argv[0]); + exit(1); + } + + huge_page_size = default_huge_page_size(); + if (!huge_page_size) { + printf("Unable to determine huge page size, exiting!\n"); + exit(1); + } + base_page_size = sysconf(_SC_PAGE_SIZE); + if (!huge_page_size) { + printf("Unable to determine base page size, exiting!\n"); + exit(1); + } + + free_hugepages = get_free_hugepages(); + if (free_hugepages < MIN_FREE_PAGES) { + printf("Not enough free huge pages to test, exiting!\n"); + exit(1); + } + + fd = open(argv[1], O_CREAT | O_RDWR, 0755); + if (fd < 0) { + perror("Open failed"); + exit(1); + } + + /* + * Test validity of MADV_DONTNEED addr and length arguments. mmap + * size is NR_HUGE_PAGES + 2. One page at the beginning and end of + * the mapping will be unmapped so we KNOW there is nothing mapped + * there. 
+ */ + addr = mmap(NULL, (NR_HUGE_PAGES + 2) * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + if (munmap(addr, huge_page_size) || + munmap(addr + (NR_HUGE_PAGES + 1) * huge_page_size, + huge_page_size)) { + perror("munmap"); + exit(1); + } + addr = addr + huge_page_size; + + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* addr before mapping should fail */ + ret = madvise(addr - base_page_size, NR_HUGE_PAGES * huge_page_size, + MADV_DONTNEED); + if (!ret) { + printf("Unexpected success of madvise call with invalid addr line %d\n", + __LINE__); + exit(1); + } + + /* addr + length after mapping should fail */ + ret = madvise(addr, (NR_HUGE_PAGES * huge_page_size) + base_page_size, + MADV_DONTNEED); + if (!ret) { + printf("Unexpected success of madvise call with invalid length line %d\n", + __LINE__); + exit(1); + } + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test alignment of MADV_DONTNEED addr and length arguments + */ + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* addr is not huge page size aligned and should fail */ + ret = madvise(addr + base_page_size, + NR_HUGE_PAGES * huge_page_size - base_page_size, + MADV_DONTNEED); + if (!ret) { + printf("Unexpected success of madvise call with unaligned start address %d\n", + __LINE__); + exit(1); + } + + /* addr + length should be aligned up to huge page size */ + if (madvise(addr, + ((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size, + MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + + /* should free all pages in mapping */ + validate_free_pages(free_hugepages); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test MADV_DONTNEED on anonymous private mapping + */ + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + + /* should free all pages in mapping */ + validate_free_pages(free_hugepages); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test MADV_DONTNEED on private mapping of hugetlb file + */ + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* read should not consume any pages */ + read_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* madvise should not free any pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* writes should allocate private pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* madvise should free 
private pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* writes should allocate private pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* + * The fallocate below certainly should free the pages associated + * with the file. However, pages in the private mapping are also + * freed. This is not the 'correct' behavior, but is expected + * because this is how it has worked since the initial hugetlb + * implementation. + */ + if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test MADV_DONTNEED on shared mapping of hugetlb file + */ + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* write should not consume any pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* madvise should not free any pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* + * Test MADV_REMOVE on shared mapping of hugetlb file + * + * madvise is same as hole punch and should free all pages. + */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages); + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test MADV_REMOVE on shared and private mapping of hugetlb file + */ + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* shared write should not consume any additional pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr2 = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (addr2 == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* private read should not consume any pages */ + read_fault_pages(addr2, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* private write should consume additional pages */ + write_fault_pages(addr2, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* madvise of shared mapping should not free any pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* madvise of private mapping should free private pages */ + if (madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* private write should consume additional pages again */ + write_fault_pages(addr2, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * 
NR_HUGE_PAGES)); + + /* + * madvise should free both file and private pages although this is + * not correct. private pages should not be freed, but this is + * expected. See comment associated with FALLOC_FL_PUNCH_HOLE call. + */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + (void)munmap(addr2, NR_HUGE_PAGES * huge_page_size); + + close(fd); + unlink(argv[1]); + return 0; +} diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index e10d50e0b8e83..1948098f431d1 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -131,6 +131,18 @@ else echo "[PASS]" fi +echo "-----------------------" +echo "running hugetlb-madvise" +echo "-----------------------" +./hugetlb-madvise $mnt/madvise-test +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi +rm -f $mnt/madvise-test + echo "NOTE: The above hugetlb tests provide minimal coverage. Use" echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" echo " hugetlb regression testing." From 9ae8f2b849f7991cb88ba20c39cb488d0a4f7916 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 24 Mar 2022 18:13:24 -0700 Subject: [PATCH 097/113] userfaultfd/selftests: enable hugetlb remap and remove event testing With MADV_DONTNEED support added to hugetlb mappings, mremap testing can also be enabled for hugetlb. Modify the tests to use madvise MADV_DONTNEED and MADV_REMOVE instead of fallocate hole puch for releasing hugetlb pages. Link: https://lkml.kernel.org/r/20220215002348.128823-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Axel Rasmussen Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Michal Hocko Cc: Mike Rapoport Cc: Mina Almasry Cc: Naoya Horiguchi Cc: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/run_vmtests.sh | 3 +- tools/testing/selftests/vm/userfaultfd.c | 69 ++++++++++++----------- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 1948098f431d1..3b265f140c25c 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -208,14 +208,13 @@ echo "running userfaultfd_hugetlb" echo "---------------------------" # Test requires source and destination huge pages. Size of source # (half_ufd_size_MB) is passed as argument to test. -./userfaultfd hugetlb $half_ufd_size_MB 32 $mnt/ufd_test_file +./userfaultfd hugetlb $half_ufd_size_MB 32 if [ $? 
-ne 0 ]; then echo "[FAIL]" exitcode=1 else echo "[PASS]" fi -rm -f $mnt/ufd_test_file echo "-------------------------" echo "running userfaultfd_shmem" diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 39403b8931deb..92a4516f8f0d2 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -89,7 +89,6 @@ static bool test_uffdio_minor = false; static bool map_shared; static int shm_fd; static int huge_fd; -static char *huge_fd_off0; static unsigned long long *count_verify; static int uffd = -1; static int uffd_flags, finished, *pipefd; @@ -128,9 +127,9 @@ const char *examples = "./userfaultfd anon 100 99999\n\n" "# Run share memory test on 1GiB region with 99 bounces:\n" "./userfaultfd shmem 1000 99\n\n" - "# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n" - "./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n\n" - "# Run the same hugetlb test but using shmem:\n" + "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" + "./userfaultfd hugetlb 256 50\n\n" + "# Run the same hugetlb test but using shared file:\n" "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n" "# 10MiB-~6GiB 999 bounces anonymous test, " "continue forever unless an error triggers\n" @@ -227,10 +226,13 @@ static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) static void hugetlb_release_pages(char *rel_area) { - if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - rel_area == huge_fd_off0 ? 0 : nr_pages * page_size, - nr_pages * page_size)) - err("fallocate() failed"); + if (!map_shared) { + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); + } else { + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) + err("madvise(MADV_REMOVE) failed"); + } } static void hugetlb_allocate_area(void **alloc_area) @@ -238,26 +240,37 @@ static void hugetlb_allocate_area(void **alloc_area) void *area_alias = NULL; char **alloc_area_alias; - *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - (map_shared ? MAP_SHARED : MAP_PRIVATE) | - MAP_HUGETLB | - (*alloc_area == area_src ? 0 : MAP_NORESERVE), - huge_fd, *alloc_area == area_src ? 0 : - nr_pages * page_size); + if (!map_shared) + *alloc_area = mmap(NULL, + nr_pages * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | + (*alloc_area == area_src ? 0 : MAP_NORESERVE), + -1, + 0); + else + *alloc_area = mmap(NULL, + nr_pages * page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED | + (*alloc_area == area_src ? 0 : MAP_NORESERVE), + huge_fd, + *alloc_area == area_src ? 0 : nr_pages * page_size); if (*alloc_area == MAP_FAILED) err("mmap of hugetlbfs file failed"); if (map_shared) { - area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_HUGETLB, - huge_fd, *alloc_area == area_src ? 0 : - nr_pages * page_size); + area_alias = mmap(NULL, + nr_pages * page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + huge_fd, + *alloc_area == area_src ? 
0 : nr_pages * page_size); if (area_alias == MAP_FAILED) err("mmap of hugetlb file alias failed"); } if (*alloc_area == area_src) { - huge_fd_off0 = *alloc_area; alloc_area_alias = &area_src_alias; } else { alloc_area_alias = &area_dst_alias; @@ -270,12 +283,7 @@ static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset { if (!map_shared) return; - /* - * We can't zap just the pagetable with hugetlbfs because - * MADV_DONTEED won't work. So exercise -EEXIST on a alias - * mapping where the pagetables are not established initially, - * this way we'll exercise the -EEXEC at the fs level. - */ + *start = (unsigned long) area_dst_alias + offset; } @@ -428,7 +436,6 @@ static void uffd_test_ctx_clear(void) uffd = -1; } - huge_fd_off0 = NULL; munmap_area((void **)&area_src); munmap_area((void **)&area_src_alias); munmap_area((void **)&area_dst); @@ -926,10 +933,7 @@ static int faulting_process(int signal_test) struct sigaction act; unsigned long signalled = 0; - if (test_type != TEST_HUGETLB) - split_nr_pages = (nr_pages + 1) / 2; - else - split_nr_pages = nr_pages; + split_nr_pages = (nr_pages + 1) / 2; if (signal_test) { sigbuf = &jbuf; @@ -986,9 +990,6 @@ static int faulting_process(int signal_test) if (signal_test) return signalled != split_nr_pages; - if (test_type == TEST_HUGETLB) - return 0; - area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, area_src); if (area_dst == MAP_FAILED) @@ -1676,7 +1677,7 @@ int main(int argc, char **argv) } nr_pages = nr_pages_per_cpu * nr_cpus; - if (test_type == TEST_HUGETLB) { + if (test_type == TEST_HUGETLB && map_shared) { if (argc < 5) usage(); huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755); From 562beb7235abfebdd8366e0664a5c3d1e597b990 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 24 Mar 2022 18:13:27 -0700 Subject: [PATCH 098/113] mm/huge_memory: make is_transparent_hugepage() static It's only used inside the huge_memory.c now. Don't export it and make it static. We can thus reduce the size of huge_memory.o a bit. 
Without this patch: text data bss dec hex filename 32319 2965 4 35288 89d8 mm/huge_memory.o With this patch: text data bss dec hex filename 32042 2957 4 35003 88bb mm/huge_memory.o Link: https://lkml.kernel.org/r/20220302082145.12028-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Reviewed-by: Yang Shi Cc: Matthew Wilcox Cc: William Kucharski Cc: Hugh Dickins Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 6 ------ mm/huge_memory.c | 3 +-- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 0734aff8fa193..2999190adc220 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -183,7 +183,6 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, void prep_transhuge_page(struct page *page); void free_transhuge_page(struct page *page); -bool is_transparent_hugepage(struct page *page); bool can_split_folio(struct folio *folio, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); @@ -341,11 +340,6 @@ static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, static inline void prep_transhuge_page(struct page *page) {} -static inline bool is_transparent_hugepage(struct page *page) -{ - return false; -} - #define transparent_hugepage_flags 0UL #define thp_get_unmapped_area NULL diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 035aa204ab8d5..057b13bde82da 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -533,7 +533,7 @@ void prep_transhuge_page(struct page *page) set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } -bool is_transparent_hugepage(struct page *page) +static inline bool is_transparent_hugepage(struct page *page) { if (!PageCompound(page)) return false; @@ -542,7 +542,6 @@ bool is_transparent_hugepage(struct page *page) return is_huge_zero_page(page) || page[1].compound_dtor == TRANSHUGE_PAGE_DTOR; } -EXPORT_SYMBOL_GPL(is_transparent_hugepage); static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, From 53a05ad9f21d858d24f76d12b3e990405f2036d1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Mar 2022 18:13:30 -0700 Subject: [PATCH 099/113] mm: optimize do_wp_page() for exclusive pages in the swapcache Patch series "mm: COW fixes part 1: fix the COW security issue for THP and swap", v3. This series attempts to optimize and streamline the COW logic for ordinary anon pages and THP anon pages, fixing two remaining instances of CVE-2020-29374 in do_swap_page() and do_huge_pmd_wp_page(): information can leak from a parent process to a child process via anonymous pages shared during fork(). This issue, including other related COW issues, has been summarized in [2]: "1. Observing Memory Modifications of Private Pages From A Child Process Long story short: process-private memory might not be as private as you think once you fork(): successive modifications of private memory regions in the parent process can still be observed by the child process, for example, by smart use of vmsplice()+munmap(). The core problem is that pinning pages readable in a child process, such as done via the vmsplice system call, can result in a child process observing memory modifications done in the parent process the child is not supposed to observe. [1] contains an excellent summary and [2] contains further details. This issue was assigned CVE-2020-29374 [9]. 
For this to trigger, it's required to use a fork() without subsequent exec(), for example, as used under Android zygote. Without further details about an application that forks less-privileged child processes, one cannot really say what's actually affected and what's not -- see the details section the end of this mail for a short sshd/openssh analysis. While commit 17839856fd58 ("gup: document and work around "COW can break either way" issue") fixed this issue and resulted in other problems (e.g., ptrace on pmem), commit 09854ba94c6a ("mm: do_wp_page() simplification") re-introduced part of the problem unfortunately. The original reproducer can be modified quite easily to use THP [3] and make the issue appear again on upstream kernels. I modified it to use hugetlb [4] and it triggers as well. The problem is certainly less severe with hugetlb than with THP; it merely highlights that we still have plenty of open holes we should be closing/fixing. Regarding vmsplice(), the only known workaround is to disallow the vmsplice() system call ... or disable THP and hugetlb. But who knows what else is affected (RDMA? O_DIRECT?) to achieve the same goal -- in the end, it's a more generic issue" This security issue was first reported by Jann Horn on 27 May 2020 and it currently affects anonymous pages during swapin, anonymous THP and hugetlb. This series tackles anonymous pages during swapin and anonymous THP: - do_swap_page() for handling COW on PTEs during swapin directly - do_huge_pmd_wp_page() for handling COW on PMD-mapped THP during write faults With this series, we'll apply the same COW logic we have in do_wp_page() to all swappable anon pages: don't reuse (map writable) the page in case there are additional references (page_count() != 1). All users of reuse_swap_page() are remove, and consequently reuse_swap_page() is removed. In general, we're struggling with the following COW-related issues: (1) "missed COW": we miss to copy on write and reuse the page (map it writable) although we must copy because there are pending references from another process to this page. The result is a security issue. (2) "wrong COW": we copy on write although we wouldn't have to and shouldn't: if there are valid GUP references, they will become out of sync with the pages mapped into the page table. We fail to detect that such a page can be reused safely, especially if never more than a single process mapped the page. The result is an intra process memory corruption. (3) "unnecessary COW": we copy on write although we wouldn't have to: performance degradation and temporary increases swap+memory consumption can be the result. While this series fixes (1) for swappable anon pages, it tries to reduce reported cases of (3) first as good and easy as possible to limit the impact when streamlining. The individual patches try to describe in which cases we will run into (3). This series certainly makes (2) worse for THP, because a THP will now get PTE-mapped on write faults if there are additional references, even if there was only ever a single process involved: once PTE-mapped, we'll copy each and every subpage and won't reuse any subpage as long as the underlying compound page wasn't split. I'm working on an approach to fix (2) and improve (3): PageAnonExclusive to mark anon pages that are exclusive to a single process, allow GUP pins only on such exclusive pages, and allow turning exclusive pages shared (clearing PageAnonExclusive) only if there are no GUP pins. 
Anon pages with PageAnonExclusive set never have to be copied during write faults, but eventually during fork() if they cannot be turned shared. The improved reuse logic in this series will essentially also be the logic to reset PageAnonExclusive. This work will certainly take a while, but I'm planning on sharing details before having code fully ready. #1-#5 can be applied independently of the rest. #6-#9 are mostly only cleanups related to reuse_swap_page(). Notes: * For now, I'll leave hugetlb code untouched: "unnecessary COW" might easily break existing setups because hugetlb pages are a scarce resource and we could just end up having to crash the application when we run out of hugetlb pages. We have to be very careful and the security aspect with hugetlb is most certainly less relevant than for unprivileged anon pages. * Instead of lru_add_drain() we might actually just drain the lru_add list or even just remove the single page of interest from the lru_add list. This would require a new helper function, and could be added if the conditional lru_add_drain() turn out to be a problem. * I extended the test case already included in [1] to also test for the newly found do_swap_page() case. I'll send that out separately once/if this part was merged. [1] https://lkml.kernel.org/r/20211217113049.23850-1-david@redhat.com [2] https://lore.kernel.org/r/3ae33b08-d9ef-f846-56fb-645e3b9b4c66@redhat.com This patch (of 9): Liang Zhang reported [1] that the current COW logic in do_wp_page() is sub-optimal when it comes to swap+read fault+write fault of anonymous pages that have a single user, visible via a performance degradation in the redis benchmark. Something similar was previously reported [2] by Nadav with a simple reproducer. After we put an anon page into the swapcache and unmapped it from a single process, that process might read that page again and refault it read-only. If that process then writes to that page, the process is actually the exclusive user of the page, however, the COW logic in do_co_page() won't be able to reuse it due to the additional reference from the swapcache. Let's optimize for pages that have been added to the swapcache but only have an exclusive user. Try removing the swapcache reference if there is hope that we're the exclusive user. We will fail removing the swapcache reference in two scenarios: (1) There are additional swap entries referencing the page: copying instead of reusing is the right thing to do. (2) The page is under writeback: theoretically we might be able to reuse in some cases, however, we cannot remove the additional reference and will have to copy. Note that we'll only try removing the page from the swapcache when it's highly likely that we'll be the exclusive owner after removing the page from the swapache. As we're about to map that page writable and redirty it, that should not affect reclaim but is rather the right thing to do. Further, we might have additional references from the LRU pagevecs, which will force us to copy instead of being able to reuse. We'll try handling such references for some scenarios next. Concurrent writeback cannot be handled easily and we'll always have to copy. While at it, remove the superfluous page_mapcount() check: it's implicitly covered by the page_count() for ordinary anon pages. 
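To make the new reuse condition easier to follow, the decision sequence described above (and implemented in the diff below) can be modeled in userspace: a cheap refcount check that tolerates one extra reference from the swapcache, a trylock, an attempt to drop the swapcache reference, and a final page_count() == 1 test. The structure and helper below are illustrative stand-ins, not the kernel code.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the page state that matters here. */
struct page_state {
	int refcount;		/* page_count() */
	bool ksm;		/* PageKsm() */
	bool swapcache;		/* PageSwapCache() */
	bool writeback;		/* PageWriteback() */
	bool locked;		/* page lock */
};

/* Returns true if the faulting process may reuse the page in place. */
static bool can_reuse(struct page_state *p)
{
	/* Cheap check first: allow one extra reference from the swapcache. */
	if (p->ksm || p->refcount > 1 + p->swapcache)
		return false;			/* copy */
	if (p->locked)
		return false;			/* trylock failed: copy */
	p->locked = true;
	/* Models try_to_free_swap(): fails while the page is under writeback. */
	if (p->swapcache && !p->writeback) {
		p->swapcache = false;
		p->refcount--;
	}
	if (p->ksm || p->refcount != 1) {
		p->locked = false;
		return false;			/* copy */
	}
	p->locked = false;
	return true;				/* reuse, i.e. wp_page_reuse() */
}

int main(void)
{
	struct page_state exclusive = { .refcount = 2, .swapcache = true };
	struct page_state shared    = { .refcount = 3, .swapcache = true };

	printf("exclusive swapcache page reused: %d\n", can_reuse(&exclusive));
	printf("page with extra reference copied: %d\n", !can_reuse(&shared));
	return 0;
}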
[1] https://lkml.kernel.org/r/20220113140318.11117-1-zhangliang5@huawei.com [2] https://lkml.kernel.org/r/0480D692-D9B2-429A-9A88-9BBA1331AC3A@gmail.com Link: https://lkml.kernel.org/r/20220131162940.210846-2-david@redhat.com Signed-off-by: David Hildenbrand Reported-by: Liang Zhang Reported-by: Nadav Amit Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Cc: Hugh Dickins Cc: David Rientjes Cc: Shakeel Butt Cc: John Hubbard Cc: Jason Gunthorpe Cc: Mike Kravetz Cc: Mike Rapoport Cc: Yang Shi Cc: Kirill A. Shutemov Cc: Jann Horn Cc: Michal Hocko Cc: Rik van Riel Cc: Roman Gushchin Cc: Andrea Arcangeli Cc: Peter Xu Cc: Don Dutile Cc: Christoph Hellwig Cc: Oleg Nesterov Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 6666bc20b3c61..5129f3ecc039d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3287,19 +3287,27 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) if (PageAnon(vmf->page)) { struct page *page = vmf->page; - /* PageKsm() doesn't necessarily raise the page refcount */ - if (PageKsm(page) || page_count(page) != 1) + /* + * We have to verify under page lock: these early checks are + * just an optimization to avoid locking the page and freeing + * the swapcache if there is little hope that we can reuse. + * + * PageKsm() doesn't necessarily raise the page refcount. + */ + if (PageKsm(page) || page_count(page) > 1 + PageSwapCache(page)) goto copy; if (!trylock_page(page)) goto copy; - if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) { + if (PageSwapCache(page)) + try_to_free_swap(page); + if (PageKsm(page) || page_count(page) != 1) { unlock_page(page); goto copy; } /* - * Ok, we've got the only map reference, and the only - * page count reference, and the page is locked, - * it's dark out, and we're wearing sunglasses. Hit it. + * Ok, we've got the only page reference from our mapping + * and the page is locked, it's dark out, and we're wearing + * sunglasses. Hit it. */ unlock_page(page); wp_page_reuse(vmf); From d4c470970d45c863fafc757521a82be2f80b1232 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Mar 2022 18:13:34 -0700 Subject: [PATCH 100/113] mm: optimize do_wp_page() for fresh pages in local LRU pagevecs For example, if a page just got swapped in via a read fault, the LRU pagevecs might still hold a reference to the page. If we trigger a write fault on such a page, the additional reference from the LRU pagevecs will prohibit reusing the page. Let's conditionally drain the local LRU pagevecs when we stumble over a !PageLRU() page. We cannot easily drain remote LRU pagevecs and it might not be desirable performance-wise. Consequently, this will only avoid copying in some cases. Add a simple "page_count(page) > 3" check first but keep the "page_count(page) > 1 + PageSwapCache(page)" check in place, as we want to minimize cases where we remove a page from the swapcache but won't be able to reuse it, for example, because another process has it mapped R/O, to not affect reclaim. We cannot easily handle the following cases and we will always have to copy: (1) The page is referenced in the LRU pagevecs of other CPUs. We really would have to drain the LRU pagevecs of all CPUs -- most probably copying is much cheaper. 
(2) The page is already PageLRU() but is getting moved between LRU lists, for example, for activation (e.g., mark_page_accessed()), deactivation (MADV_COLD), or lazyfree (MADV_FREE). We'd have to drain mostly unconditionally, which might be bad performance-wise. Most probably this won't happen too often in practice. Note that there are other reasons why an anon page might temporarily not be PageLRU(): for example, compaction and migration have to isolate LRU pages from the LRU lists first (isolate_lru_page()), moving them to temporary local lists and clearing PageLRU() and holding an additional reference on the page. In that case, we'll always copy. This change seems to be fairly effective with the reproducer [1] shared by Nadav, as long as writeback is done synchronously, for example, using zram. However, with asynchronous writeback, we'll usually fail to free the swapcache because the page is still under writeback: something we cannot easily optimize for, and maybe it's not really relevant in practice. [1] https://lkml.kernel.org/r/0480D692-D9B2-429A-9A88-9BBA1331AC3A@gmail.com Link: https://lkml.kernel.org/r/20220131162940.210846-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oleg Nesterov Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 5129f3ecc039d..c7723bdaf0c3f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3294,7 +3294,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * * PageKsm() doesn't necessarily raise the page refcount. */ - if (PageKsm(page) || page_count(page) > 1 + PageSwapCache(page)) + if (PageKsm(page) || page_count(page) > 3) + goto copy; + if (!PageLRU(page)) + /* + * Note: We cannot easily detect+handle references from + * remote LRU pagevecs or references to PageLRU() pages. + */ + lru_add_drain(); + if (page_count(page) > 1 + PageSwapCache(page)) goto copy; if (!trylock_page(page)) goto copy; From 84d60fdd3733fb86c126f2adfd0361fdc44087c3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Mar 2022 18:13:37 -0700 Subject: [PATCH 101/113] mm: slightly clarify KSM logic in do_swap_page() Let's make it clearer that KSM might only have to copy a page in case we have a page in the swapcache, not if we allocated a fresh page and bypassed the swapcache. While at it, add a comment why this is usually necessary and merge the two swapcache conditions. [akpm@linux-foundation.org: fix comment, per David] Link: https://lkml.kernel.org/r/20220131162940.210846-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A. 
Shutemov Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oleg Nesterov Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index c7723bdaf0c3f..8e30675dc0772 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3607,21 +3607,29 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_release; } - /* - * Make sure try_to_free_swap or reuse_swap_page or swapoff did not - * release the swapcache from under us. The page pin, and pte_same - * test below, are not enough to exclude that. Even if it is still - * swapcache, we need to check that the page's swap has not changed. - */ - if (unlikely((!PageSwapCache(page) || - page_private(page) != entry.val)) && swapcache) - goto out_page; - - page = ksm_might_need_to_copy(page, vma, vmf->address); - if (unlikely(!page)) { - ret = VM_FAULT_OOM; - page = swapcache; - goto out_page; + if (swapcache) { + /* + * Make sure try_to_free_swap or swapoff did not release the + * swapcache from under us. The page pin, and pte_same test + * below, are not enough to exclude that. Even if it is still + * swapcache, we need to check that the page's swap has not + * changed. + */ + if (unlikely(!PageSwapCache(page) || + page_private(page) != entry.val)) + goto out_page; + + /* + * KSM sometimes has to copy on read faults, for example, if + * page->index of !PageKSM() pages would be nonlinear inside the + * anon VMA -- PageKSM() is lost on actual swapout. + */ + page = ksm_might_need_to_copy(page, vma, vmf->address); + if (unlikely(!page)) { + ret = VM_FAULT_OOM; + page = swapcache; + goto out_page; + } } cgroup_throttle_swaprate(page, GFP_KERNEL); From c145e0b47c77ebeefdfd73dbb344577b2fc9b065 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Mar 2022 18:13:40 -0700 Subject: [PATCH 102/113] mm: streamline COW logic in do_swap_page() Currently we have a different COW logic when: * triggering a read-fault to swapin first and then trigger a write-fault -> do_swap_page() + do_wp_page() * triggering a write-fault to swapin -> do_swap_page() + do_wp_page() only if we fail reuse in do_swap_page() The COW logic in do_swap_page() is different than our reuse logic in do_wp_page(). The COW logic in do_wp_page() -- page_count() == 1 -- makes currently sure that we certainly don't have a remaining reference, e.g., via GUP, on the target page we want to reuse: if there is any unexpected reference, we have to copy to avoid information leaks. As do_swap_page() behaves differently, in environments with swap enabled we can currently have an unintended information leak from the parent to the child, similar as known from CVE-2020-29374: 1. Parent writes to anonymous page -> Page is mapped writable and modified 2. Page is swapped out -> Page is unmapped and replaced by swap entry 3. fork() -> Swap entries are copied to child 4. Child pins page R/O -> Page is mapped R/O into child 5. Child unmaps page -> Child still holds GUP reference 6. Parent writes to page -> Page is reused in do_swap_page() -> Child can observe changes Exchanging 2. and 3. should have the same effect. Let's apply the same COW logic as in do_wp_page(), conditionally trying to remove the page from the swapcache after freeing the swap entry, however, before actually mapping our page. 
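The reuse rule applied here (and introduced for do_wp_page() earlier in this series) boils down to: tolerate at most one extra reference, and only if it comes from the swapcache; anything beyond that (e.g. a GUP pin) forces a copy. As a rough illustration only -- not kernel code, and the struct/function names below are made up -- a minimal userspace model of that decision:

  #include <stdbool.h>
  #include <stdio.h>

  /* Hypothetical stand-in for the bits of page state that matter here. */
  struct page_model {
          int refcount;          /* all references: mapping, swapcache, GUP pins, ... */
          bool in_swapcache;     /* the swapcache holds one extra reference */
          bool is_ksm;           /* KSM pages are never reused for writing */
  };

  /* Reuse only if, besides our mapping, the sole tolerated reference is the
   * swapcache's; after dropping the swapcache entry the mapping must be the
   * only remaining user, otherwise we have to copy. */
  static bool can_reuse_for_write(const struct page_model *p)
  {
          if (p->is_ksm)
                  return false;
          return p->refcount <= 1 + (p->in_swapcache ? 1 : 0);
  }

  int main(void)
  {
          struct page_model exclusive  = { .refcount = 2, .in_swapcache = true };
          struct page_model gup_pinned = { .refcount = 3, .in_swapcache = true };

          printf("exclusive:  %s\n", can_reuse_for_write(&exclusive)  ? "reuse" : "copy");
          printf("gup pinned: %s\n", can_reuse_for_write(&gup_pinned) ? "reuse" : "copy");
          return 0;
  }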
We can change the order now that we use try_to_free_swap(), which doesn't care about the mapcount, instead of reuse_swap_page(). To handle references from the LRU pagevecs, conditionally drain the local LRU pagevecs when required, however, don't consider the page_count() when deciding whether to drain to keep it simple for now. Link: https://lkml.kernel.org/r/20220131162940.210846-5-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oleg Nesterov Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 55 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 8e30675dc0772..f721735ff947a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3489,6 +3489,25 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) return 0; } +static inline bool should_try_to_free_swap(struct page *page, + struct vm_area_struct *vma, + unsigned int fault_flags) +{ + if (!PageSwapCache(page)) + return false; + if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) || + PageMlocked(page)) + return true; + /* + * If we want to map a page that's in the swapcache writable, we + * have to detect via the refcount if we're really the exclusive + * user. Try freeing the swapcache to get rid of the swapcache + * reference only in case it's likely that we'll be the exlusive user. + */ + return (fault_flags & FAULT_FLAG_WRITE) && !PageKsm(page) && + page_count(page) == 2; +} + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -3630,6 +3649,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page = swapcache; goto out_page; } + + /* + * If we want to map a page that's in the swapcache writable, we + * have to detect via the refcount if we're really the exclusive + * owner. Try removing the extra reference from the local LRU + * pagevecs if required. + */ + if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache && + !PageKsm(page) && !PageLRU(page)) + lru_add_drain(); } cgroup_throttle_swaprate(page, GFP_KERNEL); @@ -3648,19 +3677,25 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } /* - * The page isn't present yet, go ahead with the fault. - * - * Be careful about the sequence of operations here. - * To get its accounting right, reuse_swap_page() must be called - * while the page is counted on swap but not yet in mapcount i.e. - * before page_add_anon_rmap() and swap_free(); try_to_free_swap() - * must be called after the swap_free(), or it will never succeed. + * Remove the swap entry and conditionally try to free up the swapcache. + * We're already holding a reference on the page but haven't mapped it + * yet. 
*/ + swap_free(entry); + if (should_try_to_free_swap(page, vma, vmf->flags)) + try_to_free_swap(page); inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { + + /* + * Same logic as in do_wp_page(); however, optimize for fresh pages + * that are certainly not shared because we just allocated them without + * exposing them to the swapcache. + */ + if ((vmf->flags & FAULT_FLAG_WRITE) && !PageKsm(page) && + (page != swapcache || page_count(page) == 1)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; @@ -3686,10 +3721,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); - swap_free(entry); - if (mem_cgroup_swap_full(page) || - (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) - try_to_free_swap(page); unlock_page(page); if (page != swapcache && swapcache) { /* From 3bff7e3f1f16dc7305d12905c51c278b54970f0e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Mar 2022 18:13:43 -0700 Subject: [PATCH 103/113] mm/huge_memory: streamline COW logic in do_huge_pmd_wp_page() We currently have a different COW logic for anon THP than we have for ordinary anon pages in do_wp_page(): the effect is that the issue reported in CVE-2020-29374 is currently still possible for anon THP: an unintended information leak from the parent to the child. Let's apply the same logic (page_count() == 1), with similar optimizations to remove additional references first as we really want to avoid PTE-mapping the THP and copying individual pages best we can. If we end up with a page that has page_count() != 1, we'll have to PTE-map the THP and fallback to do_wp_page(), which will always copy the page. Note that KSM does not apply to THP. I. Interaction with the swapcache and writeback While a THP is in the swapcache, the swapcache holds one reference on each subpage of the THP. So with PageSwapCache() set, we expect as many additional references as we have subpages. If we manage to remove the THP from the swapcache, all these references will be gone. Usually, a THP is not split when entered into the swapcache and stays a compound page. However, try_to_unmap() will PTE-map the THP and use PTE swap entries. There are no PMD swap entries for that purpose, consequently, we always only swapin subpages into PTEs. Removing a page from the swapcache can fail either when there are remaining swap entries (in which case COW is the right thing to do) or if the page is currently under writeback. Having a locked, R/O PMD-mapped THP that is in the swapcache seems to be possible only in corner cases, for example, if try_to_unmap() failed after adding the page to the swapcache. However, it's comparatively easy to handle. As we have to fully unmap a THP before starting writeback, and swapin is always done on the PTE level, we shouldn't find a R/O PMD-mapped THP in the swapcache that is under writeback. This should at least leave writeback out of the picture. II. Interaction with GUP references Having a R/O PMD-mapped THP with GUP references (i.e., R/O references) will result in PTE-mapping the THP on a write fault. Similar to ordinary anon pages, do_wp_page() will have to copy sub-pages and result in a disconnect between the GUP references and the pages actually mapped into the page tables. 
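The PMD-level check added below follows the same pattern, except that a THP sitting in the swapcache is referenced once per subpage, so the tolerated extra references scale with the number of subpages. A small model of just that first-stage check (illustrative only; the real code then tries to drop the swapcache and re-checks for a single remaining reference):

  #include <stdbool.h>
  #include <stdio.h>

  /* Tolerated references for a compound page: our mapping plus, if the THP is
   * in the swapcache, one reference per subpage. Anything more means fallback. */
  static bool thp_can_reuse(int refcount, bool in_swapcache, int nr_subpages)
  {
          return refcount <= 1 + (in_swapcache ? nr_subpages : 0);
  }

  int main(void)
  {
          int nr = 512;   /* 2 MiB THP with 4 KiB subpages */

          printf("swapcached, no extra refs: %s\n",
                 thp_can_reuse(1 + nr, true, nr) ? "try reuse" : "PTE-map and copy");
          printf("swapcached, one extra ref: %s\n",
                 thp_can_reuse(2 + nr, true, nr) ? "try reuse" : "PTE-map and copy");
          return 0;
  }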
To improve the situation in the future, we'll need additional handling to mark anonymous pages as definitely exclusive to a single process, only allow GUP pins on exclusive anon pages, and disallow sharing of exclusive anon pages with GUP pins e.g., during fork(). III. Interaction with references from LRU pagevecs There is no need to try draining the (local) LRU pagevecs in case we would stumble over a !PageLRU() page: folio_add_lru() and friends will always flush the affected pagevec after adding a compound page to it immediately -- pagevec_add_and_need_flush() always returns "true" for them. Note that the LRU pagevecs will hold a reference on the compound page for a very short time, between adding the page to the pagevec and draining it immediately afterwards. IV. Interaction with speculative/temporary references Similar to ordinary anon pages, other speculative/temporary references on the THP, for example, from the pagecache or page migration code, will disallow exclusive reuse of the page. We'll have to PTE-map the THP. Link: https://lkml.kernel.org/r/20220131162940.210846-6-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oleg Nesterov Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 057b13bde82da..a5f651c6df4ea 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1303,7 +1303,6 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) page = pmd_page(orig_pmd); VM_BUG_ON_PAGE(!PageHead(page), page); - /* Lock page for reuse_swap_page() */ if (!trylock_page(page)) { get_page(page); spin_unlock(vmf->ptl); @@ -1319,10 +1318,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) } /* - * We can only reuse the page if nobody else maps the huge page or it's - * part. + * See do_wp_page(): we can only map the page writable if there are + * no additional references. Note that we always drain the LRU + * pagevecs immediately after adding a THP. */ - if (reuse_swap_page(page)) { + if (page_count(page) > 1 + PageSwapCache(page) * thp_nr_pages(page)) + goto unlock_fallback; + if (PageSwapCache(page)) + try_to_free_swap(page); + if (page_count(page) == 1) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -1333,6 +1337,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) return VM_FAULT_WRITE; } +unlock_fallback: unlock_page(page); spin_unlock(vmf->ptl); fallback: From 363106c4cefe7d08ca67cc77f3d38d4213190f31 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Mar 2022 18:13:46 -0700 Subject: [PATCH 104/113] mm/khugepaged: remove reuse_swap_page() usage reuse_swap_page() currently indicates if we can write to an anon page without COW. A COW is required if the page is shared by multiple processes (either already mapped or via swap entries) or if there is concurrent writeback that cannot tolerate concurrent page modifications. 
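The "shared by multiple processes" case is ordinary copy-on-write after fork(): a write by one process must never become visible to the other, which is the property every reuse shortcut has to preserve. A minimal, self-contained userspace demonstration of that property (assuming a MAP_PRIVATE anonymous mapping; nothing here is specific to this patch):

  #include <stdio.h>
  #include <string.h>
  #include <sys/mman.h>
  #include <sys/types.h>
  #include <sys/wait.h>
  #include <unistd.h>

  int main(void)
  {
          /* Private anonymous memory is shared copy-on-write across fork(). */
          char *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
          pid_t pid;

          if (buf == MAP_FAILED) {
                  perror("mmap");
                  return 1;
          }
          strcpy(buf, "parent data");

          pid = fork();
          if (pid == 0) {
                  printf("child sees:   %s\n", buf);  /* still the shared page */
                  strcpy(buf, "child data");          /* write triggers COW    */
                  _exit(0);
          }
          waitpid(pid, NULL, 0);
          printf("parent keeps: %s\n", buf);          /* unaffected by the child */
          return 0;
  }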
However, in the context of khugepaged we're not actually going to write to a read-only mapped page, we'll copy the page content to our newly allocated THP and map that THP writable. All we have to make sure is that the read-only mapped page we're about to copy won't get reused by another process sharing the page, otherwise, page content would get modified. But that is already guaranteed via multiple mechanisms (e.g., holding a reference, holding the page lock, removing the rmap after copying the page). The swapcache handling was introduced in commit 10359213d05a ("mm: incorporate read-only pages into transparent huge pages") and it sounds like it merely wanted to mimic what do_swap_page() would do when trying to map a page obtained via the swapcache writable. As that logic is unnecessary, let's just remove it, removing the last user of reuse_swap_page(). Link: https://lkml.kernel.org/r/20220131162940.210846-7-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Yang Shi Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oleg Nesterov Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/huge_memory.h | 1 - mm/khugepaged.c | 11 ----------- 2 files changed, 12 deletions(-) diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 4fdb14a81108b..d651f3437367d 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -29,7 +29,6 @@ EM( SCAN_VMA_NULL, "vma_null") \ EM( SCAN_VMA_CHECK, "vma_check_failed") \ EM( SCAN_ADDRESS_RANGE, "not_suitable_address_range") \ - EM( SCAN_SWAP_CACHE_PAGE, "page_swap_cache") \ EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\ EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \ EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1cdf7c38b9e59..a4e5eaf3eb012 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -46,7 +46,6 @@ enum scan_result { SCAN_VMA_NULL, SCAN_VMA_CHECK, SCAN_ADDRESS_RANGE, - SCAN_SWAP_CACHE_PAGE, SCAN_DEL_PAGE_LRU, SCAN_ALLOC_HUGE_PAGE_FAIL, SCAN_CGROUP_CHARGE_FAIL, @@ -683,16 +682,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_PAGE_COUNT; goto out; } - if (!pte_write(pteval) && PageSwapCache(page) && - !reuse_swap_page(page)) { - /* - * Page is in the swap cache and cannot be re-used. - * It cannot be collapsed into a THP. - */ - unlock_page(page); - result = SCAN_SWAP_CACHE_PAGE; - goto out; - } /* * Isolate the page to avoid collapsing an hugepage From 03104c2c5db8918030788e607e4af980b2f42bb3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Mar 2022 18:13:50 -0700 Subject: [PATCH 105/113] mm/swapfile: remove stale reuse_swap_page() All users are gone, let's remove it. We'll let SWP_STABLE_WRITES stick around for now, as it might come in handy in the near future. Link: https://lkml.kernel.org/r/20220131162940.210846-8-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A. 
Shutemov Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oleg Nesterov Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 4 -- mm/swapfile.c | 104 ------------------------------------------- 2 files changed, 108 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index f37837c614c59..27093b477c5fb 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -515,7 +515,6 @@ extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); -extern bool reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); @@ -681,9 +680,6 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -#define reuse_swap_page(page) \ - (page_trans_huge_mapcount(page) == 1) - static inline int try_to_free_swap(struct page *page) { return 0; diff --git a/mm/swapfile.c b/mm/swapfile.c index 33c7abb166100..63c61f8b26118 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1167,16 +1167,6 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry) return NULL; } -static struct swap_info_struct *swap_info_get(swp_entry_t entry) -{ - struct swap_info_struct *p; - - p = _swap_info_get(entry); - if (p) - spin_lock(&p->lock); - return p; -} - static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, struct swap_info_struct *q) { @@ -1601,100 +1591,6 @@ static bool page_swapped(struct page *page) return false; } -static int page_trans_huge_map_swapcount(struct page *page, - int *total_swapcount) -{ - int i, map_swapcount, _total_swapcount; - unsigned long offset = 0; - struct swap_info_struct *si; - struct swap_cluster_info *ci = NULL; - unsigned char *map = NULL; - int swapcount = 0; - - /* hugetlbfs shouldn't call it */ - VM_BUG_ON_PAGE(PageHuge(page), page); - - if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) { - if (PageSwapCache(page)) - swapcount = page_swapcount(page); - if (total_swapcount) - *total_swapcount = swapcount; - return swapcount + page_trans_huge_mapcount(page); - } - - page = compound_head(page); - - _total_swapcount = map_swapcount = 0; - if (PageSwapCache(page)) { - swp_entry_t entry; - - entry.val = page_private(page); - si = _swap_info_get(entry); - if (si) { - map = si->swap_map; - offset = swp_offset(entry); - } - } - if (map) - ci = lock_cluster(si, offset); - for (i = 0; i < HPAGE_PMD_NR; i++) { - int mapcount = atomic_read(&page[i]._mapcount) + 1; - if (map) { - swapcount = swap_count(map[offset + i]); - _total_swapcount += swapcount; - } - map_swapcount = max(map_swapcount, mapcount + swapcount); - } - unlock_cluster(ci); - - if (PageDoubleMap(page)) - map_swapcount -= 1; - - if (total_swapcount) - *total_swapcount = _total_swapcount; - - return map_swapcount + compound_mapcount(page); -} - -/* - * We can write to an anon page without COW if there are no other references - * to it. And as a side-effect, free up its swap: because the old content - * on disk will never be read, and seeking back there to write new content - * later would only waste time away from clustering. 
- */ -bool reuse_swap_page(struct page *page) -{ - int count, total_swapcount; - - VM_BUG_ON_PAGE(!PageLocked(page), page); - if (unlikely(PageKsm(page))) - return false; - count = page_trans_huge_map_swapcount(page, &total_swapcount); - if (count == 1 && PageSwapCache(page) && - (likely(!PageTransCompound(page)) || - /* The remaining swap count will be freed soon */ - total_swapcount == page_swapcount(page))) { - if (!PageWriteback(page)) { - page = compound_head(page); - delete_from_swap_cache(page); - SetPageDirty(page); - } else { - swp_entry_t entry; - struct swap_info_struct *p; - - entry.val = page_private(page); - p = swap_info_get(entry); - if (p->flags & SWP_STABLE_WRITES) { - spin_unlock(&p->lock); - return false; - } - spin_unlock(&p->lock); - } - } - - return count <= 1; -} - /* * If swap is getting full, or if there are no more mappings of this page, * then try_to_free_swap is called to free its swap space. From 55c62fa7c53368a9011cd1276c001a1469078c6a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Mar 2022 18:13:53 -0700 Subject: [PATCH 106/113] mm/huge_memory: remove stale page_trans_huge_mapcount() All users are gone, let's remove it. Link: https://lkml.kernel.org/r/20220131162940.210846-9-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oleg Nesterov Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 ----- mm/huge_memory.c | 48 ---------------------------------------------- 2 files changed, 53 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7a3dd7e617e4c..e34edb7753342 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -834,16 +834,11 @@ static inline int total_mapcount(struct page *page) return folio_mapcount(page_folio(page)); } -int page_trans_huge_mapcount(struct page *page); #else static inline int total_mapcount(struct page *page) { return page_mapcount(page); } -static inline int page_trans_huge_mapcount(struct page *page) -{ - return page_mapcount(page); -} #endif static inline struct page *virt_to_head_page(const void *x) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a5f651c6df4ea..cc580d90117b3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2483,54 +2483,6 @@ static void __split_huge_page(struct page *page, struct list_head *list, } } -/* - * This calculates accurately how many mappings a transparent hugepage - * has (unlike page_mapcount() which isn't fully accurate). This full - * accuracy is primarily needed to know if copy-on-write faults can - * reuse the page and change the mapping to read-write instead of - * copying them. At the same time this returns the total_mapcount too. - * - * The function returns the highest mapcount any one of the subpages - * has. If the return value is one, even if different processes are - * mapping different subpages of the transparent hugepage, they can - * all reuse it, because each process is reusing a different subpage. - * - * The total_mapcount is instead counting all virtual mappings of the - * subpages. 
If the total_mapcount is equal to "one", it tells the - * caller all mappings belong to the same "mm" and in turn the - * anon_vma of the transparent hugepage can become the vma->anon_vma - * local one as no other process may be mapping any of the subpages. - * - * It would be more accurate to replace page_mapcount() with - * page_trans_huge_mapcount(), however we only use - * page_trans_huge_mapcount() in the copy-on-write faults where we - * need full accuracy to avoid breaking page pinning, because - * page_trans_huge_mapcount() is slower than page_mapcount(). - */ -int page_trans_huge_mapcount(struct page *page) -{ - int i, ret; - - /* hugetlbfs shouldn't call it */ - VM_BUG_ON_PAGE(PageHuge(page), page); - - if (likely(!PageTransCompound(page))) - return atomic_read(&page->_mapcount) + 1; - - page = compound_head(page); - - ret = 0; - for (i = 0; i < thp_nr_pages(page); i++) { - int mapcount = atomic_read(&page[i]._mapcount) + 1; - ret = max(ret, mapcount); - } - - if (PageDoubleMap(page)) - ret -= 1; - - return ret + compound_mapcount(page); -} - /* Racy check whether the huge page can be split */ bool can_split_folio(struct folio *folio, int *pextra_pins) { From 7f7609175ff2339a2cffa48ba2474ef928d7eafd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Mar 2022 18:13:56 -0700 Subject: [PATCH 107/113] mm/huge_memory: remove stale locking logic from __split_huge_pmd() Let's remove the stale logic that was required for reuse_swap_page(). [akpm@linux-foundation.org: simplification, per Yang Shi] Link: https://lkml.kernel.org/r/20220131162940.210846-10-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oleg Nesterov Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 40 ++++------------------------------------ 1 file changed, 4 insertions(+), 36 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cc580d90117b3..2fe38212e07c6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2133,8 +2133,6 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, { spinlock_t *ptl; struct mmu_notifier_range range; - bool do_unlock_folio = false; - pmd_t _pmd; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address & HPAGE_PMD_MASK, @@ -2153,42 +2151,12 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, goto out; } -repeat: - if (pmd_trans_huge(*pmd)) { - if (!folio) { - folio = page_folio(pmd_page(*pmd)); - /* - * An anonymous page must be locked, to ensure that a - * concurrent reuse_swap_page() sees stable mapcount; - * but reuse_swap_page() is not used on shmem or file, - * and page lock must not be taken when zap_pmd_range() - * calls __split_huge_pmd() while i_mmap_lock is held. 
- */ - if (folio_test_anon(folio)) { - if (unlikely(!folio_trylock(folio))) { - folio_get(folio); - _pmd = *pmd; - spin_unlock(ptl); - folio_lock(folio); - spin_lock(ptl); - if (unlikely(!pmd_same(*pmd, _pmd))) { - folio_unlock(folio); - folio_put(folio); - folio = NULL; - goto repeat; - } - folio_put(folio); - } - do_unlock_folio = true; - } - } - } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) - goto out; - __split_huge_pmd_locked(vma, pmd, range.start, freeze); + if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || + is_pmd_migration_entry(*pmd)) + __split_huge_pmd_locked(vma, pmd, range.start, freeze); + out: spin_unlock(ptl); - if (do_unlock_folio) - folio_unlock(folio); /* * No need to double call mmu_notifier->invalidate_range() callback. * They are 3 cases to consider inside __split_huge_pmd_locked(): From 566d3362885aab04d6b0f885f12db3176ca3a032 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 24 Mar 2022 18:13:59 -0700 Subject: [PATCH 108/113] mm: warn on deleting redirtied only if accounted filemap_unaccount_folio() has a WARN_ON_ONCE(folio_test_dirty(folio)). It is good to warn of late dirtying on a persistent filesystem, but late dirtying on tmpfs can only lose data which is expected to be thrown away; and it's a pity if that warning comes ONCE on tmpfs, then hides others which really matter. Make it conditional on mapping_cap_writeback(). Cleanup: then folio_account_cleaned() no longer needs to check that for itself, and so no longer needs to know the mapping. Link: https://lkml.kernel.org/r/b5a1106c-7226-a5c6-ad41-ad4832cae1f@google.com Signed-off-by: Hugh Dickins Cc: Matthew Wilcox Cc: Jan Kara Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 3 +-- mm/filemap.c | 14 +++++++++----- mm/page-writeback.c | 18 ++++++++---------- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 58f395f3febe5..a8d0b327b066a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1009,8 +1009,7 @@ static inline void __set_page_dirty(struct page *page, { __folio_mark_dirty(page_folio(page), mapping, warn); } -void folio_account_cleaned(struct folio *folio, struct address_space *mapping, - struct bdi_writeback *wb); +void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb); void __folio_cancel_dirty(struct folio *folio); static inline void folio_cancel_dirty(struct folio *folio) { diff --git a/mm/filemap.c b/mm/filemap.c index c5c169c16baed..8426434042f4d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -193,16 +193,20 @@ static void filemap_unaccount_folio(struct address_space *mapping, /* * At this point folio must be either written or cleaned by * truncate. Dirty folio here signals a bug and loss of - * unwritten data. + * unwritten data - on ordinary filesystems. * - * This fixes dirty accounting after removing the folio entirely + * But it's harmless on in-memory filesystems like tmpfs; and can + * occur when a driver which did get_user_pages() sets page dirty + * before putting it, while the inode is being finally evicted. + * + * Below fixes dirty accounting after removing the folio entirely * but leaves the dirty flag set: it has no effect for truncated * folio and anyway will be cleared before returning folio to * buddy allocator. 
*/ - if (WARN_ON_ONCE(folio_test_dirty(folio))) - folio_account_cleaned(folio, mapping, - inode_to_wb(mapping->host)); + if (WARN_ON_ONCE(folio_test_dirty(folio) && + mapping_can_writeback(mapping))) + folio_account_cleaned(folio, inode_to_wb(mapping->host)); } /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 435c026305932..7e2da284e4271 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2465,16 +2465,14 @@ static void folio_account_dirtied(struct folio *folio, * * Caller must hold lock_page_memcg(). */ -void folio_account_cleaned(struct folio *folio, struct address_space *mapping, - struct bdi_writeback *wb) +void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) { - if (mapping_can_writeback(mapping)) { - long nr = folio_nr_pages(folio); - lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); - zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); - wb_stat_mod(wb, WB_RECLAIMABLE, -nr); - task_io_account_cancelled_write(nr * PAGE_SIZE); - } + long nr = folio_nr_pages(folio); + + lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); + wb_stat_mod(wb, WB_RECLAIMABLE, -nr); + task_io_account_cancelled_write(nr * PAGE_SIZE); } /* @@ -2683,7 +2681,7 @@ void __folio_cancel_dirty(struct folio *folio) wb = unlocked_inode_to_wb_begin(inode, &cookie); if (folio_test_clear_dirty(folio)) - folio_account_cleaned(folio, mapping, wb); + folio_account_cleaned(folio, wb); unlocked_inode_to_wb_end(inode, &cookie); folio_memcg_unlock(folio); From 2c8659951654fc14c0351e33ca7604cdf670341e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 24 Mar 2022 18:14:02 -0700 Subject: [PATCH 109/113] mm: unmap_mapping_range_tree() with i_mmap_rwsem shared Revert 48ec833b7851 ("Revert "mm/memory.c: share the i_mmap_rwsem"") to reinstate c8475d144abb ("mm/memory.c: share the i_mmap_rwsem"): the unmap_mapping_range family of functions do the unmapping of user pages (ultimately via zap_page_range_single) without modifying the interval tree itself, and unmapping races are necessarily guarded by page table lock, thus the i_mmap_rwsem should be shared in unmap_mapping_pages() and unmap_mapping_folio(). Commit 48ec833b7851 was intended as a short-term measure, allowing the other shared lock changes into 3.19 final, before investigating three trinity crashes, one of which had been bisected to commit c8475d144ab: [1] https://lkml.org/lkml/2014/11/14/342 https://lore.kernel.org/lkml/5466142C.60100@oracle.com/ [2] https://lkml.org/lkml/2014/12/22/213 https://lore.kernel.org/lkml/549832E2.8060609@oracle.com/ [3] https://lkml.org/lkml/2014/12/9/741 https://lore.kernel.org/lkml/5487ACC5.1010002@oracle.com/ Two of those were Bad page states: free_pages_prepare() found PG_mlocked still set - almost certain to have been fixed by 4.4 commit b87537d9e2fe ("mm: rmap use pte lock not mmap_sem to set PageMlocked"). The NULL deref on rwsem in [2]: unclear, only happened once, not bisected to c8475d144ab. No change to the i_mmap_lock_write() around __unmap_hugepage_range_final() in unmap_single_vma(): IIRC that's a special usage, helping to serialize hugetlbfs page table sharing, not to be dabbled with lightly. No change to other uses of i_mmap_lock_write() by hugetlbfs. I am not aware of any significant gains from the concurrency allowed by this commit: it is submitted more to resolve an ancient misunderstanding. Link: https://lkml.kernel.org/r/e4a5e356-6c87-47b2-3ce8-c2a95ae84e20@google.com Signed-off-by: Hugh Dickins Cc: "Kirill A. 
Shutemov" Cc: Davidlohr Bueso Cc: Sasha Levin Cc: Mel Gorman Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index f721735ff947a..be44d0b36b186 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3388,11 +3388,11 @@ void unmap_mapping_folio(struct folio *folio) details.even_cows = false; details.single_folio = folio; - i_mmap_lock_write(mapping); + i_mmap_lock_read(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, first_index, last_index, &details); - i_mmap_unlock_write(mapping); + i_mmap_unlock_read(mapping); } /** @@ -3418,11 +3418,11 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, if (last_index < first_index) last_index = ULONG_MAX; - i_mmap_lock_write(mapping); + i_mmap_lock_read(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, first_index, last_index, &details); - i_mmap_unlock_write(mapping); + i_mmap_unlock_read(mapping); } EXPORT_SYMBOL_GPL(unmap_mapping_pages); From 24e988c7fd1ee701ea78fd138ed309e5f5f207b4 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 24 Mar 2022 18:14:05 -0700 Subject: [PATCH 110/113] mm: generalize ARCH_HAS_FILTER_PGPROT ARCH_HAS_FILTER_PGPROT config has duplicate definitions on platforms that subscribe it. Instead make it a generic config option which can be selected on applicable platforms when required. Link: https://lkml.kernel.org/r/1643004823-16441-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Catalin Marinas Cc: Will Deacon Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 3 --- mm/Kconfig | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b037fe7be3740..10e4c332e15da 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -337,9 +337,6 @@ config GENERIC_CALIBRATE_DELAY config ARCH_HAS_CPU_RELAX def_bool y -config ARCH_HAS_FILTER_PGPROT - def_bool y - config ARCH_HIBERNATION_POSSIBLE def_bool y diff --git a/mm/Kconfig b/mm/Kconfig index 761f5021ba511..034d879536008 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -762,6 +762,9 @@ config ARCH_HAS_CURRENT_STACK_POINTER register alias named "current_stack_pointer", this config can be selected. +config ARCH_HAS_FILTER_PGPROT + bool + config ARCH_HAS_PTE_DEVMAP bool From 6c8e2a256915a223f6289f651d6b926cd7135c9e Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Thu, 24 Mar 2022 18:14:09 -0700 Subject: [PATCH 111/113] mm: fix race between MADV_FREE reclaim and blkdev direct IO read Problem: ======= Userspace might read the zero-page instead of actual data from a direct IO read on a block device if the buffers have been called madvise(MADV_FREE) on earlier (this is discussed below) due to a race between page reclaim on MADV_FREE and blkdev direct IO read. - Race condition: ============== During page reclaim, the MADV_FREE page check in try_to_unmap_one() checks if the page is not dirty, then discards its rmap PTE(s) (vs. remap back if the page is dirty). However, after try_to_unmap_one() returns to shrink_page_list(), it might keep the page _anyway_ if page_ref_freeze() fails (it expects exactly _one_ page reference, from the isolation for page reclaim). 
Well, blkdev_direct_IO() gets references for all pages, and on READ operations it only sets them dirty _later_. So, if MADV_FREE'd pages (i.e., not dirty) are used as buffers for direct IO read from block devices, and page reclaim happens during __blkdev_direct_IO[_simple]() exactly AFTER bio_iov_iter_get_pages() returns, but BEFORE the pages are set dirty, the situation happens. The direct IO read eventually completes. Now, when userspace reads the buffers, the PTE is no longer there and the page fault handler do_anonymous_page() services that with the zero-page, NOT the data! A synthetic reproducer is provided. - Page faults: =========== If page reclaim happens BEFORE bio_iov_iter_get_pages() the issue doesn't happen, because that faults-in all pages as writeable, so do_anonymous_page() sets up a new page/rmap/PTE, and that is used by direct IO. The userspace reads don't fault as the PTE is there (thus zero-page is not used/setup). But if page reclaim happens AFTER it / BEFORE setting pages dirty, the PTE is no longer there; the subsequent page faults can't help: The data-read from the block device probably won't generate faults due to DMA (no MMU) but even in the case it wouldn't use DMA, that happens on different virtual addresses (not user-mapped addresses) because `struct bio_vec` stores `struct page` to figure addresses out (which are different from user-mapped addresses) for the read. Thus userspace reads (to user-mapped addresses) still fault, then do_anonymous_page() gets another `struct page` that would address/ map to other memory than the `struct page` used by `struct bio_vec` for the read. (The original `struct page` is not available, since it wasn't freed, as page_ref_freeze() failed due to more page refs. And even if it were available, its data cannot be trusted anymore.) Solution: ======== One solution is to check for the expected page reference count in try_to_unmap_one(). There should be one reference from the isolation (that is also checked in shrink_page_list() with page_ref_freeze()) plus one or more references from page mapping(s) (put in discard: label). Further references mean that rmap/PTE cannot be unmapped/nuked. (Note: there might be more than one reference from mapping due to fork()/clone() without CLONE_VM, which use the same `struct page` for references, until the copy-on-write page gets copied.) So, additional page references (e.g., from direct IO read) now prevent the rmap/PTE from being unmapped/dropped; similarly to the page is not freed per shrink_page_list()/page_ref_freeze()). - Races and Barriers: ================== The new check in try_to_unmap_one() should be safe in races with bio_iov_iter_get_pages() in get_user_pages() fast and slow paths, as it's done under the PTE lock. The fast path doesn't take the lock, but it checks if the PTE has changed and if so, it drops the reference and leaves the page for the slow path (which does take that lock). The fast path requires synchronization w/ full memory barrier: it writes the page reference count first then it reads the PTE later, while try_to_unmap() writes PTE first then it reads page refcount. And a second barrier is needed, as the page dirty flag should not be read before the page reference count (as in __remove_mapping()). (This can be a load memory barrier only; no writes are involved.) 
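Stepping away from the internals for a moment: the MADV_FREE contract this race revolves around -- clean, lazily-freed pages may be dropped at any time, and a later write cancels the free -- is easy to poke at from userspace, independently of the full block-device reproducer further below. A minimal sketch (the outcome depends on whether reclaim runs: on an idle machine the old contents usually survive, under memory pressure the same read can return zeroes):

  #define _GNU_SOURCE
  #include <stdio.h>
  #include <string.h>
  #include <sys/mman.h>

  int main(void)
  {
          size_t len = 4096;
          char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

          if (buf == MAP_FAILED) {
                  perror("mmap");
                  return 1;
          }
          memset(buf, 0x5a, len);

          /* Mark the range lazily freeable; the clean page may be dropped later. */
          if (madvise(buf, len, MADV_FREE)) {
                  perror("madvise(MADV_FREE)");   /* needs Linux >= 4.5 */
                  return 1;
          }

          /* Usually still 0x5a here; 0x0 once reclaim has discarded the page.
           * Writing to buf before that point would cancel the free instead. */
          printf("first byte after MADV_FREE: 0x%x\n", buf[0]);
          return 0;
  }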
Call stack/comments: - try_to_unmap_one() - page_vma_mapped_walk() - map_pte() # see pte_offset_map_lock(): pte_offset_map() spin_lock() - ptep_get_and_clear() # write PTE - smp_mb() # (new barrier) GUP fast path - page_ref_count() # (new check) read refcount - page_vma_mapped_walk_done() # see pte_unmap_unlock(): pte_unmap() spin_unlock() - bio_iov_iter_get_pages() - __bio_iov_iter_get_pages() - iov_iter_get_pages() - get_user_pages_fast() - internal_get_user_pages_fast() # fast path - lockless_pages_from_mm() - gup_{pgd,p4d,pud,pmd,pte}_range() ptep = pte_offset_map() # not _lock() pte = ptep_get_lockless(ptep) page = pte_page(pte) try_grab_compound_head(page) # inc refcount # (RMW/barrier # on success) if (pte_val(pte) != pte_val(*ptep)) # read PTE put_compound_head(page) # dec refcount # go slow path # slow path - __gup_longterm_unlocked() - get_user_pages_unlocked() - __get_user_pages_locked() - __get_user_pages() - follow_{page,p4d,pud,pmd}_mask() - follow_page_pte() ptep = pte_offset_map_lock() pte = *ptep page = vm_normal_page(pte) try_grab_page(page) # inc refcount pte_unmap_unlock() - Huge Pages: ========== Regarding transparent hugepages, that logic shouldn't change, as MADV_FREE (aka lazyfree) pages are PageAnon() && !PageSwapBacked() (madvise_free_pte_range() -> mark_page_lazyfree() -> lru_lazyfree_fn()) thus should reach shrink_page_list() -> split_huge_page_to_list() before try_to_unmap[_one](), so it deals with normal pages only. (And in case unlikely/TTU_SPLIT_HUGE_PMD/split_huge_pmd_address() happens, which should not or be rare, the page refcount should be greater than mapcount: the head page is referenced by tail pages. That also prevents checking the head `page` then incorrectly call page_remove_rmap(subpage) for a tail page, that isn't even in the shrink_page_list()'s page_list (an effect of split huge pmd/pmvw), as it might happen today in this unlikely scenario.) MADV_FREE'd buffers: =================== So, back to the "if MADV_FREE pages are used as buffers" note. The case is arguable, and subject to multiple interpretations. The madvise(2) manual page on the MADV_FREE advice value says: 1) 'After a successful MADV_FREE ... data will be lost when the kernel frees the pages.' 2) 'the free operation will be canceled if the caller writes into the page' / 'subsequent writes ... will succeed and then [the] kernel cannot free those dirtied pages' 3) 'If there is no subsequent write, the kernel can free the pages at any time.' Thoughts, questions, considerations... respectively: 1) Since the kernel didn't actually free the page (page_ref_freeze() failed), should the data not have been lost? (on userspace read.) 2) Should writes performed by the direct IO read be able to cancel the free operation? - Should the direct IO read be considered as 'the caller' too, as it's been requested by 'the caller'? - Should the bio technique to dirty pages on return to userspace (bio_check_pages_dirty() is called/used by __blkdev_direct_IO()) be considered in another/special way here? 3) Should an upcoming write from a previously requested direct IO read be considered as a subsequent write, so the kernel should not free the pages? (as it's known at the time of page reclaim.) And lastly: Technically, the last point would seem a reasonable consideration and balance, as the madvise(2) manual page apparently (and fairly) seem to assume that 'writes' are memory access from the userspace process (not explicitly considering writes from the kernel or its corner cases; again, fairly).. 
plus the kernel fix implementation for the corner case of the largely 'non-atomic write' encompassed by a direct IO read operation, is relatively simple; and it helps. Reproducer: ========== @ test.c (simplified, but works) #define _GNU_SOURCE #include #include #include #include int main() { int fd, i; char *buf; fd = open(DEV, O_RDONLY | O_DIRECT); buf = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); for (i = 0; i < BUF_SIZE; i += PAGE_SIZE) buf[i] = 1; // init to non-zero madvise(buf, BUF_SIZE, MADV_FREE); read(fd, buf, BUF_SIZE); for (i = 0; i < BUF_SIZE; i += PAGE_SIZE) printf("%p: 0x%x\n", &buf[i], buf[i]); return 0; } @ block/fops.c (formerly fs/block_dev.c) +#include ... ... __blkdev_direct_IO[_simple](...) { ... + if (!strcmp(current->comm, "good")) + shrink_all_memory(ULONG_MAX); + ret = bio_iov_iter_get_pages(...); + + if (!strcmp(current->comm, "bad")) + shrink_all_memory(ULONG_MAX); ... } @ shell # NUM_PAGES=4 # PAGE_SIZE=$(getconf PAGE_SIZE) # yes | dd of=test.img bs=${PAGE_SIZE} count=${NUM_PAGES} # DEV=$(losetup -f --show test.img) # gcc -DDEV=\"$DEV\" \ -DBUF_SIZE=$((PAGE_SIZE * NUM_PAGES)) \ -DPAGE_SIZE=${PAGE_SIZE} \ test.c -o test # od -tx1 $DEV 0000000 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a * 0040000 # mv test good # ./good 0x7f7c10418000: 0x79 0x7f7c10419000: 0x79 0x7f7c1041a000: 0x79 0x7f7c1041b000: 0x79 # mv good bad # ./bad 0x7fa1b8050000: 0x0 0x7fa1b8051000: 0x0 0x7fa1b8052000: 0x0 0x7fa1b8053000: 0x0 Note: the issue is consistent on v5.17-rc3, but it's intermittent with the support of MADV_FREE on v4.5 (60%-70% error; needs swap). [wrap do_direct_IO() in do_blockdev_direct_IO() @ fs/direct-io.c]. - v5.17-rc3: # for i in {1..1000}; do ./good; done \ | cut -d: -f2 | sort | uniq -c 4000 0x79 # mv good bad # for i in {1..1000}; do ./bad; done \ | cut -d: -f2 | sort | uniq -c 4000 0x0 # free | grep Swap Swap: 0 0 0 - v4.5: # for i in {1..1000}; do ./good; done \ | cut -d: -f2 | sort | uniq -c 4000 0x79 # mv good bad # for i in {1..1000}; do ./bad; done \ | cut -d: -f2 | sort | uniq -c 2702 0x0 1298 0x79 # swapoff -av swapoff /swap # for i in {1..1000}; do ./bad; done \ | cut -d: -f2 | sort | uniq -c 4000 0x79 Ceph/TCMalloc: ============= For documentation purposes, the use case driving the analysis/fix is Ceph on Ubuntu 18.04, as the TCMalloc library there still uses MADV_FREE to release unused memory to the system from the mmap'ed page heap (might be committed back/used again; it's not munmap'ed.) - PageHeap::DecommitSpan() -> TCMalloc_SystemRelease() -> madvise() - PageHeap::CommitSpan() -> TCMalloc_SystemCommit() -> do nothing. Note: TCMalloc switched back to MADV_DONTNEED a few commits after the release in Ubuntu 18.04 (google-perftools/gperftools 2.5), so the issue just 'disappeared' on Ceph on later Ubuntu releases but is still present in the kernel, and can be hit by other use cases. The observed issue seems to be the old Ceph bug #22464 [1], where checksum mismatches are observed (and instrumentation with buffer dumps shows zero-pages read from mmap'ed/MADV_FREE'd page ranges). The issue in Ceph was reasonably deemed a kernel bug (comment #50) and mostly worked around with a retry mechanism, but other parts of Ceph could still hit that (rocksdb). Anyway, it's less likely to be hit again as TCMalloc switched out of MADV_FREE by default. (Some kernel versions/reports from the Ceph bug, and relation with the MADV_FREE introduction/changes; TCMalloc versions not checked.) 
- 4.4 good - 4.5 (madv_free: introduction) - 4.9 bad - 4.10 good? maybe a swapless system - 4.12 (madv_free: no longer free instantly on swapless systems) - 4.13 bad [1] https://tracker.ceph.com/issues/22464 Thanks: ====== Several people contributed to analysis/discussions/tests/reproducers in the first stages when drilling down on ceph/tcmalloc/linux kernel: - Dan Hill - Dan Streetman - Dongdong Tao - Gavin Guo - Gerald Yang - Heitor Alves de Siqueira - Ioanna Alifieraki - Jay Vosburgh - Matthew Ruffell - Ponnuvel Palaniyappan Reviews, suggestions, corrections, comments: - Minchan Kim - Yu Zhao - Huang, Ying - John Hubbard - Christoph Hellwig [mfo@canonical.com: v4] Link: https://lkml.kernel.org/r/20220209202659.183418-1-mfo@canonical.comLink: https://lkml.kernel.org/r/20220131230255.789059-1-mfo@canonical.com Fixes: 802a3a92ad7a ("mm: reclaim MADV_FREE pages") Signed-off-by: Mauricio Faria de Oliveira Reviewed-by: "Huang, Ying" Cc: Minchan Kim Cc: Yu Zhao Cc: Yang Shi Cc: Miaohe Lin Cc: Dan Hill Cc: Dan Streetman Cc: Dongdong Tao Cc: Gavin Guo Cc: Gerald Yang Cc: Heitor Alves de Siqueira Cc: Ioanna Alifieraki Cc: Jay Vosburgh Cc: Matthew Ruffell Cc: Ponnuvel Palaniyappan Cc: Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/mm/rmap.c b/mm/rmap.c index bfcc8e3d412f7..5cb970d51f0a2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1588,7 +1588,30 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, /* MADV_FREE page check */ if (!folio_test_swapbacked(folio)) { - if (!folio_test_dirty(folio)) { + int ref_count, map_count; + + /* + * Synchronize with gup_pte_range(): + * - clear PTE; barrier; read refcount + * - inc refcount; barrier; read PTE + */ + smp_mb(); + + ref_count = folio_ref_count(folio); + map_count = folio_mapcount(folio); + + /* + * Order reads for page refcount and dirty flag + * (see comments in __remove_mapping()). + */ + smp_rmb(); + + /* + * The only page refs must be one from isolation + * plus the rmap(s) (dropped by discard:). + */ + if (ref_count == 1 + map_count && + !folio_test_dirty(folio)) { /* Invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); From 9457056ac426e5ed0671356509c8dcce69f8dee0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 24 Mar 2022 18:14:12 -0700 Subject: [PATCH 112/113] mm: madvise: MADV_DONTNEED_LOCKED MADV_DONTNEED historically rejects mlocked ranges, but with MLOCK_ONFAULT and MCL_ONFAULT allowing to mlock without populating, there are valid use cases for depopulating locked ranges as well. Users mlock memory to protect secrets. There are allocators for secure buffers that want in-use memory generally mlocked, but cleared and invalidated memory to give up the physical pages. This could be done with explicit munlock -> mlock calls on free -> alloc of course, but that adds two unnecessary syscalls, heavy mmap_sem write locks, vma splits and re-merges - only to get rid of the backing pages. Users also mlockall(MCL_ONFAULT) to suppress sustained paging, but are okay with on-demand initial population. It seems valid to selectively free some memory during the lifetime of such a process, without having to mess with its overall policy. Why add a separate flag? Isn't this a pretty niche usecase? - MADV_DONTNEED has been bailing on locked vmas forever. 
It's at least conceivable that someone, somewhere is relying on mlock to protect data from perhaps broader invalidation calls. Changing this behavior now could lead to quiet data corruption. - It also clarifies expectations around MADV_FREE and maybe MADV_REMOVE. It avoids the situation where one quietly behaves different than the others. MADV_FREE_LOCKED can be added later. - The combination of mlock() and madvise() in the first place is probably niche. But where it happens, I'd say that dropping pages from a locked region once they don't contain secrets or won't page anymore is much saner than relying on mlock to protect memory from speculative or errant invalidation calls. It's just that we can't change the default behavior because of the two previous points. Given that, an explicit new flag seems to make the most sense. [hannes@cmpxchg.org: fix mips build] Link: https://lkml.kernel.org/r/20220304171912.305060-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Reviewed-by: Mike Kravetz Reviewed-by: Shakeel Butt Acked-by: Vlastimil Babka Cc: Nadav Amit Cc: David Hildenbrand Cc: Dr. David Alan Gilbert Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/uapi/asm/mman.h | 2 ++ arch/mips/include/uapi/asm/mman.h | 2 ++ arch/parisc/include/uapi/asm/mman.h | 2 ++ arch/xtensa/include/uapi/asm/mman.h | 2 ++ include/uapi/asm-generic/mman-common.h | 2 ++ mm/madvise.c | 24 ++++++++++++++---------- 6 files changed, 24 insertions(+), 10 deletions(-) diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 56b4ee5a6c9e5..4aa996423b0d1 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -74,6 +74,8 @@ #define MADV_POPULATE_READ 22 /* populate (prefault) page tables readable */ #define MADV_POPULATE_WRITE 23 /* populate (prefault) page tables writable */ +#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 40b210c65a5af..1be428663c102 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -101,6 +101,8 @@ #define MADV_POPULATE_READ 22 /* populate (prefault) page tables readable */ #define MADV_POPULATE_WRITE 23 /* populate (prefault) page tables writable */ +#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 9e3c010c0f61e..a7ea3204a5faa 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -55,6 +55,8 @@ #define MADV_POPULATE_READ 22 /* populate (prefault) page tables readable */ #define MADV_POPULATE_WRITE 23 /* populate (prefault) page tables writable */ +#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */ + #define MADV_MERGEABLE 65 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index b3a22095371b2..7966a58af472a 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -109,6 +109,8 @@ #define MADV_POPULATE_READ 22 /* populate (prefault) page tables readable */ #define MADV_POPULATE_WRITE 23 /* populate (prefault) page tables writable */ +#define MADV_DONTNEED_LOCKED 24 /* 
like DONTNEED, but drop locked pages too */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 1567a3294c3db..6c1aa92a92e44 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -75,6 +75,8 @@ #define MADV_POPULATE_READ 22 /* populate (prefault) page tables readable */ #define MADV_POPULATE_WRITE 23 /* populate (prefault) page tables writable */ +#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/mm/madvise.c b/mm/madvise.c index 2e6213664c523..b41858ee937bb 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -52,6 +52,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: case MADV_COLD: case MADV_PAGEOUT: case MADV_FREE: @@ -502,14 +503,9 @@ static void madvise_cold_page_range(struct mmu_gather *tlb, tlb_end_vma(tlb, vma); } -static inline bool can_madv_lru_non_huge_vma(struct vm_area_struct *vma) -{ - return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP)); -} - static inline bool can_madv_lru_vma(struct vm_area_struct *vma) { - return can_madv_lru_non_huge_vma(vma) && !is_vm_hugetlb_page(vma); + return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB)); } static long madvise_cold(struct vm_area_struct *vma, @@ -787,10 +783,16 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, unsigned long *end, int behavior) { - if (!is_vm_hugetlb_page(vma)) - return can_madv_lru_non_huge_vma(vma); + if (!is_vm_hugetlb_page(vma)) { + unsigned int forbidden = VM_PFNMAP; + + if (behavior != MADV_DONTNEED_LOCKED) + forbidden |= VM_LOCKED; + + return !(vma->vm_flags & forbidden); + } - if (behavior != MADV_DONTNEED) + if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED) return false; if (start & ~huge_page_mask(hstate_vma(vma))) return false; @@ -854,7 +856,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, VM_WARN_ON(start >= end); } - if (behavior == MADV_DONTNEED) + if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) return madvise_dontneed_single_vma(vma, start, end); else if (behavior == MADV_FREE) return madvise_free_single_vma(vma, start, end); @@ -993,6 +995,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, return madvise_pageout(vma, prev, start, end); case MADV_FREE: case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: return madvise_dontneed_free(vma, prev, start, end, behavior); case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: @@ -1123,6 +1126,7 @@ madvise_behavior_valid(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: case MADV_FREE: case MADV_COLD: case MADV_PAGEOUT: From 25fd2d41b505d0640bdfe67aa77c549de2d3c18a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 24 Mar 2022 18:14:18 -0700 Subject: [PATCH 113/113] selftests: kselftest framework: provide "finished" helper Instead of having each time that wants to use ksft_exit() have to figure out the internals of kselftest.h, add the helper ksft_finished() that makes sure the passes, xfails, and skips are equal to the test plan count. 
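A minimal sketch of how a test might use the new helper, assuming it is built inside the kernel tree so that kselftest.h is on the include path (the test names are made up):

  #include "kselftest.h"

  int main(void)
  {
          ksft_print_header();
          ksft_set_plan(2);               /* promise exactly two results */

          ksft_test_result_pass("feature A works\n");
          ksft_test_result_skip("feature B not available here\n");

          /* Exits successfully only if passes + xfails + skips match the plan,
           * so a test that silently drops a result no longer reports success. */
          ksft_finished();
  }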
Link: https://lkml.kernel.org/r/20220201013717.2464392-1-keescook@chromium.org Signed-off-by: Kees Cook Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/kselftest.h | 10 ++++++++++ tools/testing/selftests/vm/memfd_secret.c | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index f1180987492c9..b8f248018174d 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -28,6 +28,7 @@ * * When all tests are finished, clean up and exit the program with one of: * + * ksft_finished(); * ksft_exit(condition); * ksft_exit_pass(); * ksft_exit_fail(); @@ -235,6 +236,15 @@ static inline int ksft_exit_fail(void) ksft_exit_fail(); \ } while (0) +/** + * ksft_finished() - Exit selftest with success if all tests passed + */ +#define ksft_finished() \ + ksft_exit(ksft_plan == \ + ksft_cnt.ksft_pass + \ + ksft_cnt.ksft_xfail + \ + ksft_cnt.ksft_xskip) + static inline int ksft_exit_fail_msg(const char *msg, ...) { int saved_errno = errno; diff --git a/tools/testing/selftests/vm/memfd_secret.c b/tools/testing/selftests/vm/memfd_secret.c index 93e7e7ffed337..957b9e18c7295 100644 --- a/tools/testing/selftests/vm/memfd_secret.c +++ b/tools/testing/selftests/vm/memfd_secret.c @@ -282,7 +282,7 @@ int main(int argc, char *argv[]) close(fd); - ksft_exit(!ksft_get_fail_cnt()); + ksft_finished(); } #else /* __NR_memfd_secret */
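Tying this back to the MADV_DONTNEED_LOCKED addition earlier in this series: the intended pattern is to depopulate an mlocked-on-fault region without the munlock/madvise/mlock dance. The following is a hedged sketch only -- it assumes a kernel carrying that patch and a glibc with mlock2()/MLOCK_ONFAULT, and it defines the MADV_DONTNEED_LOCKED value from the patch as a fallback for older headers:

  #define _GNU_SOURCE
  #include <stdio.h>
  #include <string.h>
  #include <sys/mman.h>

  #ifndef MADV_DONTNEED_LOCKED
  #define MADV_DONTNEED_LOCKED 24         /* value introduced by this series */
  #endif

  int main(void)
  {
          size_t len = 2 * 4096;
          char *secret = mmap(NULL, len, PROT_READ | PROT_WRITE,
                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

          if (secret == MAP_FAILED) {
                  perror("mmap");
                  return 1;
          }
          /* Lock without populating: pages get pinned as they are faulted in. */
          if (mlock2(secret, len, MLOCK_ONFAULT)) {
                  perror("mlock2(MLOCK_ONFAULT)");
                  return 1;
          }
          memset(secret, 0xaa, len);      /* fault in and fill with "secrets" */

          /* Drop the physical pages while keeping VM_LOCKED; plain MADV_DONTNEED
           * would fail with EINVAL here because the vma is locked. */
          if (madvise(secret, len, MADV_DONTNEED_LOCKED))
                  perror("madvise(MADV_DONTNEED_LOCKED)");  /* needs the patched kernel */
          else
                  printf("depopulated %zu locked bytes; first byte now 0x%x\n",
                         len, secret[0]);
          return 0;
  }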