From 5d927eb0101eb791fb2d4f72b49a2da5faf01941 Mon Sep 17 00:00:00 2001
From: Harald Welte
Date: Wed, 22 Jun 2005 12:37:50 -0700
Subject: [PATCH 001/199] [NETFILTER]: Fix handling of ICMP packets (RELATED)
 in ipt_CLUSTERIP target.

Signed-off-by: Harald Welte
Signed-off-by: David S. Miller
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index dc4362b57cfa..9cde8c61f525 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -339,7 +339,7 @@ target(struct sk_buff **pskb,
 	 * error messages (RELATED) and information requests (see below) */
 	if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
 	    && (ctinfo == IP_CT_RELATED
-		|| ctinfo == IP_CT_IS_REPLY+IP_CT_IS_REPLY))
+		|| ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY))
 		return IPT_CONTINUE;
 
 	/* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,

From dd7f0b80926befc8c70a873b5b0c0c7b5fd1e7b9 Mon Sep 17 00:00:00 2001
From: Harald Welte
Date: Wed, 22 Jun 2005 12:38:33 -0700
Subject: [PATCH 002/199] [NETFILTER]: Fix "iptables -D" rule deletion with
 ipt_CLUSTERIP target.

The patch just changes the order of structure members.

Signed-off-by: Harald Welte
Signed-off-by: David S. Miller
---
 include/linux/netfilter_ipv4/ipt_CLUSTERIP.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h b/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h
index baa83e757156..d9bceedfb3dc 100644
--- a/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h
+++ b/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h
@@ -18,7 +18,6 @@ struct clusterip_config;
 struct ipt_clusterip_tgt_info {
 
 	u_int32_t flags;
-	struct clusterip_config *config;
 
 	/* only relevant for new ones */
 	u_int8_t clustermac[6];
@@ -27,6 +26,8 @@ struct ipt_clusterip_tgt_info {
 	u_int16_t local_nodes[CLUSTERIP_MAX_NODES];
 	enum clusterip_hashmode hash_mode;
 	u_int32_t hash_initval;
+
+	struct clusterip_config *config;
 };
 
 #endif /*_IPT_CLUSTERIP_H_target*/

From ef2736fc741316913a457abd3731053071c86241 Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Wed, 22 Jun 2005 13:26:03 -0700
Subject: [PATCH 003/199] [CRYPTO]: White space and coding style clean up in
 tcrypt

Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller
---
 crypto/tcrypt.c | 333 ++++++++++++++++++++++++++++----------------------
 crypto/tcrypt.h | 357 +++++++++++++++++++++++++--------------------------
 2 files changed, 336 insertions(+), 354 deletions(-)

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 92b0352c8e92..5a95b4a14c2b 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -1,4 +1,4 @@
-/* 
+/*
  * Quick & dirty crypto testing module.
  *
  * This will only exist until we have a better testing mechanism
@@ -9,7 +9,7 @@
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option) 
+ * Software Foundation; either version 2 of the License, or (at your option)
  * any later version.
* * 14 - 09 - 2003 @@ -61,13 +61,12 @@ static char *tvmem; static char *check[] = { "des", "md5", "des3_ede", "rot13", "sha1", "sha256", "blowfish", - "twofish", "serpent", "sha384", "sha512", "md4", "aes", "cast6", - "arc4", "michael_mic", "deflate", "crc32c", "tea", "xtea", + "twofish", "serpent", "sha384", "sha512", "md4", "aes", "cast6", + "arc4", "michael_mic", "deflate", "crc32c", "tea", "xtea", "khazad", "wp512", "wp384", "wp256", "tnepres", NULL }; -static void -hexdump(unsigned char *buf, unsigned int len) +static void hexdump(unsigned char *buf, unsigned int len) { while (len--) printk("%02x", *buf++); @@ -75,29 +74,29 @@ hexdump(unsigned char *buf, unsigned int len) printk("\n"); } -static void -test_hash (char * algo, struct hash_testvec * template, unsigned int tcount) +static void test_hash(char *algo, struct hash_testvec *template, + unsigned int tcount) { - char *p; - unsigned int i, j, k, temp; - struct scatterlist sg[8]; - char result[64]; - struct crypto_tfm *tfm; - struct hash_testvec *hash_tv; - unsigned int tsize; - - printk("\ntesting %s\n", algo); - - tsize = sizeof (struct hash_testvec); + char *p; + unsigned int i, j, k, temp; + struct scatterlist sg[8]; + char result[64]; + struct crypto_tfm *tfm; + struct hash_testvec *hash_tv; + unsigned int tsize; + + printk("\ntesting %s\n", algo); + + tsize = sizeof(struct hash_testvec); tsize *= tcount; - + if (tsize > TVMEMSIZE) { printk("template (%u) too big for tvmem (%u)\n", tsize, TVMEMSIZE); return; } memcpy(tvmem, template, tsize); - hash_tv = (void *) tvmem; + hash_tv = (void *)tvmem; tfm = crypto_alloc_tfm(algo, 0); if (tfm == NULL) { printk("failed to load transform for %s\n", algo); @@ -105,70 +104,71 @@ test_hash (char * algo, struct hash_testvec * template, unsigned int tcount) } for (i = 0; i < tcount; i++) { - printk ("test %u:\n", i + 1); - memset (result, 0, 64); + printk("test %u:\n", i + 1); + memset(result, 0, 64); p = hash_tv[i].plaintext; - sg[0].page = virt_to_page (p); - sg[0].offset = offset_in_page (p); + sg[0].page = virt_to_page(p); + sg[0].offset = offset_in_page(p); sg[0].length = hash_tv[i].psize; - crypto_digest_init (tfm); + crypto_digest_init(tfm); if (tfm->crt_u.digest.dit_setkey) { - crypto_digest_setkey (tfm, hash_tv[i].key, - hash_tv[i].ksize); + crypto_digest_setkey(tfm, hash_tv[i].key, + hash_tv[i].ksize); } - crypto_digest_update (tfm, sg, 1); - crypto_digest_final (tfm, result); + crypto_digest_update(tfm, sg, 1); + crypto_digest_final(tfm, result); - hexdump (result, crypto_tfm_alg_digestsize (tfm)); + hexdump(result, crypto_tfm_alg_digestsize(tfm)); printk("%s\n", - memcmp(result, hash_tv[i].digest, - crypto_tfm_alg_digestsize(tfm)) ? "fail" : - "pass"); + memcmp(result, hash_tv[i].digest, + crypto_tfm_alg_digestsize(tfm)) ? 
+ "fail" : "pass"); } - printk ("testing %s across pages\n", algo); + printk("testing %s across pages\n", algo); /* setup the dummy buffer first */ - memset(xbuf, 0, XBUFSIZE); + memset(xbuf, 0, XBUFSIZE); j = 0; for (i = 0; i < tcount; i++) { if (hash_tv[i].np) { j++; - printk ("test %u:\n", j); - memset (result, 0, 64); + printk("test %u:\n", j); + memset(result, 0, 64); temp = 0; for (k = 0; k < hash_tv[i].np; k++) { - memcpy (&xbuf[IDX[k]], hash_tv[i].plaintext + temp, - hash_tv[i].tap[k]); + memcpy(&xbuf[IDX[k]], + hash_tv[i].plaintext + temp, + hash_tv[i].tap[k]); temp += hash_tv[i].tap[k]; p = &xbuf[IDX[k]]; - sg[k].page = virt_to_page (p); - sg[k].offset = offset_in_page (p); + sg[k].page = virt_to_page(p); + sg[k].offset = offset_in_page(p); sg[k].length = hash_tv[i].tap[k]; } - crypto_digest_digest (tfm, sg, hash_tv[i].np, result); - - hexdump (result, crypto_tfm_alg_digestsize (tfm)); + crypto_digest_digest(tfm, sg, hash_tv[i].np, result); + + hexdump(result, crypto_tfm_alg_digestsize(tfm)); printk("%s\n", - memcmp(result, hash_tv[i].digest, - crypto_tfm_alg_digestsize(tfm)) ? "fail" : - "pass"); + memcmp(result, hash_tv[i].digest, + crypto_tfm_alg_digestsize(tfm)) ? + "fail" : "pass"); } } - - crypto_free_tfm (tfm); + + crypto_free_tfm(tfm); } #ifdef CONFIG_CRYPTO_HMAC -static void -test_hmac(char *algo, struct hmac_testvec * template, unsigned int tcount) +static void test_hmac(char *algo, struct hmac_testvec *template, + unsigned int tcount) { char *p; unsigned int i, j, k, temp; @@ -185,8 +185,8 @@ test_hmac(char *algo, struct hmac_testvec * template, unsigned int tcount) } printk("\ntesting hmac_%s\n", algo); - - tsize = sizeof (struct hmac_testvec); + + tsize = sizeof(struct hmac_testvec); tsize *= tcount; if (tsize > TVMEMSIZE) { printk("template (%u) too big for tvmem (%u)\n", tsize, @@ -195,7 +195,7 @@ test_hmac(char *algo, struct hmac_testvec * template, unsigned int tcount) } memcpy(tvmem, template, tsize); - hmac_tv = (void *) tvmem; + hmac_tv = (void *)tvmem; for (i = 0; i < tcount; i++) { printk("test %u:\n", i + 1); @@ -219,34 +219,35 @@ test_hmac(char *algo, struct hmac_testvec * template, unsigned int tcount) printk("\ntesting hmac_%s across pages\n", algo); memset(xbuf, 0, XBUFSIZE); - + j = 0; for (i = 0; i < tcount; i++) { if (hmac_tv[i].np) { j++; - printk ("test %u:\n",j); - memset (result, 0, 64); + printk("test %u:\n",j); + memset(result, 0, 64); temp = 0; klen = hmac_tv[i].ksize; for (k = 0; k < hmac_tv[i].np; k++) { - memcpy (&xbuf[IDX[k]], hmac_tv[i].plaintext + temp, - hmac_tv[i].tap[k]); + memcpy(&xbuf[IDX[k]], + hmac_tv[i].plaintext + temp, + hmac_tv[i].tap[k]); temp += hmac_tv[i].tap[k]; p = &xbuf[IDX[k]]; - sg[k].page = virt_to_page (p); - sg[k].offset = offset_in_page (p); + sg[k].page = virt_to_page(p); + sg[k].offset = offset_in_page(p); sg[k].length = hmac_tv[i].tap[k]; } - crypto_hmac(tfm, hmac_tv[i].key, &klen, sg, hmac_tv[i].np, - result); + crypto_hmac(tfm, hmac_tv[i].key, &klen, sg, + hmac_tv[i].np, result); hexdump(result, crypto_tfm_alg_digestsize(tfm)); - + printk("%s\n", - memcmp(result, hmac_tv[i].digest, - crypto_tfm_alg_digestsize(tfm)) ? "fail" : - "pass"); + memcmp(result, hmac_tv[i].digest, + crypto_tfm_alg_digestsize(tfm)) ? 
+ "fail" : "pass"); } } out: @@ -255,8 +256,8 @@ test_hmac(char *algo, struct hmac_testvec * template, unsigned int tcount) #endif /* CONFIG_CRYPTO_HMAC */ -static void -test_cipher(char * algo, int mode, int enc, struct cipher_testvec * template, unsigned int tcount) +static void test_cipher(char *algo, int mode, int enc, + struct cipher_testvec *template, unsigned int tcount) { unsigned int ret, i, j, k, temp; unsigned int tsize; @@ -270,17 +271,17 @@ test_cipher(char * algo, int mode, int enc, struct cipher_testvec * template, un if (enc == ENCRYPT) strncpy(e, "encryption", 11); else - strncpy(e, "decryption", 11); + strncpy(e, "decryption", 11); if (mode == MODE_ECB) - strncpy(m, "ECB", 4); + strncpy(m, "ECB", 4); else - strncpy(m, "CBC", 4); + strncpy(m, "CBC", 4); - printk("\ntesting %s %s %s \n", algo, m, e); + printk("\ntesting %s %s %s\n", algo, m, e); - tsize = sizeof (struct cipher_testvec); + tsize = sizeof (struct cipher_testvec); tsize *= tcount; - + if (tsize > TVMEMSIZE) { printk("template (%u) too big for tvmem (%u)\n", tsize, TVMEMSIZE); @@ -288,112 +289,113 @@ test_cipher(char * algo, int mode, int enc, struct cipher_testvec * template, un } memcpy(tvmem, template, tsize); - cipher_tv = (void *) tvmem; + cipher_tv = (void *)tvmem; + + if (mode) + tfm = crypto_alloc_tfm(algo, 0); + else + tfm = crypto_alloc_tfm(algo, CRYPTO_TFM_MODE_CBC); - if (mode) - tfm = crypto_alloc_tfm (algo, 0); - else - tfm = crypto_alloc_tfm (algo, CRYPTO_TFM_MODE_CBC); - if (tfm == NULL) { printk("failed to load transform for %s %s\n", algo, m); return; } - + j = 0; for (i = 0; i < tcount; i++) { if (!(cipher_tv[i].np)) { - j++; + j++; printk("test %u (%d bit key):\n", j, cipher_tv[i].klen * 8); tfm->crt_flags = 0; - if (cipher_tv[i].wk) + if (cipher_tv[i].wk) tfm->crt_flags |= CRYPTO_TFM_REQ_WEAK_KEY; key = cipher_tv[i].key; - + ret = crypto_cipher_setkey(tfm, key, cipher_tv[i].klen); if (ret) { printk("setkey() failed flags=%x\n", tfm->crt_flags); - + if (!cipher_tv[i].fail) goto out; - } + } p = cipher_tv[i].input; sg[0].page = virt_to_page(p); sg[0].offset = offset_in_page(p); sg[0].length = cipher_tv[i].ilen; - + if (!mode) { crypto_cipher_set_iv(tfm, cipher_tv[i].iv, - crypto_tfm_alg_ivsize (tfm)); + crypto_tfm_alg_ivsize(tfm)); } - + if (enc) ret = crypto_cipher_encrypt(tfm, sg, sg, cipher_tv[i].ilen); else ret = crypto_cipher_decrypt(tfm, sg, sg, cipher_tv[i].ilen); - - + + if (ret) { printk("%s () failed flags=%x\n", e, tfm->crt_flags); goto out; - } - + } + q = kmap(sg[0].page) + sg[0].offset; hexdump(q, cipher_tv[i].rlen); - - printk("%s\n", - memcmp(q, cipher_tv[i].result, cipher_tv[i].rlen) ? "fail" : - "pass"); + + printk("%s\n", + memcmp(q, cipher_tv[i].result, + cipher_tv[i].rlen) ? 
"fail" : "pass"); } } - - printk("\ntesting %s %s %s across pages (chunking) \n", algo, m, e); + + printk("\ntesting %s %s %s across pages (chunking)\n", algo, m, e); memset(xbuf, 0, XBUFSIZE); - + j = 0; for (i = 0; i < tcount; i++) { if (cipher_tv[i].np) { - j++; + j++; printk("test %u (%d bit key):\n", j, cipher_tv[i].klen * 8); - tfm->crt_flags = 0; - if (cipher_tv[i].wk) + tfm->crt_flags = 0; + if (cipher_tv[i].wk) tfm->crt_flags |= CRYPTO_TFM_REQ_WEAK_KEY; key = cipher_tv[i].key; - - ret = crypto_cipher_setkey(tfm, key, cipher_tv[i].klen); + + ret = crypto_cipher_setkey(tfm, key, cipher_tv[i].klen); if (ret) { printk("setkey() failed flags=%x\n", tfm->crt_flags); - + if (!cipher_tv[i].fail) goto out; } temp = 0; for (k = 0; k < cipher_tv[i].np; k++) { - memcpy (&xbuf[IDX[k]], cipher_tv[i].input + temp, - cipher_tv[i].tap[k]); + memcpy(&xbuf[IDX[k]], + cipher_tv[i].input + temp, + cipher_tv[i].tap[k]); temp += cipher_tv[i].tap[k]; p = &xbuf[IDX[k]]; - sg[k].page = virt_to_page (p); - sg[k].offset = offset_in_page (p); + sg[k].page = virt_to_page(p); + sg[k].offset = offset_in_page(p); sg[k].length = cipher_tv[i].tap[k]; } - + if (!mode) { crypto_cipher_set_iv(tfm, cipher_tv[i].iv, - crypto_tfm_alg_ivsize (tfm)); + crypto_tfm_alg_ivsize(tfm)); } - + if (enc) ret = crypto_cipher_encrypt(tfm, sg, sg, cipher_tv[i].ilen); else ret = crypto_cipher_decrypt(tfm, sg, sg, cipher_tv[i].ilen); - + if (ret) { printk("%s () failed flags=%x\n", e, tfm->crt_flags); goto out; @@ -404,9 +406,9 @@ test_cipher(char * algo, int mode, int enc, struct cipher_testvec * template, un printk("page %u\n", k); q = kmap(sg[k].page) + sg[k].offset; hexdump(q, cipher_tv[i].tap[k]); - printk("%s\n", - memcmp(q, cipher_tv[i].result + temp, - cipher_tv[i].tap[k]) ? "fail" : + printk("%s\n", + memcmp(q, cipher_tv[i].result + temp, + cipher_tv[i].tap[k]) ? "fail" : "pass"); temp += cipher_tv[i].tap[k]; } @@ -417,8 +419,7 @@ test_cipher(char * algo, int mode, int enc, struct cipher_testvec * template, un crypto_free_tfm(tfm); } -static void -test_deflate(void) +static void test_deflate(void) { unsigned int i; char result[COMP_BUF_SIZE]; @@ -436,7 +437,7 @@ test_deflate(void) } memcpy(tvmem, deflate_comp_tv_template, tsize); - tv = (void *) tvmem; + tv = (void *)tvmem; tfm = crypto_alloc_tfm("deflate", 0); if (tfm == NULL) { @@ -446,7 +447,7 @@ test_deflate(void) for (i = 0; i < DEFLATE_COMP_TEST_VECTORS; i++) { int ilen, ret, dlen = COMP_BUF_SIZE; - + printk("test %u:\n", i + 1); memset(result, 0, sizeof (result)); @@ -473,11 +474,11 @@ test_deflate(void) } memcpy(tvmem, deflate_decomp_tv_template, tsize); - tv = (void *) tvmem; + tv = (void *)tvmem; for (i = 0; i < DEFLATE_DECOMP_TEST_VECTORS; i++) { int ilen, ret, dlen = COMP_BUF_SIZE; - + printk("test %u:\n", i + 1); memset(result, 0, sizeof (result)); @@ -497,8 +498,7 @@ test_deflate(void) crypto_free_tfm(tfm); } -static void -test_crc32c(void) +static void test_crc32c(void) { #define NUMVEC 6 #define VECSIZE 40 @@ -511,7 +511,7 @@ test_crc32c(void) 0xd579c862, 0xba979ad0, 0x2b29d913 }; static u32 tot_vec_results = 0x24c5d375; - + struct scatterlist sg[NUMVEC]; struct crypto_tfm *tfm; char *fmtdata = "testing crc32c initialized to %08x: %s\n"; @@ -525,18 +525,18 @@ test_crc32c(void) printk("failed to load transform for crc32c\n"); return; } - + crypto_digest_init(tfm); crypto_digest_final(tfm, (u8*)&crc); printk(fmtdata, crc, (crc == 0) ? "pass" : "ERROR"); - + /* * stuff test_vec with known values, simple incrementing * byte values. 
*/ b = 0; for (i = 0; i < NUMVEC; i++) { - for (j = 0; j < VECSIZE; j++) + for (j = 0; j < VECSIZE; j++) test_vec[i][j] = ++b; sg[i].page = virt_to_page(test_vec[i]); sg[i].offset = offset_in_page(test_vec[i]); @@ -548,11 +548,11 @@ test_crc32c(void) crypto_digest_final(tfm, (u8*)&crc); printk("testing crc32c setkey returns %08x : %s\n", crc, (crc == (SEEDTESTVAL ^ ~(u32)0)) ? "pass" : "ERROR"); - + printk("testing crc32c using update/final:\n"); pass = 1; /* assume all is well */ - + for (i = 0; i < NUMVEC; i++) { seed = ~(u32)0; (void)crypto_digest_setkey(tfm, (const u8*)&seed, sizeof(u32)); @@ -591,66 +591,64 @@ test_crc32c(void) printk(" %08x:BAD, wanted %08x\n", crc, tot_vec_results); pass = 0; } - + printk("\n%s\n", pass ? "pass" : "ERROR"); crypto_free_tfm(tfm); printk("crc32c test complete\n"); } -static void -test_available(void) +static void test_available(void) { char **name = check; - + while (*name) { printk("alg %s ", *name); printk((crypto_alg_available(*name, 0)) ? "found\n" : "not found\n"); name++; - } + } } -static void -do_test(void) +static void do_test(void) { switch (mode) { case 0: test_hash("md5", md5_tv_template, MD5_TEST_VECTORS); - + test_hash("sha1", sha1_tv_template, SHA1_TEST_VECTORS); - + //DES test_cipher ("des", MODE_ECB, ENCRYPT, des_enc_tv_template, DES_ENC_TEST_VECTORS); - test_cipher ("des", MODE_ECB, DECRYPT, des_dec_tv_template, DES_DEC_TEST_VECTORS); - test_cipher ("des", MODE_CBC, ENCRYPT, des_cbc_enc_tv_template, DES_CBC_ENC_TEST_VECTORS); - test_cipher ("des", MODE_CBC, DECRYPT, des_cbc_dec_tv_template, DES_CBC_DEC_TEST_VECTORS); - + test_cipher ("des", MODE_ECB, DECRYPT, des_dec_tv_template, DES_DEC_TEST_VECTORS); + test_cipher ("des", MODE_CBC, ENCRYPT, des_cbc_enc_tv_template, DES_CBC_ENC_TEST_VECTORS); + test_cipher ("des", MODE_CBC, DECRYPT, des_cbc_dec_tv_template, DES_CBC_DEC_TEST_VECTORS); + //DES3_EDE test_cipher ("des3_ede", MODE_ECB, ENCRYPT, des3_ede_enc_tv_template, DES3_EDE_ENC_TEST_VECTORS); - test_cipher ("des3_ede", MODE_ECB, DECRYPT, des3_ede_dec_tv_template, DES3_EDE_DEC_TEST_VECTORS); - + test_cipher ("des3_ede", MODE_ECB, DECRYPT, des3_ede_dec_tv_template, DES3_EDE_DEC_TEST_VECTORS); + test_hash("md4", md4_tv_template, MD4_TEST_VECTORS); - + test_hash("sha256", sha256_tv_template, SHA256_TEST_VECTORS); - + //BLOWFISH test_cipher ("blowfish", MODE_ECB, ENCRYPT, bf_enc_tv_template, BF_ENC_TEST_VECTORS); test_cipher ("blowfish", MODE_ECB, DECRYPT, bf_dec_tv_template, BF_DEC_TEST_VECTORS); test_cipher ("blowfish", MODE_CBC, ENCRYPT, bf_cbc_enc_tv_template, BF_CBC_ENC_TEST_VECTORS); test_cipher ("blowfish", MODE_CBC, DECRYPT, bf_cbc_dec_tv_template, BF_CBC_DEC_TEST_VECTORS); - + //TWOFISH test_cipher ("twofish", MODE_ECB, ENCRYPT, tf_enc_tv_template, TF_ENC_TEST_VECTORS); test_cipher ("twofish", MODE_ECB, DECRYPT, tf_dec_tv_template, TF_DEC_TEST_VECTORS); test_cipher ("twofish", MODE_CBC, ENCRYPT, tf_cbc_enc_tv_template, TF_CBC_ENC_TEST_VECTORS); test_cipher ("twofish", MODE_CBC, DECRYPT, tf_cbc_dec_tv_template, TF_CBC_DEC_TEST_VECTORS); - + //SERPENT test_cipher ("serpent", MODE_ECB, ENCRYPT, serpent_enc_tv_template, SERPENT_ENC_TEST_VECTORS); test_cipher ("serpent", MODE_ECB, DECRYPT, serpent_dec_tv_template, SERPENT_DEC_TEST_VECTORS); - + //TNEPRES test_cipher ("tnepres", MODE_ECB, ENCRYPT, tnepres_enc_tv_template, TNEPRES_ENC_TEST_VECTORS); test_cipher ("tnepres", MODE_ECB, DECRYPT, tnepres_dec_tv_template, TNEPRES_DEC_TEST_VECTORS); @@ -662,7 +660,7 @@ do_test(void) //CAST5 test_cipher ("cast5", MODE_ECB, ENCRYPT, 
cast5_enc_tv_template, CAST5_ENC_TEST_VECTORS); test_cipher ("cast5", MODE_ECB, DECRYPT, cast5_dec_tv_template, CAST5_DEC_TEST_VECTORS); - + //CAST6 test_cipher ("cast6", MODE_ECB, ENCRYPT, cast6_enc_tv_template, CAST6_ENC_TEST_VECTORS); test_cipher ("cast6", MODE_ECB, DECRYPT, cast6_dec_tv_template, CAST6_DEC_TEST_VECTORS); @@ -702,9 +700,9 @@ do_test(void) test_crc32c(); #ifdef CONFIG_CRYPTO_HMAC test_hmac("md5", hmac_md5_tv_template, HMAC_MD5_TEST_VECTORS); - test_hmac("sha1", hmac_sha1_tv_template, HMAC_SHA1_TEST_VECTORS); + test_hmac("sha1", hmac_sha1_tv_template, HMAC_SHA1_TEST_VECTORS); test_hmac("sha256", hmac_sha256_tv_template, HMAC_SHA256_TEST_VECTORS); -#endif +#endif test_hash("michael_mic", michael_mic_tv_template, MICHAEL_MIC_TEST_VECTORS); break; @@ -726,17 +724,17 @@ do_test(void) case 4: test_cipher ("des3_ede", MODE_ECB, ENCRYPT, des3_ede_enc_tv_template, DES3_EDE_ENC_TEST_VECTORS); - test_cipher ("des3_ede", MODE_ECB, DECRYPT, des3_ede_dec_tv_template, DES3_EDE_DEC_TEST_VECTORS); + test_cipher ("des3_ede", MODE_ECB, DECRYPT, des3_ede_dec_tv_template, DES3_EDE_DEC_TEST_VECTORS); break; case 5: test_hash("md4", md4_tv_template, MD4_TEST_VECTORS); break; - + case 6: test_hash("sha256", sha256_tv_template, SHA256_TEST_VECTORS); break; - + case 7: test_cipher ("blowfish", MODE_ECB, ENCRYPT, bf_enc_tv_template, BF_ENC_TEST_VECTORS); test_cipher ("blowfish", MODE_ECB, DECRYPT, bf_dec_tv_template, BF_DEC_TEST_VECTORS); @@ -750,7 +748,7 @@ do_test(void) test_cipher ("twofish", MODE_CBC, ENCRYPT, tf_cbc_enc_tv_template, TF_CBC_ENC_TEST_VECTORS); test_cipher ("twofish", MODE_CBC, DECRYPT, tf_cbc_dec_tv_template, TF_CBC_DEC_TEST_VECTORS); break; - + case 9: test_cipher ("serpent", MODE_ECB, ENCRYPT, serpent_enc_tv_template, SERPENT_ENC_TEST_VECTORS); test_cipher ("serpent", MODE_ECB, DECRYPT, serpent_dec_tv_template, SERPENT_DEC_TEST_VECTORS); @@ -758,13 +756,13 @@ do_test(void) case 10: test_cipher ("aes", MODE_ECB, ENCRYPT, aes_enc_tv_template, AES_ENC_TEST_VECTORS); - test_cipher ("aes", MODE_ECB, DECRYPT, aes_dec_tv_template, AES_DEC_TEST_VECTORS); + test_cipher ("aes", MODE_ECB, DECRYPT, aes_dec_tv_template, AES_DEC_TEST_VECTORS); break; case 11: test_hash("sha384", sha384_tv_template, SHA384_TEST_VECTORS); break; - + case 12: test_hash("sha512", sha512_tv_template, SHA512_TEST_VECTORS); break; @@ -852,11 +850,11 @@ do_test(void) case 100: test_hmac("md5", hmac_md5_tv_template, HMAC_MD5_TEST_VECTORS); break; - + case 101: - test_hmac("sha1", hmac_sha1_tv_template, HMAC_SHA1_TEST_VECTORS); + test_hmac("sha1", hmac_sha1_tv_template, HMAC_SHA1_TEST_VECTORS); break; - + case 102: test_hmac("sha256", hmac_sha256_tv_template, HMAC_SHA256_TEST_VECTORS); break; @@ -866,7 +864,7 @@ do_test(void) case 1000: test_available(); break; - + default: /* useful for debugging */ printk("not testing anything\n"); @@ -874,8 +872,7 @@ do_test(void) } } -static int __init -init(void) +static int __init init(void) { tvmem = kmalloc(TVMEMSIZE, GFP_KERNEL); if (tvmem == NULL) diff --git a/crypto/tcrypt.h b/crypto/tcrypt.h index a3097afae593..72d40704042f 100644 --- a/crypto/tcrypt.h +++ b/crypto/tcrypt.h @@ -9,7 +9,7 @@ * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) + * Software Foundation; either version 2 of the License, or (at your option) * any later version. 
* * 14 - 09 - 2003 Changes by Kartikey Mahendra Bhatt @@ -29,19 +29,19 @@ struct hash_testvec { unsigned char psize; char digest[MAX_DIGEST_SIZE]; unsigned char np; - unsigned char tap[MAX_TAP]; + unsigned char tap[MAX_TAP]; char key[128]; /* only used with keyed hash algorithms */ unsigned char ksize; }; -struct hmac_testvec { +struct hmac_testvec { char key[128]; unsigned char ksize; char plaintext[128]; unsigned char psize; char digest[MAX_DIGEST_SIZE]; unsigned char np; - unsigned char tap[MAX_TAP]; + unsigned char tap[MAX_TAP]; }; struct cipher_testvec { @@ -55,7 +55,7 @@ struct cipher_testvec { char result[48]; unsigned char rlen; int np; - unsigned char tap[MAX_TAP]; + unsigned char tap[MAX_TAP]; }; /* @@ -155,7 +155,7 @@ static struct hash_testvec md5_tv_template[] = { #define SHA1_TEST_VECTORS 2 static struct hash_testvec sha1_tv_template[] = { - { + { .plaintext = "abc", .psize = 3, .digest = { 0xa9, 0x99, 0x3e, 0x36, 0x47, 0x06, 0x81, 0x6a, 0xba, 0x3e, @@ -175,8 +175,8 @@ static struct hash_testvec sha1_tv_template[] = { */ #define SHA256_TEST_VECTORS 2 -static struct hash_testvec sha256_tv_template[] = { - { +static struct hash_testvec sha256_tv_template[] = { + { .plaintext = "abc", .psize = 3, .digest = { 0xba, 0x78, 0x16, 0xbf, 0x8f, 0x01, 0xcf, 0xea, @@ -201,7 +201,7 @@ static struct hash_testvec sha256_tv_template[] = { #define SHA384_TEST_VECTORS 4 static struct hash_testvec sha384_tv_template[] = { - { + { .plaintext= "abc", .psize = 3, .digest = { 0xcb, 0x00, 0x75, 0x3f, 0x45, 0xa3, 0x5e, 0x8b, @@ -221,7 +221,7 @@ static struct hash_testvec sha384_tv_template[] = { 0x5f, 0xe9, 0x5b, 0x1f, 0xe3, 0xc8, 0x45, 0x2b}, }, { .plaintext = "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmn" - "hijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", + "hijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", .psize = 112, .digest = { 0x09, 0x33, 0x0c, 0x33, 0xf7, 0x11, 0x47, 0xe8, 0x3d, 0x19, 0x2f, 0xc7, 0x82, 0xcd, 0x1b, 0x47, @@ -250,7 +250,7 @@ static struct hash_testvec sha384_tv_template[] = { #define SHA512_TEST_VECTORS 4 static struct hash_testvec sha512_tv_template[] = { - { + { .plaintext = "abc", .psize = 3, .digest = { 0xdd, 0xaf, 0x35, 0xa1, 0x93, 0x61, 0x7a, 0xba, @@ -303,14 +303,14 @@ static struct hash_testvec sha512_tv_template[] = { /* - * WHIRLPOOL test vectors from Whirlpool package + * WHIRLPOOL test vectors from Whirlpool package * by Vincent Rijmen and Paulo S. L. M. 
Barreto as part of the NESSIE * submission */ #define WP512_TEST_VECTORS 8 static struct hash_testvec wp512_tv_template[] = { - { + { .plaintext = "", .psize = 0, .digest = { 0x19, 0xFA, 0x61, 0xD7, 0x55, 0x22, 0xA4, 0x66, @@ -348,13 +348,13 @@ static struct hash_testvec wp512_tv_template[] = { }, { .plaintext = "message digest", .psize = 14, - .digest = { 0x37, 0x8C, 0x84, 0xA4, 0x12, 0x6E, 0x2D, 0xC6, - 0xE5, 0x6D, 0xCC, 0x74, 0x58, 0x37, 0x7A, 0xAC, - 0x83, 0x8D, 0x00, 0x03, 0x22, 0x30, 0xF5, 0x3C, - 0xE1, 0xF5, 0x70, 0x0C, 0x0F, 0xFB, 0x4D, 0x3B, - 0x84, 0x21, 0x55, 0x76, 0x59, 0xEF, 0x55, 0xC1, - 0x06, 0xB4, 0xB5, 0x2A, 0xC5, 0xA4, 0xAA, 0xA6, - 0x92, 0xED, 0x92, 0x00, 0x52, 0x83, 0x8F, 0x33, + .digest = { 0x37, 0x8C, 0x84, 0xA4, 0x12, 0x6E, 0x2D, 0xC6, + 0xE5, 0x6D, 0xCC, 0x74, 0x58, 0x37, 0x7A, 0xAC, + 0x83, 0x8D, 0x00, 0x03, 0x22, 0x30, 0xF5, 0x3C, + 0xE1, 0xF5, 0x70, 0x0C, 0x0F, 0xFB, 0x4D, 0x3B, + 0x84, 0x21, 0x55, 0x76, 0x59, 0xEF, 0x55, 0xC1, + 0x06, 0xB4, 0xB5, 0x2A, 0xC5, 0xA4, 0xAA, 0xA6, + 0x92, 0xED, 0x92, 0x00, 0x52, 0x83, 0x8F, 0x33, 0x62, 0xE8, 0x6D, 0xBD, 0x37, 0xA8, 0x90, 0x3E }, }, { .plaintext = "abcdefghijklmnopqrstuvwxyz", @@ -394,7 +394,7 @@ static struct hash_testvec wp512_tv_template[] = { }, { .plaintext = "abcdbcdecdefdefgefghfghighijhijk", .psize = 32, - .digest = { 0x2A, 0x98, 0x7E, 0xA4, 0x0F, 0x91, 0x70, 0x61, + .digest = { 0x2A, 0x98, 0x7E, 0xA4, 0x0F, 0x91, 0x70, 0x61, 0xF5, 0xD6, 0xF0, 0xA0, 0xE4, 0x64, 0x4F, 0x48, 0x8A, 0x7A, 0x5A, 0x52, 0xDE, 0xEE, 0x65, 0x62, 0x07, 0xC5, 0x62, 0xF9, 0x88, 0xE9, 0x5C, 0x69, @@ -408,7 +408,7 @@ static struct hash_testvec wp512_tv_template[] = { #define WP384_TEST_VECTORS 8 static struct hash_testvec wp384_tv_template[] = { - { + { .plaintext = "", .psize = 0, .digest = { 0x19, 0xFA, 0x61, 0xD7, 0x55, 0x22, 0xA4, 0x66, @@ -440,11 +440,11 @@ static struct hash_testvec wp384_tv_template[] = { }, { .plaintext = "message digest", .psize = 14, - .digest = { 0x37, 0x8C, 0x84, 0xA4, 0x12, 0x6E, 0x2D, 0xC6, - 0xE5, 0x6D, 0xCC, 0x74, 0x58, 0x37, 0x7A, 0xAC, - 0x83, 0x8D, 0x00, 0x03, 0x22, 0x30, 0xF5, 0x3C, - 0xE1, 0xF5, 0x70, 0x0C, 0x0F, 0xFB, 0x4D, 0x3B, - 0x84, 0x21, 0x55, 0x76, 0x59, 0xEF, 0x55, 0xC1, + .digest = { 0x37, 0x8C, 0x84, 0xA4, 0x12, 0x6E, 0x2D, 0xC6, + 0xE5, 0x6D, 0xCC, 0x74, 0x58, 0x37, 0x7A, 0xAC, + 0x83, 0x8D, 0x00, 0x03, 0x22, 0x30, 0xF5, 0x3C, + 0xE1, 0xF5, 0x70, 0x0C, 0x0F, 0xFB, 0x4D, 0x3B, + 0x84, 0x21, 0x55, 0x76, 0x59, 0xEF, 0x55, 0xC1, 0x06, 0xB4, 0xB5, 0x2A, 0xC5, 0xA4, 0xAA, 0xA6 }, }, { .plaintext = "abcdefghijklmnopqrstuvwxyz", @@ -478,7 +478,7 @@ static struct hash_testvec wp384_tv_template[] = { }, { .plaintext = "abcdbcdecdefdefgefghfghighijhijk", .psize = 32, - .digest = { 0x2A, 0x98, 0x7E, 0xA4, 0x0F, 0x91, 0x70, 0x61, + .digest = { 0x2A, 0x98, 0x7E, 0xA4, 0x0F, 0x91, 0x70, 0x61, 0xF5, 0xD6, 0xF0, 0xA0, 0xE4, 0x64, 0x4F, 0x48, 0x8A, 0x7A, 0x5A, 0x52, 0xDE, 0xEE, 0x65, 0x62, 0x07, 0xC5, 0x62, 0xF9, 0x88, 0xE9, 0x5C, 0x69, @@ -490,7 +490,7 @@ static struct hash_testvec wp384_tv_template[] = { #define WP256_TEST_VECTORS 8 static struct hash_testvec wp256_tv_template[] = { - { + { .plaintext = "", .psize = 0, .digest = { 0x19, 0xFA, 0x61, 0xD7, 0x55, 0x22, 0xA4, 0x66, @@ -516,9 +516,9 @@ static struct hash_testvec wp256_tv_template[] = { }, { .plaintext = "message digest", .psize = 14, - .digest = { 0x37, 0x8C, 0x84, 0xA4, 0x12, 0x6E, 0x2D, 0xC6, - 0xE5, 0x6D, 0xCC, 0x74, 0x58, 0x37, 0x7A, 0xAC, - 0x83, 0x8D, 0x00, 0x03, 0x22, 0x30, 0xF5, 0x3C, + .digest = { 0x37, 0x8C, 0x84, 0xA4, 0x12, 0x6E, 
0x2D, 0xC6, + 0xE5, 0x6D, 0xCC, 0x74, 0x58, 0x37, 0x7A, 0xAC, + 0x83, 0x8D, 0x00, 0x03, 0x22, 0x30, 0xF5, 0x3C, 0xE1, 0xF5, 0x70, 0x0C, 0x0F, 0xFB, 0x4D, 0x3B }, }, { .plaintext = "abcdefghijklmnopqrstuvwxyz", @@ -546,7 +546,7 @@ static struct hash_testvec wp256_tv_template[] = { }, { .plaintext = "abcdbcdecdefdefgefghfghighijhijk", .psize = 32, - .digest = { 0x2A, 0x98, 0x7E, 0xA4, 0x0F, 0x91, 0x70, 0x61, + .digest = { 0x2A, 0x98, 0x7E, 0xA4, 0x0F, 0x91, 0x70, 0x61, 0xF5, 0xD6, 0xF0, 0xA0, 0xE4, 0x64, 0x4F, 0x48, 0x8A, 0x7A, 0x5A, 0x52, 0xDE, 0xEE, 0x65, 0x62, 0x07, 0xC5, 0x62, 0xF9, 0x88, 0xE9, 0x5C, 0x69 }, @@ -554,7 +554,7 @@ static struct hash_testvec wp256_tv_template[] = { }; /* - * TIGER test vectors from Tiger website + * TIGER test vectors from Tiger website */ #define TGR192_TEST_VECTORS 6 @@ -693,7 +693,7 @@ static struct hash_testvec tgr128_tv_template[] = { #define HMAC_MD5_TEST_VECTORS 7 static struct hmac_testvec hmac_md5_tv_template[] = -{ +{ { .key = { [0 ... 15] = 0x0b }, .ksize = 16, @@ -756,7 +756,7 @@ static struct hmac_testvec hmac_md5_tv_template[] = */ #define HMAC_SHA1_TEST_VECTORS 7 -static struct hmac_testvec hmac_sha1_tv_template[] = { +static struct hmac_testvec hmac_sha1_tv_template[] = { { .key = { [0 ... 19] = 0x0b }, .ksize = 20, @@ -766,11 +766,11 @@ static struct hmac_testvec hmac_sha1_tv_template[] = { 0xe2, 0x8b, 0xc0, 0xb6, 0xfb, 0x37, 0x8c, 0x8e, 0xf1, 0x46, 0xbe }, }, { - .key = { 'J', 'e', 'f', 'e' }, + .key = { 'J', 'e', 'f', 'e' }, .ksize = 4, .plaintext = "what do ya want for nothing?", .psize = 28, - .digest = { 0xef, 0xfc, 0xdf, 0x6a, 0xe5, 0xeb, 0x2f, 0xa2, 0xd2, 0x74, + .digest = { 0xef, 0xfc, 0xdf, 0x6a, 0xe5, 0xeb, 0x2f, 0xa2, 0xd2, 0x74, 0x16, 0xd5, 0xf1, 0x84, 0xdf, 0x9c, 0x25, 0x9a, 0x7c, 0x79 }, .np = 2, .tap = { 14, 14 } @@ -779,30 +779,30 @@ static struct hmac_testvec hmac_sha1_tv_template[] = { .ksize = 20, .plaintext = { [0 ... 49] = 0xdd }, .psize = 50, - .digest = { 0x12, 0x5d, 0x73, 0x42, 0xb9, 0xac, 0x11, 0xcd, 0x91, 0xa3, + .digest = { 0x12, 0x5d, 0x73, 0x42, 0xb9, 0xac, 0x11, 0xcd, 0x91, 0xa3, 0x9a, 0xf4, 0x8a, 0xa1, 0x7b, 0x4f, 0x63, 0xf1, 0x75, 0xd3 }, }, { .key = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, - 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, + 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19 }, .ksize = 25, .plaintext = { [0 ... 49] = 0xcd }, .psize = 50, - .digest = { 0x4c, 0x90, 0x07, 0xf4, 0x02, 0x62, 0x50, 0xc6, 0xbc, 0x84, + .digest = { 0x4c, 0x90, 0x07, 0xf4, 0x02, 0x62, 0x50, 0xc6, 0xbc, 0x84, 0x14, 0xf9, 0xbf, 0x50, 0xc8, 0x6c, 0x2d, 0x72, 0x35, 0xda }, }, { .key = { [0 ... 19] = 0x0c }, .ksize = 20, .plaintext = "Test With Truncation", .psize = 20, - .digest = { 0x4c, 0x1a, 0x03, 0x42, 0x4b, 0x55, 0xe0, 0x7f, 0xe7, 0xf2, + .digest = { 0x4c, 0x1a, 0x03, 0x42, 0x4b, 0x55, 0xe0, 0x7f, 0xe7, 0xf2, 0x7b, 0xe1, 0xd5, 0x8b, 0xb9, 0x32, 0x4a, 0x9a, 0x5a, 0x04 }, }, { .key = { [0 ... 79] = 0xaa }, .ksize = 80, .plaintext = "Test Using Larger Than Block-Size Key - Hash Key First", .psize = 54, - .digest = { 0xaa, 0x4a, 0xe5, 0xe1, 0x52, 0x72, 0xd0, 0x0e, 0x95, 0x70, + .digest = { 0xaa, 0x4a, 0xe5, 0xe1, 0x52, 0x72, 0xd0, 0x0e, 0x95, 0x70, 0x56, 0x37, 0xce, 0x8a, 0x3b, 0x55, 0xed, 0x40, 0x21, 0x12 }, }, { .key = { [0 ... 
79] = 0xaa }, @@ -810,7 +810,7 @@ static struct hmac_testvec hmac_sha1_tv_template[] = { .plaintext = "Test Using Larger Than Block-Size Key and Larger Than One " "Block-Size Data", .psize = 73, - .digest = { 0xe8, 0xe9, 0x9d, 0x0f, 0x45, 0x23, 0x7d, 0x78, 0x6d, 0x6b, + .digest = { 0xe8, 0xe9, 0x9d, 0x0f, 0x45, 0x23, 0x7d, 0x78, 0x6d, 0x6b, 0xba, 0xa7, 0x96, 0x5c, 0x78, 0x08, 0xbb, 0xff, 0x1a, 0x91 }, }, }; @@ -1011,7 +1011,7 @@ static struct cipher_testvec des_enc_tv_template[] = { 0xf7, 0x9c, 0x89, 0x2a, 0x33, 0x8f, 0x4a, 0x8b }, .rlen = 32, .np = 3, - .tap = { 14, 10, 8 } + .tap = { 14, 10, 8 } }, { .key = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef }, .klen = 8, @@ -1024,7 +1024,7 @@ static struct cipher_testvec des_enc_tv_template[] = { 0xb4, 0x99, 0x26, 0xf7, 0x1f, 0xe1, 0xd4, 0x90 }, .rlen = 24, .np = 4, - .tap = { 2, 1, 3, 18 } + .tap = { 2, 1, 3, 18 } }, { .key = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef }, .klen = 8, @@ -1035,7 +1035,7 @@ static struct cipher_testvec des_enc_tv_template[] = { 0xf7, 0x9c, 0x89, 0x2a, 0x33, 0x8f, 0x4a, 0x8b }, .rlen = 16, .np = 5, - .tap = { 2, 2, 2, 2, 8 } + .tap = { 2, 2, 2, 2, 8 } }, { .key = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef }, .klen = 8, @@ -1044,7 +1044,7 @@ static struct cipher_testvec des_enc_tv_template[] = { .result = { 0xc9, 0x57, 0x44, 0x25, 0x6a, 0x5e, 0xd3, 0x1d }, .rlen = 8, .np = 8, - .tap = { 1, 1, 1, 1, 1, 1, 1, 1 } + .tap = { 1, 1, 1, 1, 1, 1, 1, 1 } }, }; @@ -1057,7 +1057,7 @@ static struct cipher_testvec des_dec_tv_template[] = { .result = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xe7 }, .rlen = 8, }, { /* Sbox test from NBS */ - .key = { 0x7c, 0xa1, 0x10, 0x45, 0x4a, 0x1a, 0x6e, 0x57 }, + .key = { 0x7c, 0xa1, 0x10, 0x45, 0x4a, 0x1a, 0x6e, 0x57 }, .klen = 8, .input = { 0x69, 0x0f, 0x5b, 0x0d, 0x9a, 0x26, 0x93, 0x9b }, .ilen = 8, @@ -1092,19 +1092,19 @@ static struct cipher_testvec des_cbc_enc_tv_template[] = { { /* From OpenSSL */ .key = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef}, .klen = 8, - .iv = { 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10}, - .input = { 0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31, 0x20, - 0x4e, 0x6f, 0x77, 0x20, 0x69, 0x73, 0x20, 0x74, - 0x68, 0x65, 0x20, 0x74, 0x69, 0x6d, 0x65, 0x20 }, + .iv = { 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10}, + .input = { 0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31, 0x20, + 0x4e, 0x6f, 0x77, 0x20, 0x69, 0x73, 0x20, 0x74, + 0x68, 0x65, 0x20, 0x74, 0x69, 0x6d, 0x65, 0x20 }, .ilen = 24, - .result = { 0xcc, 0xd1, 0x73, 0xff, 0xab, 0x20, 0x39, 0xf4, - 0xac, 0xd8, 0xae, 0xfd, 0xdf, 0xd8, 0xa1, 0xeb, - 0x46, 0x8e, 0x91, 0x15, 0x78, 0x88, 0xba, 0x68 }, + .result = { 0xcc, 0xd1, 0x73, 0xff, 0xab, 0x20, 0x39, 0xf4, + 0xac, 0xd8, 0xae, 0xfd, 0xdf, 0xd8, 0xa1, 0xeb, + 0x46, 0x8e, 0x91, 0x15, 0x78, 0x88, 0xba, 0x68 }, .rlen = 24, }, { /* FIPS Pub 81 */ .key = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef }, .klen = 8, - .iv = { 0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef }, + .iv = { 0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef }, .input = { 0x4e, 0x6f, 0x77, 0x20, 0x69, 0x73, 0x20, 0x74 }, .ilen = 8, .result = { 0xe5, 0xc7, 0xcd, 0xde, 0x87, 0x2b, 0xf2, 0x7c }, @@ -1117,7 +1117,7 @@ static struct cipher_testvec des_cbc_enc_tv_template[] = { .ilen = 8, .result = { 0x43, 0xe9, 0x34, 0x00, 0x8c, 0x38, 0x9c, 0x0f }, .rlen = 8, - }, { + }, { .key = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef }, .klen = 8, .iv = { 0x43, 0xe9, 0x34, 0x00, 0x8c, 0x38, 0x9c, 0x0f }, @@ -1125,18 +1125,18 @@ static struct cipher_testvec des_cbc_enc_tv_template[] 
= { .ilen = 8, .result = { 0x68, 0x37, 0x88, 0x49, 0x9a, 0x7c, 0x05, 0xf6 }, .rlen = 8, - }, { /* Copy of openssl vector for chunk testing */ + }, { /* Copy of openssl vector for chunk testing */ /* From OpenSSL */ .key = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef}, .klen = 8, - .iv = { 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10}, - .input = { 0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31, 0x20, - 0x4e, 0x6f, 0x77, 0x20, 0x69, 0x73, 0x20, 0x74, - 0x68, 0x65, 0x20, 0x74, 0x69, 0x6d, 0x65, 0x20 }, + .iv = { 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10}, + .input = { 0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31, 0x20, + 0x4e, 0x6f, 0x77, 0x20, 0x69, 0x73, 0x20, 0x74, + 0x68, 0x65, 0x20, 0x74, 0x69, 0x6d, 0x65, 0x20 }, .ilen = 24, - .result = { 0xcc, 0xd1, 0x73, 0xff, 0xab, 0x20, 0x39, 0xf4, - 0xac, 0xd8, 0xae, 0xfd, 0xdf, 0xd8, 0xa1, 0xeb, - 0x46, 0x8e, 0x91, 0x15, 0x78, 0x88, 0xba, 0x68 }, + .result = { 0xcc, 0xd1, 0x73, 0xff, 0xab, 0x20, 0x39, 0xf4, + 0xac, 0xd8, 0xae, 0xfd, 0xdf, 0xd8, 0xa1, 0xeb, + 0x46, 0x8e, 0x91, 0x15, 0x78, 0x88, 0xba, 0x68 }, .rlen = 24, .np = 2, .tap = { 13, 11 } @@ -1155,24 +1155,24 @@ static struct cipher_testvec des_cbc_dec_tv_template[] = { }, { .key = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef }, .klen = 8, - .iv = { 0xe5, 0xc7, 0xcd, 0xde, 0x87, 0x2b, 0xf2, 0x7c }, + .iv = { 0xe5, 0xc7, 0xcd, 0xde, 0x87, 0x2b, 0xf2, 0x7c }, .input = { 0x43, 0xe9, 0x34, 0x00, 0x8c, 0x38, 0x9c, 0x0f }, .ilen = 8, - .result = { 0x68, 0x65, 0x20, 0x74, 0x69, 0x6d, 0x65, 0x20 }, + .result = { 0x68, 0x65, 0x20, 0x74, 0x69, 0x6d, 0x65, 0x20 }, .rlen = 8, }, { .key = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef }, .klen = 8, .iv = { 0x43, 0xe9, 0x34, 0x00, 0x8c, 0x38, 0x9c, 0x0f }, - .input = { 0x68, 0x37, 0x88, 0x49, 0x9a, 0x7c, 0x05, 0xf6 }, + .input = { 0x68, 0x37, 0x88, 0x49, 0x9a, 0x7c, 0x05, 0xf6 }, .ilen = 8, .result = { 0x66, 0x6f, 0x72, 0x20, 0x61, 0x6c, 0x6c, 0x20 }, .rlen = 8, - }, { /* Copy of above, for chunk testing */ + }, { /* Copy of above, for chunk testing */ .key = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef }, .klen = 8, .iv = { 0x43, 0xe9, 0x34, 0x00, 0x8c, 0x38, 0x9c, 0x0f }, - .input = { 0x68, 0x37, 0x88, 0x49, 0x9a, 0x7c, 0x05, 0xf6 }, + .input = { 0x68, 0x37, 0x88, 0x49, 0x9a, 0x7c, 0x05, 0xf6 }, .ilen = 8, .result = { 0x66, 0x6f, 0x72, 0x20, 0x61, 0x6c, 0x6c, 0x20 }, .rlen = 8, @@ -1276,7 +1276,7 @@ static struct cipher_testvec bf_enc_tv_template[] = { .ilen = 8, .result = { 0xe8, 0x7a, 0x24, 0x4e, 0x2c, 0xc8, 0x5e, 0x82 }, .rlen = 8, - }, { /* Vary the keylength... */ + }, { /* Vary the keylength... 
*/ .key = { 0xf0, 0xe1, 0xd2, 0xc3, 0xb4, 0xa5, 0x96, 0x87, 0x78, 0x69, 0x5a, 0x4b, 0x3c, 0x2d, 0x1e, 0x0f }, .klen = 16, @@ -1297,9 +1297,9 @@ static struct cipher_testvec bf_enc_tv_template[] = { .key = { 0xf0, 0xe1, 0xd2, 0xc3, 0xb4, 0xa5, 0x96, 0x87, 0x78, 0x69, 0x5a, 0x4b, 0x3c, 0x2d, 0x1e, 0x0f, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, - 0x04, 0x68, 0x91, 0x04, 0xc2, 0xfd, 0x3b, 0x2f, - 0x58, 0x40, 0x23, 0x64, 0x1a, 0xba, 0x61, 0x76, - 0x1f, 0x1f, 0x1f, 0x1f, 0x0e, 0x0e, 0x0e, 0x0e, + 0x04, 0x68, 0x91, 0x04, 0xc2, 0xfd, 0x3b, 0x2f, + 0x58, 0x40, 0x23, 0x64, 0x1a, 0xba, 0x61, 0x76, + 0x1f, 0x1f, 0x1f, 0x1f, 0x0e, 0x0e, 0x0e, 0x0e, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, .klen = 56, .input = { 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10 }, @@ -1331,7 +1331,7 @@ static struct cipher_testvec bf_dec_tv_template[] = { .ilen = 8, .result = { 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10 }, .rlen = 8, - }, { /* Vary the keylength... */ + }, { /* Vary the keylength... */ .key = { 0xf0, 0xe1, 0xd2, 0xc3, 0xb4, 0xa5, 0x96, 0x87, 0x78, 0x69, 0x5a, 0x4b, 0x3c, 0x2d, 0x1e, 0x0f }, .klen = 16, @@ -1352,9 +1352,9 @@ static struct cipher_testvec bf_dec_tv_template[] = { .key = { 0xf0, 0xe1, 0xd2, 0xc3, 0xb4, 0xa5, 0x96, 0x87, 0x78, 0x69, 0x5a, 0x4b, 0x3c, 0x2d, 0x1e, 0x0f, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, - 0x04, 0x68, 0x91, 0x04, 0xc2, 0xfd, 0x3b, 0x2f, - 0x58, 0x40, 0x23, 0x64, 0x1a, 0xba, 0x61, 0x76, - 0x1f, 0x1f, 0x1f, 0x1f, 0x0e, 0x0e, 0x0e, 0x0e, + 0x04, 0x68, 0x91, 0x04, 0xc2, 0xfd, 0x3b, 0x2f, + 0x58, 0x40, 0x23, 0x64, 0x1a, 0xba, 0x61, 0x76, + 0x1f, 0x1f, 0x1f, 0x1f, 0x0e, 0x0e, 0x0e, 0x0e, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, .klen = 56, .input = { 0xc0, 0x45, 0x04, 0x01, 0x2e, 0x4e, 0x1f, 0x53 }, @@ -1369,7 +1369,7 @@ static struct cipher_testvec bf_cbc_enc_tv_template[] = { .key = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xf0, 0xe1, 0xd2, 0xc3, 0xb4, 0xa5, 0x96, 0x87 }, .klen = 16, - .iv = { 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10 }, + .iv = { 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10 }, .input = { 0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31, 0x20, 0x4e, 0x6f, 0x77, 0x20, 0x69, 0x73, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x69, 0x6d, 0x65, 0x20, @@ -1388,7 +1388,7 @@ static struct cipher_testvec bf_cbc_dec_tv_template[] = { .key = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xf0, 0xe1, 0xd2, 0xc3, 0xb4, 0xa5, 0x96, 0x87 }, .klen = 16, - .iv = { 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10 }, + .iv = { 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10 }, .input = { 0x6b, 0x77, 0xb4, 0xd6, 0x30, 0x06, 0xde, 0xe6, 0x05, 0xb1, 0x56, 0xe2, 0x74, 0x03, 0x97, 0x93, 0x58, 0xde, 0xb9, 0xe7, 0x15, 0x46, 0x16, 0xd9, @@ -1490,7 +1490,7 @@ static struct cipher_testvec tf_cbc_enc_tv_template[] = { .key = { [0 ... 15] = 0x00 }, .klen = 16, .iv = { 0x9f, 0x58, 0x9f, 0x5c, 0xf6, 0x12, 0x2c, 0x32, - 0xb6, 0xbf, 0xec, 0x2f, 0x2a, 0xe8, 0xc3, 0x5a }, + 0xb6, 0xbf, 0xec, 0x2f, 0x2a, 0xe8, 0xc3, 0x5a }, .input = { [0 ... 15] = 0x00 }, .ilen = 16, .result = { 0xd4, 0x91, 0xdb, 0x16, 0xe7, 0xb1, 0xc3, 0x9e, @@ -1528,7 +1528,7 @@ static struct cipher_testvec tf_cbc_dec_tv_template[] = { .klen = 16, .iv = { [0 ... 15] = 0x00 }, .input = { 0x9f, 0x58, 0x9f, 0x5c, 0xf6, 0x12, 0x2c, 0x32, - 0xb6, 0xbf, 0xec, 0x2f, 0x2a, 0xe8, 0xc3, 0x5a }, + 0xb6, 0xbf, 0xec, 0x2f, 0x2a, 0xe8, 0xc3, 0x5a }, .ilen = 16, .result = { [0 ... 
15] = 0x00 }, .rlen = 16, @@ -1578,8 +1578,7 @@ static struct cipher_testvec tf_cbc_dec_tv_template[] = { #define TNEPRES_ENC_TEST_VECTORS 4 #define TNEPRES_DEC_TEST_VECTORS 4 -static struct cipher_testvec serpent_enc_tv_template[] = -{ +static struct cipher_testvec serpent_enc_tv_template[] = { { .input = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, @@ -1620,8 +1619,7 @@ static struct cipher_testvec serpent_enc_tv_template[] = }, }; -static struct cipher_testvec tnepres_enc_tv_template[] = -{ +static struct cipher_testvec tnepres_enc_tv_template[] = { { /* KeySize=128, PT=0, I=1 */ .input = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, @@ -1629,7 +1627,7 @@ static struct cipher_testvec tnepres_enc_tv_template[] = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, .klen = 16, .ilen = 16, - .result = { 0x49, 0xaf, 0xbf, 0xad, 0x9d, 0x5a, 0x34, 0x05, + .result = { 0x49, 0xaf, 0xbf, 0xad, 0x9d, 0x5a, 0x34, 0x05, 0x2c, 0xd8, 0xff, 0xa5, 0x98, 0x6b, 0xd2, 0xdd }, .rlen = 16, }, { /* KeySize=192, PT=0, I=1 */ @@ -1640,7 +1638,7 @@ static struct cipher_testvec tnepres_enc_tv_template[] = .input = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, .ilen = 16, - .result = { 0xe7, 0x8e, 0x54, 0x02, 0xc7, 0x19, 0x55, 0x68, + .result = { 0xe7, 0x8e, 0x54, 0x02, 0xc7, 0x19, 0x55, 0x68, 0xac, 0x36, 0x78, 0xf7, 0xa3, 0xf6, 0x0c, 0x66 }, .rlen = 16, }, { /* KeySize=256, PT=0, I=1 */ @@ -1652,7 +1650,7 @@ static struct cipher_testvec tnepres_enc_tv_template[] = .input = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, .ilen = 16, - .result = { 0xab, 0xed, 0x96, 0xe7, 0x66, 0xbf, 0x28, 0xcb, + .result = { 0xab, 0xed, 0x96, 0xe7, 0x66, 0xbf, 0x28, 0xcb, 0xc0, 0xeb, 0xd2, 0x1a, 0x82, 0xef, 0x08, 0x19 }, .rlen = 16, }, { /* KeySize=256, I=257 */ @@ -1664,15 +1662,14 @@ static struct cipher_testvec tnepres_enc_tv_template[] = .input = { 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 }, .ilen = 16, - .result = { 0x5c, 0xe7, 0x1c, 0x70, 0xd2, 0x88, 0x2e, 0x5b, + .result = { 0x5c, 0xe7, 0x1c, 0x70, 0xd2, 0x88, 0x2e, 0x5b, 0xb8, 0x32, 0xe4, 0x33, 0xf8, 0x9f, 0x26, 0xde }, .rlen = 16, }, }; -static struct cipher_testvec serpent_dec_tv_template[] = -{ +static struct cipher_testvec serpent_dec_tv_template[] = { { .input = { 0x12, 0x07, 0xfc, 0xce, 0x9b, 0xd0, 0xd6, 0x47, 0x6a, 0xe9, 0x8f, 0xbe, 0xd1, 0x43, 0xa0, 0xe2 }, @@ -1713,8 +1710,7 @@ static struct cipher_testvec serpent_dec_tv_template[] = }, }; -static struct cipher_testvec tnepres_dec_tv_template[] = -{ +static struct cipher_testvec tnepres_dec_tv_template[] = { { .input = { 0x41, 0xcc, 0x6b, 0x31, 0x59, 0x31, 0x45, 0x97, 0x6d, 0x6f, 0xbb, 0x38, 0x4b, 0x37, 0x21, 0x28 }, @@ -1726,7 +1722,7 @@ static struct cipher_testvec tnepres_dec_tv_template[] = .key = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, .klen = 16, - .input = { 0xea, 0xf4, 0xd7, 0xfc, 0xd8, 0x01, 0x34, 0x47, + .input = { 0xea, 0xf4, 0xd7, 0xfc, 0xd8, 0x01, 0x34, 0x47, 0x81, 0x45, 0x0b, 0xfa, 0x0c, 0xd6, 0xad, 0x6e }, .ilen = 16, .result = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, @@ -1738,7 +1734,7 @@ static struct cipher_testvec tnepres_dec_tv_template[] = 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f }, .klen = 32, - .input = { 
0x64, 0xa9, 0x1a, 0x37, 0xed, 0x9f, 0xe7, 0x49, + .input = { 0x64, 0xa9, 0x1a, 0x37, 0xed, 0x9f, 0xe7, 0x49, 0xa8, 0x4e, 0x76, 0xd6, 0xf5, 0x0d, 0x78, 0xee }, .ilen = 16, .result = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, @@ -1747,7 +1743,7 @@ static struct cipher_testvec tnepres_dec_tv_template[] = }, { /* KeySize=128, I=121 */ .key = { [15] = 0x80 }, .klen = 16, - .input = { 0x3d, 0xda, 0xbf, 0xc0, 0x06, 0xda, 0xab, 0x06, + .input = { 0x3d, 0xda, 0xbf, 0xc0, 0x06, 0xda, 0xab, 0x06, 0x46, 0x2a, 0xf4, 0xef, 0x81, 0x54, 0x4e, 0x26 }, .ilen = 16, .result = { [0 ... 15] = 0x00 }, @@ -1760,58 +1756,56 @@ static struct cipher_testvec tnepres_dec_tv_template[] = #define CAST6_ENC_TEST_VECTORS 3 #define CAST6_DEC_TEST_VECTORS 3 -static struct cipher_testvec cast6_enc_tv_template[] = -{ +static struct cipher_testvec cast6_enc_tv_template[] = { { - .key = { 0x23, 0x42, 0xbb, 0x9e, 0xfa, 0x38, 0x54, 0x2c, + .key = { 0x23, 0x42, 0xbb, 0x9e, 0xfa, 0x38, 0x54, 0x2c, 0x0a, 0xf7, 0x56, 0x47, 0xf2, 0x9f, 0x61, 0x5d }, .klen = 16, .input = { [0 ... 15] = 0x00 }, .ilen = 16, - .result = { 0xc8, 0x42, 0xa0, 0x89, 0x72, 0xb4, 0x3d, 0x20, + .result = { 0xc8, 0x42, 0xa0, 0x89, 0x72, 0xb4, 0x3d, 0x20, 0x83, 0x6c, 0x91, 0xd1, 0xb7, 0x53, 0x0f, 0x6b }, .rlen = 16, }, { - .key = { 0x23, 0x42, 0xbb, 0x9e, 0xfa, 0x38, 0x54, 0x2c, - 0xbe, 0xd0, 0xac, 0x83, 0x94, 0x0a, 0xc2, 0x98, + .key = { 0x23, 0x42, 0xbb, 0x9e, 0xfa, 0x38, 0x54, 0x2c, + 0xbe, 0xd0, 0xac, 0x83, 0x94, 0x0a, 0xc2, 0x98, 0xba, 0xc7, 0x7a, 0x77, 0x17, 0x94, 0x28, 0x63 }, .klen = 24, .input = { [0 ... 15] = 0x00 }, .ilen = 16, - .result = { 0x1b, 0x38, 0x6c, 0x02, 0x10, 0xdc, 0xad, 0xcb, + .result = { 0x1b, 0x38, 0x6c, 0x02, 0x10, 0xdc, 0xad, 0xcb, 0xdd, 0x0e, 0x41, 0xaa, 0x08, 0xa7, 0xa7, 0xe8 }, .rlen = 16, }, { .key = { 0x23, 0x42, 0xbb, 0x9e, 0xfa, 0x38, 0x54, 0x2c, 0xbe, 0xd0, 0xac, 0x83, 0x94, 0x0a, 0xc2, 0x98, 0x8d, 0x7c, 0x47, 0xce, 0x26, 0x49, 0x08, 0x46, - 0x1c, 0xc1, 0xb5, 0x13, 0x7a, 0xe6, 0xb6, 0x04 }, + 0x1c, 0xc1, 0xb5, 0x13, 0x7a, 0xe6, 0xb6, 0x04 }, .klen = 32, .input = { [0 ... 15] = 0x00 }, .ilen = 16, - .result = { 0x4f, 0x6a, 0x20, 0x38, 0x28, 0x68, 0x97, 0xb9, + .result = { 0x4f, 0x6a, 0x20, 0x38, 0x28, 0x68, 0x97, 0xb9, 0xc9, 0x87, 0x01, 0x36, 0x55, 0x33, 0x17, 0xfa }, .rlen = 16, }, }; -static struct cipher_testvec cast6_dec_tv_template[] = -{ +static struct cipher_testvec cast6_dec_tv_template[] = { { - .key = { 0x23, 0x42, 0xbb, 0x9e, 0xfa, 0x38, 0x54, 0x2c, + .key = { 0x23, 0x42, 0xbb, 0x9e, 0xfa, 0x38, 0x54, 0x2c, 0x0a, 0xf7, 0x56, 0x47, 0xf2, 0x9f, 0x61, 0x5d }, .klen = 16, - .input = { 0xc8, 0x42, 0xa0, 0x89, 0x72, 0xb4, 0x3d, 0x20, + .input = { 0xc8, 0x42, 0xa0, 0x89, 0x72, 0xb4, 0x3d, 0x20, 0x83, 0x6c, 0x91, 0xd1, 0xb7, 0x53, 0x0f, 0x6b }, .ilen = 16, .result = { [0 ... 15] = 0x00 }, .rlen = 16, }, { - .key = { 0x23, 0x42, 0xbb, 0x9e, 0xfa, 0x38, 0x54, 0x2c, - 0xbe, 0xd0, 0xac, 0x83, 0x94, 0x0a, 0xc2, 0x98, + .key = { 0x23, 0x42, 0xbb, 0x9e, 0xfa, 0x38, 0x54, 0x2c, + 0xbe, 0xd0, 0xac, 0x83, 0x94, 0x0a, 0xc2, 0x98, 0xba, 0xc7, 0x7a, 0x77, 0x17, 0x94, 0x28, 0x63 }, .klen = 24, - .input = { 0x1b, 0x38, 0x6c, 0x02, 0x10, 0xdc, 0xad, 0xcb, + .input = { 0x1b, 0x38, 0x6c, 0x02, 0x10, 0xdc, 0xad, 0xcb, 0xdd, 0x0e, 0x41, 0xaa, 0x08, 0xa7, 0xa7, 0xe8 }, .ilen = 16, .result = { [0 ... 
15] = 0x00 }, @@ -1820,9 +1814,9 @@ static struct cipher_testvec cast6_dec_tv_template[] = .key = { 0x23, 0x42, 0xbb, 0x9e, 0xfa, 0x38, 0x54, 0x2c, 0xbe, 0xd0, 0xac, 0x83, 0x94, 0x0a, 0xc2, 0x98, 0x8d, 0x7c, 0x47, 0xce, 0x26, 0x49, 0x08, 0x46, - 0x1c, 0xc1, 0xb5, 0x13, 0x7a, 0xe6, 0xb6, 0x04 }, + 0x1c, 0xc1, 0xb5, 0x13, 0x7a, 0xe6, 0xb6, 0x04 }, .klen = 32, - .input = { 0x4f, 0x6a, 0x20, 0x38, 0x28, 0x68, 0x97, 0xb9, + .input = { 0x4f, 0x6a, 0x20, 0x38, 0x28, 0x68, 0x97, 0xb9, 0xc9, 0x87, 0x01, 0x36, 0x55, 0x33, 0x17, 0xfa }, .ilen = 16, .result = { [0 ... 15] = 0x00 }, @@ -1837,9 +1831,9 @@ static struct cipher_testvec cast6_dec_tv_template[] = #define AES_ENC_TEST_VECTORS 3 #define AES_DEC_TEST_VECTORS 3 -static struct cipher_testvec aes_enc_tv_template[] = { +static struct cipher_testvec aes_enc_tv_template[] = { { /* From FIPS-197 */ - .key = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + .key = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, .klen = 16, .input = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, @@ -1853,7 +1847,7 @@ static struct cipher_testvec aes_enc_tv_template[] = { 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }, .klen = 24, - .input = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, + .input = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff }, .ilen = 16, .result = { 0xdd, 0xa9, 0x7c, 0xa4, 0x86, 0x4c, 0xdf, 0xe0, @@ -1865,7 +1859,7 @@ static struct cipher_testvec aes_enc_tv_template[] = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f }, .klen = 32, - .input = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, + .input = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff }, .ilen = 16, .result = { 0x8e, 0xa2, 0xb7, 0xca, 0x51, 0x67, 0x45, 0xbf, @@ -1874,9 +1868,9 @@ static struct cipher_testvec aes_enc_tv_template[] = { }, }; -static struct cipher_testvec aes_dec_tv_template[] = { +static struct cipher_testvec aes_dec_tv_template[] = { { /* From FIPS-197 */ - .key = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + .key = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, .klen = 16, .input = { 0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30, @@ -1893,8 +1887,8 @@ static struct cipher_testvec aes_dec_tv_template[] = { .input = { 0xdd, 0xa9, 0x7c, 0xa4, 0x86, 0x4c, 0xdf, 0xe0, 0x6e, 0xaf, 0x70, 0xa0, 0xec, 0x0d, 0x71, 0x91 }, .ilen = 16, - .result = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, - 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff }, + .result = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, + 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff }, .rlen = 16, }, { .key = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, @@ -1905,7 +1899,7 @@ static struct cipher_testvec aes_dec_tv_template[] = { .input = { 0x8e, 0xa2, 0xb7, 0xca, 0x51, 0x67, 0x45, 0xbf, 0xea, 0xfc, 0x49, 0x90, 0x4b, 0x49, 0x60, 0x89 }, .ilen = 16, - .result = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, + .result = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff }, .rlen = 16, }, @@ -1915,8 +1909,7 @@ static struct cipher_testvec aes_dec_tv_template[] = { #define CAST5_ENC_TEST_VECTORS 3 #define CAST5_DEC_TEST_VECTORS 3 -static struct cipher_testvec cast5_enc_tv_template[] = -{ +static struct cipher_testvec cast5_enc_tv_template[] = { 
{ .key = { 0x01, 0x23, 0x45, 0x67, 0x12, 0x34, 0x56, 0x78, 0x23, 0x45, 0x67, 0x89, 0x34, 0x56, 0x78, 0x9a }, @@ -1943,8 +1936,7 @@ static struct cipher_testvec cast5_enc_tv_template[] = }, }; -static struct cipher_testvec cast5_dec_tv_template[] = -{ +static struct cipher_testvec cast5_dec_tv_template[] = { { .key = { 0x01, 0x23, 0x45, 0x67, 0x12, 0x34, 0x56, 0x78, 0x23, 0x45, 0x67, 0x89, 0x34, 0x56, 0x78, 0x9a }, @@ -1971,14 +1963,13 @@ static struct cipher_testvec cast5_dec_tv_template[] = }, }; -/* - * ARC4 test vectors from OpenSSL +/* + * ARC4 test vectors from OpenSSL */ #define ARC4_ENC_TEST_VECTORS 7 #define ARC4_DEC_TEST_VECTORS 7 -static struct cipher_testvec arc4_enc_tv_template[] = -{ +static struct cipher_testvec arc4_enc_tv_template[] = { { .key = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef }, .klen = 8, @@ -2044,8 +2035,7 @@ static struct cipher_testvec arc4_enc_tv_template[] = }, }; -static struct cipher_testvec arc4_dec_tv_template[] = -{ +static struct cipher_testvec arc4_dec_tv_template[] = { { .key = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef }, .klen = 8, @@ -2111,14 +2101,13 @@ static struct cipher_testvec arc4_dec_tv_template[] = }, }; -/* +/* * TEA test vectors */ #define TEA_ENC_TEST_VECTORS 4 #define TEA_DEC_TEST_VECTORS 4 -static struct cipher_testvec tea_enc_tv_template[] = -{ +static struct cipher_testvec tea_enc_tv_template[] = { { .key = { [0 ... 15] = 0x00 }, .klen = 16, @@ -2138,31 +2127,30 @@ static struct cipher_testvec tea_enc_tv_template[] = .key = { 0x09, 0x65, 0x43, 0x11, 0x66, 0x44, 0x39, 0x25, 0x51, 0x3a, 0x16, 0x10, 0x0a, 0x08, 0x12, 0x6e }, .klen = 16, - .input = { 0x6c, 0x6f, 0x6e, 0x67, 0x65, 0x72, 0x5f, 0x74, + .input = { 0x6c, 0x6f, 0x6e, 0x67, 0x65, 0x72, 0x5f, 0x74, 0x65, 0x73, 0x74, 0x5f, 0x76, 0x65, 0x63, 0x74 }, .ilen = 16, - .result = { 0xbe, 0x7a, 0xbb, 0x81, 0x95, 0x2d, 0x1f, 0x1e, + .result = { 0xbe, 0x7a, 0xbb, 0x81, 0x95, 0x2d, 0x1f, 0x1e, 0xdd, 0x89, 0xa1, 0x25, 0x04, 0x21, 0xdf, 0x95 }, .rlen = 16, }, { .key = { 0x4d, 0x76, 0x32, 0x17, 0x05, 0x3f, 0x75, 0x2c, 0x5d, 0x04, 0x16, 0x36, 0x15, 0x72, 0x63, 0x2f }, .klen = 16, - .input = { 0x54, 0x65, 0x61, 0x20, 0x69, 0x73, 0x20, 0x67, - 0x6f, 0x6f, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, - 0x79, 0x6f, 0x75, 0x21, 0x21, 0x21, 0x20, 0x72, + .input = { 0x54, 0x65, 0x61, 0x20, 0x69, 0x73, 0x20, 0x67, + 0x6f, 0x6f, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, + 0x79, 0x6f, 0x75, 0x21, 0x21, 0x21, 0x20, 0x72, 0x65, 0x61, 0x6c, 0x6c, 0x79, 0x21, 0x21, 0x21 }, .ilen = 32, - .result = { 0xe0, 0x4d, 0x5d, 0x3c, 0xb7, 0x8c, 0x36, 0x47, - 0x94, 0x18, 0x95, 0x91, 0xa9, 0xfc, 0x49, 0xf8, - 0x44, 0xd1, 0x2d, 0xc2, 0x99, 0xb8, 0x08, 0x2a, + .result = { 0xe0, 0x4d, 0x5d, 0x3c, 0xb7, 0x8c, 0x36, 0x47, + 0x94, 0x18, 0x95, 0x91, 0xa9, 0xfc, 0x49, 0xf8, + 0x44, 0xd1, 0x2d, 0xc2, 0x99, 0xb8, 0x08, 0x2a, 0x07, 0x89, 0x73, 0xc2, 0x45, 0x92, 0xc6, 0x90 }, .rlen = 32, } }; -static struct cipher_testvec tea_dec_tv_template[] = -{ +static struct cipher_testvec tea_dec_tv_template[] = { { .key = { [0 ... 
15] = 0x00 }, .klen = 16, @@ -2183,9 +2171,9 @@ static struct cipher_testvec tea_dec_tv_template[] = 0x51, 0x3a, 0x16, 0x10, 0x0a, 0x08, 0x12, 0x6e }, .klen = 16, .input = { 0xbe, 0x7a, 0xbb, 0x81, 0x95, 0x2d, 0x1f, 0x1e, - 0xdd, 0x89, 0xa1, 0x25, 0x04, 0x21, 0xdf, 0x95 }, - .ilen = 16, - .result = { 0x6c, 0x6f, 0x6e, 0x67, 0x65, 0x72, 0x5f, 0x74, + 0xdd, 0x89, 0xa1, 0x25, 0x04, 0x21, 0xdf, 0x95 }, + .ilen = 16, + .result = { 0x6c, 0x6f, 0x6e, 0x67, 0x65, 0x72, 0x5f, 0x74, 0x65, 0x73, 0x74, 0x5f, 0x76, 0x65, 0x63, 0x74 }, .rlen = 16, }, { @@ -2193,26 +2181,25 @@ static struct cipher_testvec tea_dec_tv_template[] = 0x5d, 0x04, 0x16, 0x36, 0x15, 0x72, 0x63, 0x2f }, .klen = 16, .input = { 0xe0, 0x4d, 0x5d, 0x3c, 0xb7, 0x8c, 0x36, 0x47, - 0x94, 0x18, 0x95, 0x91, 0xa9, 0xfc, 0x49, 0xf8, - 0x44, 0xd1, 0x2d, 0xc2, 0x99, 0xb8, 0x08, 0x2a, - 0x07, 0x89, 0x73, 0xc2, 0x45, 0x92, 0xc6, 0x90 }, + 0x94, 0x18, 0x95, 0x91, 0xa9, 0xfc, 0x49, 0xf8, + 0x44, 0xd1, 0x2d, 0xc2, 0x99, 0xb8, 0x08, 0x2a, + 0x07, 0x89, 0x73, 0xc2, 0x45, 0x92, 0xc6, 0x90 }, .ilen = 32, - .result = { 0x54, 0x65, 0x61, 0x20, 0x69, 0x73, 0x20, 0x67, - 0x6f, 0x6f, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, - 0x79, 0x6f, 0x75, 0x21, 0x21, 0x21, 0x20, 0x72, + .result = { 0x54, 0x65, 0x61, 0x20, 0x69, 0x73, 0x20, 0x67, + 0x6f, 0x6f, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, + 0x79, 0x6f, 0x75, 0x21, 0x21, 0x21, 0x20, 0x72, 0x65, 0x61, 0x6c, 0x6c, 0x79, 0x21, 0x21, 0x21 }, .rlen = 32, } }; -/* - * XTEA test vectors +/* + * XTEA test vectors */ #define XTEA_ENC_TEST_VECTORS 4 #define XTEA_DEC_TEST_VECTORS 4 -static struct cipher_testvec xtea_enc_tv_template[] = -{ +static struct cipher_testvec xtea_enc_tv_template[] = { { .key = { [0 ... 15] = 0x00 }, .klen = 16, @@ -2232,31 +2219,30 @@ static struct cipher_testvec xtea_enc_tv_template[] = .key = { 0x09, 0x65, 0x43, 0x11, 0x66, 0x44, 0x39, 0x25, 0x51, 0x3a, 0x16, 0x10, 0x0a, 0x08, 0x12, 0x6e }, .klen = 16, - .input = { 0x6c, 0x6f, 0x6e, 0x67, 0x65, 0x72, 0x5f, 0x74, + .input = { 0x6c, 0x6f, 0x6e, 0x67, 0x65, 0x72, 0x5f, 0x74, 0x65, 0x73, 0x74, 0x5f, 0x76, 0x65, 0x63, 0x74 }, .ilen = 16, - .result = { 0xe2, 0x04, 0xdb, 0xf2, 0x89, 0x85, 0x9e, 0xea, + .result = { 0xe2, 0x04, 0xdb, 0xf2, 0x89, 0x85, 0x9e, 0xea, 0x61, 0x35, 0xaa, 0xed, 0xb5, 0xcb, 0x71, 0x2c }, .rlen = 16, }, { .key = { 0x4d, 0x76, 0x32, 0x17, 0x05, 0x3f, 0x75, 0x2c, 0x5d, 0x04, 0x16, 0x36, 0x15, 0x72, 0x63, 0x2f }, .klen = 16, - .input = { 0x54, 0x65, 0x61, 0x20, 0x69, 0x73, 0x20, 0x67, - 0x6f, 0x6f, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, - 0x79, 0x6f, 0x75, 0x21, 0x21, 0x21, 0x20, 0x72, + .input = { 0x54, 0x65, 0x61, 0x20, 0x69, 0x73, 0x20, 0x67, + 0x6f, 0x6f, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, + 0x79, 0x6f, 0x75, 0x21, 0x21, 0x21, 0x20, 0x72, 0x65, 0x61, 0x6c, 0x6c, 0x79, 0x21, 0x21, 0x21 }, .ilen = 32, - .result = { 0x0b, 0x03, 0xcd, 0x8a, 0xbe, 0x95, 0xfd, 0xb1, - 0xc1, 0x44, 0x91, 0x0b, 0xa5, 0xc9, 0x1b, 0xb4, - 0xa9, 0xda, 0x1e, 0x9e, 0xb1, 0x3e, 0x2a, 0x8f, + .result = { 0x0b, 0x03, 0xcd, 0x8a, 0xbe, 0x95, 0xfd, 0xb1, + 0xc1, 0x44, 0x91, 0x0b, 0xa5, 0xc9, 0x1b, 0xb4, + 0xa9, 0xda, 0x1e, 0x9e, 0xb1, 0x3e, 0x2a, 0x8f, 0xea, 0xa5, 0x6a, 0x85, 0xd1, 0xf4, 0xa8, 0xa5 }, .rlen = 32, } }; -static struct cipher_testvec xtea_dec_tv_template[] = -{ +static struct cipher_testvec xtea_dec_tv_template[] = { { .key = { [0 ... 
15] = 0x00 }, .klen = 16, @@ -2276,24 +2262,24 @@ static struct cipher_testvec xtea_dec_tv_template[] = .key = { 0x09, 0x65, 0x43, 0x11, 0x66, 0x44, 0x39, 0x25, 0x51, 0x3a, 0x16, 0x10, 0x0a, 0x08, 0x12, 0x6e }, .klen = 16, - .input = { 0xe2, 0x04, 0xdb, 0xf2, 0x89, 0x85, 0x9e, 0xea, + .input = { 0xe2, 0x04, 0xdb, 0xf2, 0x89, 0x85, 0x9e, 0xea, 0x61, 0x35, 0xaa, 0xed, 0xb5, 0xcb, 0x71, 0x2c }, .ilen = 16, - .result = { 0x6c, 0x6f, 0x6e, 0x67, 0x65, 0x72, 0x5f, 0x74, + .result = { 0x6c, 0x6f, 0x6e, 0x67, 0x65, 0x72, 0x5f, 0x74, 0x65, 0x73, 0x74, 0x5f, 0x76, 0x65, 0x63, 0x74 }, .rlen = 16, }, { .key = { 0x4d, 0x76, 0x32, 0x17, 0x05, 0x3f, 0x75, 0x2c, 0x5d, 0x04, 0x16, 0x36, 0x15, 0x72, 0x63, 0x2f }, .klen = 16, - .input = { 0x0b, 0x03, 0xcd, 0x8a, 0xbe, 0x95, 0xfd, 0xb1, - 0xc1, 0x44, 0x91, 0x0b, 0xa5, 0xc9, 0x1b, 0xb4, - 0xa9, 0xda, 0x1e, 0x9e, 0xb1, 0x3e, 0x2a, 0x8f, + .input = { 0x0b, 0x03, 0xcd, 0x8a, 0xbe, 0x95, 0xfd, 0xb1, + 0xc1, 0x44, 0x91, 0x0b, 0xa5, 0xc9, 0x1b, 0xb4, + 0xa9, 0xda, 0x1e, 0x9e, 0xb1, 0x3e, 0x2a, 0x8f, 0xea, 0xa5, 0x6a, 0x85, 0xd1, 0xf4, 0xa8, 0xa5 }, .ilen = 32, - .result = { 0x54, 0x65, 0x61, 0x20, 0x69, 0x73, 0x20, 0x67, - 0x6f, 0x6f, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, - 0x79, 0x6f, 0x75, 0x21, 0x21, 0x21, 0x20, 0x72, + .result = { 0x54, 0x65, 0x61, 0x20, 0x69, 0x73, 0x20, 0x67, + 0x6f, 0x6f, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, + 0x79, 0x6f, 0x75, 0x21, 0x21, 0x21, 0x20, 0x72, 0x65, 0x61, 0x6c, 0x6c, 0x79, 0x21, 0x21, 0x21 }, .rlen = 32, } @@ -2305,9 +2291,9 @@ static struct cipher_testvec xtea_dec_tv_template[] = #define KHAZAD_ENC_TEST_VECTORS 5 #define KHAZAD_DEC_TEST_VECTORS 5 -static struct cipher_testvec khazad_enc_tv_template[] = { - { - .key = { 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +static struct cipher_testvec khazad_enc_tv_template[] = { + { + .key = { 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, .klen = 16, .input = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, @@ -2351,9 +2337,9 @@ static struct cipher_testvec khazad_enc_tv_template[] = { }, }; -static struct cipher_testvec khazad_dec_tv_template[] = { +static struct cipher_testvec khazad_dec_tv_template[] = { { - .key = { 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + .key = { 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, .klen = 16, .input = { 0X49, 0Xa4, 0Xce, 0X32, 0Xac, 0X19, 0X0e, 0X3f }, @@ -2697,8 +2683,7 @@ static struct comp_testvec deflate_decomp_tv_template[] = { */ #define MICHAEL_MIC_TEST_VECTORS 6 -static struct hash_testvec michael_mic_tv_template[] = -{ +static struct hash_testvec michael_mic_tv_template[] = { { .key = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, .ksize = 8, From 3cc3816f93e3f94f88503da8e6090302fa986bd6 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 22 Jun 2005 13:26:36 -0700 Subject: [PATCH 004/199] [CRYPTO]: Kill unnecessary strncpy from tcrypt It seems that bad code tends to get copied (see test_cipher_speed). So let's kill this idiom before it spreads any further. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- crypto/tcrypt.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 5a95b4a14c2b..85a88a71ff53 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -266,16 +266,16 @@ static void test_cipher(char *algo, int mode, int enc, char *key; struct cipher_testvec *cipher_tv; struct scatterlist sg[8]; - char e[11], m[4]; + const char *e, *m; if (enc == ENCRYPT) - strncpy(e, "encryption", 11); + e = "encryption"; else - strncpy(e, "decryption", 11); + e = "decryption"; if (mode == MODE_ECB) - strncpy(m, "ECB", 4); + m = "ECB"; else - strncpy(m, "CBC", 4); + m = "CBC"; printk("\ntesting %s %s %s\n", algo, m, e); From ebfd9bcf16e4aaddcfe2d1b76b50e3dd6d3242e2 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Wed, 22 Jun 2005 13:27:23 -0700 Subject: [PATCH 005/199] [CRYPTO]: Add cipher speed tests From: Reyk Floeter I recently had the requirement to do some benchmarking on cryptoapi, and I found reyk's very useful performance test patch [1]. However, I could not find any discussion on why that extension (or something providing a similar feature but different implementation) was not merged into mainline. If there was such a discussion, can someone please point me to the archive[s]? I've now merged the old patch into 2.6.12-rc1; the result can be found attached to this email. [1] http://lists.logix.cz/pipermail/padlock/2004/000010.html Signed-off-by: Harald Welte Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- crypto/tcrypt.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++-- crypto/tcrypt.h | 92 ++++++++++++++++++++++++++++++++- 2 files changed, 222 insertions(+), 4 deletions(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 85a88a71ff53..414ef5e71171 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -12,8 +12,9 @@ * Software Foundation; either version 2 of the License, or (at your option) * any later version. * - * 14 - 09 - 2003 - * Rewritten by Kartikey Mahendra Bhatt + * 2004-08-09 Added cipher speed tests (Reyk Floeter ) + * 2003-09-14 Rewritten by Kartikey Mahendra Bhatt + * */ #include @@ -25,12 +26,13 @@ #include #include #include +#include #include "tcrypt.h" /* * Need to kmalloc() memory for testing kmap().
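* (Illustrative aside, not part of the original patch: TVMEMSIZE is raised
* to 16384 below so that the largest speed-test case, an 8192-byte block
* plus its key, passes the blen + klen > TVMEMSIZE check in
* test_cipher_speed(); a typical invocation of the new tests would be
* "modprobe tcrypt mode=200 sec=10" for the AES speed cases in do_test().)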
*/ -#define TVMEMSIZE 4096 +#define TVMEMSIZE 16384 #define XBUFSIZE 32768 /* @@ -55,6 +57,11 @@ static unsigned int IDX[8] = { IDX1, IDX2, IDX3, IDX4, IDX5, IDX6, IDX7, IDX8 }; +/* + * Used by test_cipher_speed() + */ +static unsigned int sec = 10; + static int mode; static char *xbuf; static char *tvmem; @@ -419,6 +426,90 @@ static void test_cipher(char *algo, int mode, int enc, crypto_free_tfm(tfm); } +static void test_cipher_speed(char *algo, int mode, int enc, unsigned int sec, + struct cipher_speed *speed) +{ + unsigned int ret, i, iv_len; + unsigned char *key, *p, iv[128]; + struct crypto_tfm *tfm; + struct scatterlist sg[8]; + unsigned long start, bcount; + const char *e, *m; + + if (enc == ENCRYPT) + e = "encryption"; + else + e = "decryption"; + if (mode == MODE_ECB) + m = "ECB"; + else + m = "CBC"; + + printk("\ntesting speed of %s %s %s\n", algo, m, e); + + if (mode) + tfm = crypto_alloc_tfm(algo, 0); + else + tfm = crypto_alloc_tfm(algo, CRYPTO_TFM_MODE_CBC); + + if (tfm == NULL) { + printk("failed to load transform for %s %s\n", algo, m); + return; + } + + for (i = 0; speed[i].klen != 0; i++) { + if ((speed[i].blen + speed[i].klen) > TVMEMSIZE) { + printk("template (%u) too big for tvmem (%u)\n", + speed[i].blen + speed[i].klen, TVMEMSIZE); + goto out; + } + + printk("test %u (%d bit key, %d byte blocks): ", i, + speed[i].klen * 8, speed[i].blen); + + memset(tvmem, 0xff, speed[i].klen + speed[i].blen); + + /* set key, plain text and IV */ + key = (unsigned char *)tvmem; + p = (unsigned char *)tvmem + speed[i].klen; + + ret = crypto_cipher_setkey(tfm, key, speed[i].klen); + if (ret) { + printk("setkey() failed flags=%x\n", tfm->crt_flags); + goto out; + } + + if (!mode) { + iv_len = crypto_tfm_alg_ivsize(tfm); + memset(&iv, 0xff, iv_len); + crypto_cipher_set_iv(tfm, iv, iv_len); + } + + for (start = jiffies, bcount = 0; + ((jiffies - start) / HZ) < sec; bcount++) { + sg[0].page = virt_to_page(p); + sg[0].offset = offset_in_page(p); + sg[0].length = speed[i].blen; + + if (enc) + ret = crypto_cipher_encrypt(tfm, sg, sg, speed[i].blen); + else + ret = crypto_cipher_decrypt(tfm, sg, sg, speed[i].blen); + + if (ret) { + printk("%s () failed flags=%x\n", e, tfm->crt_flags); + goto out; + } + } + + printk("%lu operations in %u seconds (%lu bytes)\n", + bcount, sec, bcount * speed[i].blen); + } + +out: + crypto_free_tfm(tfm); +} + static void test_deflate(void) { unsigned int i; @@ -861,6 +952,41 @@ static void do_test(void) #endif + case 200: + test_cipher_speed("aes", MODE_ECB, ENCRYPT, sec, aes_speed_template); + test_cipher_speed("aes", MODE_ECB, DECRYPT, sec, aes_speed_template); + test_cipher_speed("aes", MODE_CBC, ENCRYPT, sec, aes_speed_template); + test_cipher_speed("aes", MODE_CBC, DECRYPT, sec, aes_speed_template); + break; + + case 201: + test_cipher_speed("des3_ede", MODE_ECB, ENCRYPT, sec, des3_ede_speed_template); + test_cipher_speed("des3_ede", MODE_ECB, DECRYPT, sec, des3_ede_speed_template); + test_cipher_speed("des3_ede", MODE_CBC, ENCRYPT, sec, des3_ede_speed_template); + test_cipher_speed("des3_ede", MODE_CBC, DECRYPT, sec, des3_ede_speed_template); + break; + + case 202: + test_cipher_speed("twofish", MODE_ECB, ENCRYPT, sec, twofish_speed_template); + test_cipher_speed("twofish", MODE_ECB, DECRYPT, sec, twofish_speed_template); + test_cipher_speed("twofish", MODE_CBC, ENCRYPT, sec, twofish_speed_template); + test_cipher_speed("twofish", MODE_CBC, DECRYPT, sec, twofish_speed_template); + break; + + case 203: + test_cipher_speed("blowfish", MODE_ECB, ENCRYPT, sec, 
blowfish_speed_template); + test_cipher_speed("blowfish", MODE_ECB, DECRYPT, sec, blowfish_speed_template); + test_cipher_speed("blowfish", MODE_CBC, ENCRYPT, sec, blowfish_speed_template); + test_cipher_speed("blowfish", MODE_CBC, DECRYPT, sec, blowfish_speed_template); + break; + + case 204: + test_cipher_speed("des", MODE_ECB, ENCRYPT, sec, des_speed_template); + test_cipher_speed("des", MODE_ECB, DECRYPT, sec, des_speed_template); + test_cipher_speed("des", MODE_CBC, ENCRYPT, sec, des_speed_template); + test_cipher_speed("des", MODE_CBC, DECRYPT, sec, des_speed_template); + break; + case 1000: test_available(); break; @@ -901,6 +1027,8 @@ module_init(init); module_exit(fini); module_param(mode, int, 0); +module_param(sec, uint, 0); +MODULE_PARM_DESC(sec, "Length in seconds of speed tests"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Quick & dirty crypto testing module"); diff --git a/crypto/tcrypt.h b/crypto/tcrypt.h index 72d40704042f..c01a0ce9b40a 100644 --- a/crypto/tcrypt.h +++ b/crypto/tcrypt.h @@ -12,7 +12,8 @@ * Software Foundation; either version 2 of the License, or (at your option) * any later version. * - * 14 - 09 - 2003 Changes by Kartikey Mahendra Bhatt + * 2004-08-09 Cipher speed tests by Reyk Floeter + * 2003-09-14 Changes by Kartikey Mahendra Bhatt * */ #ifndef _CRYPTO_TCRYPT_H @@ -58,6 +59,11 @@ struct cipher_testvec { unsigned char tap[MAX_TAP]; }; +struct cipher_speed { + unsigned char klen; + unsigned int blen; +}; + /* * MD4 test vectors from RFC1320 */ @@ -2728,4 +2734,88 @@ static struct hash_testvec michael_mic_tv_template[] = { } }; +/* + * Cipher speed tests + */ +static struct cipher_speed aes_speed_template[] = { + { .klen = 16, .blen = 16, }, + { .klen = 16, .blen = 64, }, + { .klen = 16, .blen = 256, }, + { .klen = 16, .blen = 1024, }, + { .klen = 16, .blen = 8192, }, + { .klen = 24, .blen = 16, }, + { .klen = 24, .blen = 64, }, + { .klen = 24, .blen = 256, }, + { .klen = 24, .blen = 1024, }, + { .klen = 24, .blen = 8192, }, + { .klen = 32, .blen = 16, }, + { .klen = 32, .blen = 64, }, + { .klen = 32, .blen = 256, }, + { .klen = 32, .blen = 1024, }, + { .klen = 32, .blen = 8192, }, + + /* End marker */ + { .klen = 0, .blen = 0, } +}; + +static struct cipher_speed des3_ede_speed_template[] = { + { .klen = 24, .blen = 16, }, + { .klen = 24, .blen = 64, }, + { .klen = 24, .blen = 256, }, + { .klen = 24, .blen = 1024, }, + { .klen = 24, .blen = 8192, }, + + /* End marker */ + { .klen = 0, .blen = 0, } +}; + +static struct cipher_speed twofish_speed_template[] = { + { .klen = 16, .blen = 16, }, + { .klen = 16, .blen = 64, }, + { .klen = 16, .blen = 256, }, + { .klen = 16, .blen = 1024, }, + { .klen = 16, .blen = 8192, }, + { .klen = 24, .blen = 16, }, + { .klen = 24, .blen = 64, }, + { .klen = 24, .blen = 256, }, + { .klen = 24, .blen = 1024, }, + { .klen = 24, .blen = 8192, }, + { .klen = 32, .blen = 16, }, + { .klen = 32, .blen = 64, }, + { .klen = 32, .blen = 256, }, + { .klen = 32, .blen = 1024, }, + { .klen = 32, .blen = 8192, }, + + /* End marker */ + { .klen = 0, .blen = 0, } +}; + +static struct cipher_speed blowfish_speed_template[] = { + /* Don't support blowfish keys > 256 bit in this test */ + { .klen = 8, .blen = 16, }, + { .klen = 8, .blen = 64, }, + { .klen = 8, .blen = 256, }, + { .klen = 8, .blen = 1024, }, + { .klen = 8, .blen = 8192, }, + { .klen = 32, .blen = 16, }, + { .klen = 32, .blen = 64, }, + { .klen = 32, .blen = 256, }, + { .klen = 32, .blen = 1024, }, + { .klen = 32, .blen = 8192, }, + + /* End marker */ + { .klen = 0, .blen = 0, } 
+}; + +static struct cipher_speed des_speed_template[] = { + { .klen = 8, .blen = 16, }, + { .klen = 8, .blen = 64, }, + { .klen = 8, .blen = 256, }, + { .klen = 8, .blen = 1024, }, + { .klen = 8, .blen = 8192, }, + + /* End marker */ + { .klen = 0, .blen = 0, } +}; + #endif /* _CRYPTO_TCRYPT_H */ From dce907c00ff246a1fbb2b619964753ebc046591d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 22 Jun 2005 13:27:51 -0700 Subject: [PATCH 006/199] [CRYPTO]: Use template keys for speed tests if possible The existing keys used in the speed tests do not pass the 3DES quality check. This patch makes it use the template keys instead. Other algorithms can supply template keys through the same interface if needed. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- crypto/tcrypt.c | 79 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 57 insertions(+), 22 deletions(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 414ef5e71171..401d25ac214f 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -427,9 +427,10 @@ static void test_cipher(char *algo, int mode, int enc, } static void test_cipher_speed(char *algo, int mode, int enc, unsigned int sec, - struct cipher_speed *speed) + struct cipher_testvec *template, + unsigned int tcount, struct cipher_speed *speed) { - unsigned int ret, i, iv_len; + unsigned int ret, i, j, iv_len; unsigned char *key, *p, iv[128]; struct crypto_tfm *tfm; struct scatterlist sg[8]; @@ -471,6 +472,12 @@ static void test_cipher_speed(char *algo, int mode, int enc, unsigned int sec, /* set key, plain text and IV */ key = (unsigned char *)tvmem; + for (j = 0; j < tcount; j++) { + if (template[j].klen == speed[i].klen) { + key = template[j].key; + break; + } + } p = (unsigned char *)tvmem + speed[i].klen; ret = crypto_cipher_setkey(tfm, key, speed[i].klen); @@ -953,38 +960,66 @@ static void do_test(void) #endif case 200: - test_cipher_speed("aes", MODE_ECB, ENCRYPT, sec, aes_speed_template); - test_cipher_speed("aes", MODE_ECB, DECRYPT, sec, aes_speed_template); - test_cipher_speed("aes", MODE_CBC, ENCRYPT, sec, aes_speed_template); - test_cipher_speed("aes", MODE_CBC, DECRYPT, sec, aes_speed_template); + test_cipher_speed("aes", MODE_ECB, ENCRYPT, sec, NULL, 0, + aes_speed_template); + test_cipher_speed("aes", MODE_ECB, DECRYPT, sec, NULL, 0, + aes_speed_template); + test_cipher_speed("aes", MODE_CBC, ENCRYPT, sec, NULL, 0, + aes_speed_template); + test_cipher_speed("aes", MODE_CBC, DECRYPT, sec, NULL, 0, + aes_speed_template); break; case 201: - test_cipher_speed("des3_ede", MODE_ECB, ENCRYPT, sec, des3_ede_speed_template); - test_cipher_speed("des3_ede", MODE_ECB, DECRYPT, sec, des3_ede_speed_template); - test_cipher_speed("des3_ede", MODE_CBC, ENCRYPT, sec, des3_ede_speed_template); - test_cipher_speed("des3_ede", MODE_CBC, DECRYPT, sec, des3_ede_speed_template); + test_cipher_speed("des3_ede", MODE_ECB, ENCRYPT, sec, + des3_ede_enc_tv_template, + DES3_EDE_ENC_TEST_VECTORS, + des3_ede_speed_template); + test_cipher_speed("des3_ede", MODE_ECB, DECRYPT, sec, + des3_ede_dec_tv_template, + DES3_EDE_DEC_TEST_VECTORS, + des3_ede_speed_template); + test_cipher_speed("des3_ede", MODE_CBC, ENCRYPT, sec, + des3_ede_enc_tv_template, + DES3_EDE_ENC_TEST_VECTORS, + des3_ede_speed_template); + test_cipher_speed("des3_ede", MODE_CBC, DECRYPT, sec, + des3_ede_dec_tv_template, + DES3_EDE_DEC_TEST_VECTORS, + des3_ede_speed_template); break; case 202: - test_cipher_speed("twofish", MODE_ECB, ENCRYPT, sec, twofish_speed_template); - test_cipher_speed("twofish", 
MODE_ECB, DECRYPT, sec, twofish_speed_template); - test_cipher_speed("twofish", MODE_CBC, ENCRYPT, sec, twofish_speed_template); - test_cipher_speed("twofish", MODE_CBC, DECRYPT, sec, twofish_speed_template); + test_cipher_speed("twofish", MODE_ECB, ENCRYPT, sec, NULL, 0, + twofish_speed_template); + test_cipher_speed("twofish", MODE_ECB, DECRYPT, sec, NULL, 0, + twofish_speed_template); + test_cipher_speed("twofish", MODE_CBC, ENCRYPT, sec, NULL, 0, + twofish_speed_template); + test_cipher_speed("twofish", MODE_CBC, DECRYPT, sec, NULL, 0, + twofish_speed_template); break; case 203: - test_cipher_speed("blowfish", MODE_ECB, ENCRYPT, sec, blowfish_speed_template); - test_cipher_speed("blowfish", MODE_ECB, DECRYPT, sec, blowfish_speed_template); - test_cipher_speed("blowfish", MODE_CBC, ENCRYPT, sec, blowfish_speed_template); - test_cipher_speed("blowfish", MODE_CBC, DECRYPT, sec, blowfish_speed_template); + test_cipher_speed("blowfish", MODE_ECB, ENCRYPT, sec, NULL, 0, + blowfish_speed_template); + test_cipher_speed("blowfish", MODE_ECB, DECRYPT, sec, NULL, 0, + blowfish_speed_template); + test_cipher_speed("blowfish", MODE_CBC, ENCRYPT, sec, NULL, 0, + blowfish_speed_template); + test_cipher_speed("blowfish", MODE_CBC, DECRYPT, sec, NULL, 0, + blowfish_speed_template); break; case 204: - test_cipher_speed("des", MODE_ECB, ENCRYPT, sec, des_speed_template); - test_cipher_speed("des", MODE_ECB, DECRYPT, sec, des_speed_template); - test_cipher_speed("des", MODE_CBC, ENCRYPT, sec, des_speed_template); - test_cipher_speed("des", MODE_CBC, DECRYPT, sec, des_speed_template); + test_cipher_speed("des", MODE_ECB, ENCRYPT, sec, NULL, 0, + des_speed_template); + test_cipher_speed("des", MODE_ECB, DECRYPT, sec, NULL, 0, + des_speed_template); + test_cipher_speed("des", MODE_CBC, ENCRYPT, sec, NULL, 0, + des_speed_template); + test_cipher_speed("des", MODE_CBC, DECRYPT, sec, NULL, 0, + des_speed_template); break; case 1000: From 6a17944ca12229036a6d8d48be1b5eb51204fcf8 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 22 Jun 2005 13:29:03 -0700 Subject: [PATCH 007/199] [CRYPTO]: Use CPU cycle counters in tcrypt After using this facility for a while to test my changes to the cipher crypt() layer, I realised that I should've listened to Dave and made this thing use CPU cycle counters :) As it is, it's too jittery for me to feel safe about relying on the results. So here is a patch to make it use CPU cycles by default but fall back to jiffies if the user specifies a non-zero sec value. Signed-off-by: Herbert Xu Signed-off-by: David S.
Miller --- crypto/tcrypt.c | 116 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 95 insertions(+), 21 deletions(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 401d25ac214f..bd7524cfff33 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -27,6 +27,8 @@ #include #include #include +#include +#include #include "tcrypt.h" /* @@ -60,7 +62,7 @@ static unsigned int IDX[8] = { IDX1, IDX2, IDX3, IDX4, IDX5, IDX6, IDX7, IDX8 }; /* * Used by test_cipher_speed() */ -static unsigned int sec = 10; +static unsigned int sec; static int mode; static char *xbuf; @@ -426,6 +428,88 @@ static void test_cipher(char *algo, int mode, int enc, crypto_free_tfm(tfm); } +static int test_cipher_jiffies(struct crypto_tfm *tfm, int enc, char *p, + int blen, int sec) +{ + struct scatterlist sg[8]; + unsigned long start, end; + int bcount; + int ret; + + sg[0].page = virt_to_page(p); + sg[0].offset = offset_in_page(p); + sg[0].length = blen; + + for (start = jiffies, end = start + sec * HZ, bcount = 0; + time_before(jiffies, end); bcount++) { + if (enc) + ret = crypto_cipher_encrypt(tfm, sg, sg, blen); + else + ret = crypto_cipher_decrypt(tfm, sg, sg, blen); + + if (ret) + return ret; + } + + printk("%d operations in %d seconds (%ld bytes)\n", + bcount, sec, (long)bcount * blen); + return 0; +} + +static int test_cipher_cycles(struct crypto_tfm *tfm, int enc, char *p, + int blen) +{ + struct scatterlist sg[8]; + unsigned long cycles = 0; + int ret = 0; + int i; + + sg[0].page = virt_to_page(p); + sg[0].offset = offset_in_page(p); + sg[0].length = blen; + + local_bh_disable(); + local_irq_disable(); + + /* Warm-up run. */ + for (i = 0; i < 4; i++) { + if (enc) + ret = crypto_cipher_encrypt(tfm, sg, sg, blen); + else + ret = crypto_cipher_decrypt(tfm, sg, sg, blen); + + if (ret) + goto out; + } + + /* The real thing. 
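+ * Eight timed iterations follow; the printk() below reports
+ * (cycles + 4) / 8, i.e. the summed cycle count rounded to the
+ * nearest whole cycle per operation.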
*/ + for (i = 0; i < 8; i++) { + cycles_t start, end; + + start = get_cycles(); + if (enc) + ret = crypto_cipher_encrypt(tfm, sg, sg, blen); + else + ret = crypto_cipher_decrypt(tfm, sg, sg, blen); + end = get_cycles(); + + if (ret) + goto out; + + cycles += end - start; + } + +out: + local_irq_enable(); + local_bh_enable(); + + if (ret == 0) + printk("1 operation in %lu cycles (%d bytes)\n", + (cycles + 4) / 8, blen); + + return ret; +} + static void test_cipher_speed(char *algo, int mode, int enc, unsigned int sec, struct cipher_testvec *template, unsigned int tcount, struct cipher_speed *speed) @@ -433,8 +517,6 @@ static void test_cipher_speed(char *algo, int mode, int enc, unsigned int sec, unsigned int ret, i, j, iv_len; unsigned char *key, *p, iv[128]; struct crypto_tfm *tfm; - struct scatterlist sg[8]; - unsigned long start, bcount; const char *e, *m; if (enc == ENCRYPT) @@ -492,25 +574,16 @@ static void test_cipher_speed(char *algo, int mode, int enc, unsigned int sec, crypto_cipher_set_iv(tfm, iv, iv_len); } - for (start = jiffies, bcount = 0; - ((jiffies - start) / HZ) < sec; bcount++) { - sg[0].page = virt_to_page(p); - sg[0].offset = offset_in_page(p); - sg[0].length = speed[i].blen; + if (sec) + ret = test_cipher_jiffies(tfm, enc, p, speed[i].blen, + sec); + else + ret = test_cipher_cycles(tfm, enc, p, speed[i].blen); - if (enc) - ret = crypto_cipher_encrypt(tfm, sg, sg, speed[i].blen); - else - ret = crypto_cipher_decrypt(tfm, sg, sg, speed[i].blen); - - if (ret) { - printk("%s () failed flags=%x\n", e, tfm->crt_flags); - goto out; - } + if (ret) { + printk("%s() failed flags=%x\n", e, tfm->crt_flags); + break; } - - printk("%lu operations in %u seconds (%lu bytes)\n", - bcount, sec, bcount * speed[i].blen); } out: crypto_free_tfm(tfm); @@ -1063,7 +1136,8 @@ module_exit(fini); module_param(mode, int, 0); module_param(sec, uint, 0); -MODULE_PARM_DESC(sec, "Length in seconds of speed tests"); +MODULE_PARM_DESC(sec, "Length in seconds of speed tests " + "(defaults to zero which uses CPU cycles instead)"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Quick & dirty crypto testing module"); From f31f5f051269746179b01017fc5e3dcf6b37c67e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Jun 2005 14:32:51 -0700 Subject: [PATCH 008/199] [NET]: don't use strlen() but the result from a prior sprintf() Small patch to save an unnecessary call to strlen(): sprintf() gave us the length, just trust it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/socket.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/socket.c b/net/socket.c index 38729af09461..6f2a17881972 100644 --- a/net/socket.c +++ b/net/socket.c @@ -383,9 +383,8 @@ int sock_map_fd(struct socket *sock) goto out; } - sprintf(name, "[%lu]", SOCK_INODE(sock)->i_ino); + this.len = sprintf(name, "[%lu]", SOCK_INODE(sock)->i_ino); this.name = name; - this.len = strlen(name); this.hash = SOCK_INODE(sock)->i_ino; file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this); From 10f7e7c15e6ce41799c5dba6925ae4bf8048c870 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Jun 2005 09:43:07 +1000 Subject: [PATCH 009/199] [PATCH] ppc64: consolidate calibrate_decr implementations pSeries and maple have almost the same code for calibrate_decr, and BPA would need yet another copy. Instead, I'm moving the code to arch/ppc64/kernel/time.c. Some of the related declarations were missing from header files, so I'm moving those as well.
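In miniature, the consolidation looks like this; example_md is a placeholder name (the real instances in the hunks below are maple_md and pSeries_md), but the wiring is exactly what the patch does:

        struct machdep_calls __initdata example_md = {
                /* ...platform-specific callbacks stay as they were... */
                .calibrate_decr = generic_calibrate_decr, /* shared helper in time.c */
        };

Each platform deletes its private calibrate_decr copy and points its machdep entry at the common implementation.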
It makes sense to merge this with the pmac function of the same name, so we end up having just one implementation for iSeries and one for Open Firmware based machines. Signed-off-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- arch/ppc64/kernel/iSeries_setup.c | 5 --- arch/ppc64/kernel/maple_setup.c | 2 +- arch/ppc64/kernel/maple_time.c | 51 ----------------------- arch/ppc64/kernel/pSeries_setup.c | 69 +------------------------------ arch/ppc64/kernel/pmac_time.c | 8 +--- arch/ppc64/kernel/setup.c | 5 +-- arch/ppc64/kernel/time.c | 63 ++++++++++++++++++++++++++++ include/asm-ppc64/time.h | 9 ++++ 8 files changed, 76 insertions(+), 136 deletions(-) diff --git a/arch/ppc64/kernel/iSeries_setup.c b/arch/ppc64/kernel/iSeries_setup.c index b31962436fe3..86966ce76b58 100644 --- a/arch/ppc64/kernel/iSeries_setup.c +++ b/arch/ppc64/kernel/iSeries_setup.c @@ -671,9 +671,6 @@ static void __init iSeries_bolt_kernel(unsigned long saddr, unsigned long eaddr) } } -extern unsigned long ppc_proc_freq; -extern unsigned long ppc_tb_freq; - /* * Document me. */ @@ -772,8 +769,6 @@ static void iSeries_halt(void) mf_power_off(); } -extern void setup_default_decr(void); - /* * void __init iSeries_calibrate_decr() * diff --git a/arch/ppc64/kernel/maple_setup.c b/arch/ppc64/kernel/maple_setup.c index 8cf95a27178e..3dabedb8553b 100644 --- a/arch/ppc64/kernel/maple_setup.c +++ b/arch/ppc64/kernel/maple_setup.c @@ -235,6 +235,6 @@ struct machdep_calls __initdata maple_md = { .get_boot_time = maple_get_boot_time, .set_rtc_time = maple_set_rtc_time, .get_rtc_time = maple_get_rtc_time, - .calibrate_decr = maple_calibrate_decr, + .calibrate_decr = generic_calibrate_decr, .progress = maple_progress, }; diff --git a/arch/ppc64/kernel/maple_time.c b/arch/ppc64/kernel/maple_time.c index 07ce7895b43d..d65210abcd03 100644 --- a/arch/ppc64/kernel/maple_time.c +++ b/arch/ppc64/kernel/maple_time.c @@ -42,11 +42,8 @@ #define DBG(x...) #endif -extern void setup_default_decr(void); extern void GregorianDay(struct rtc_time * tm); -extern unsigned long ppc_tb_freq; -extern unsigned long ppc_proc_freq; static int maple_rtc_addr; static int maple_clock_read(int addr) @@ -176,51 +173,3 @@ void __init maple_get_boot_time(struct rtc_time *tm) maple_get_rtc_time(tm); } -/* XXX FIXME: Some sane defaults: 125 MHz timebase, 1GHz processor */ -#define DEFAULT_TB_FREQ 125000000UL -#define DEFAULT_PROC_FREQ (DEFAULT_TB_FREQ * 8) - -void __init maple_calibrate_decr(void) -{ - struct device_node *cpu; - struct div_result divres; - unsigned int *fp = NULL; - - /* - * The cpu node should have a timebase-frequency property - * to tell us the rate at which the decrementer counts.
- */ - cpu = of_find_node_by_type(NULL, "cpu"); - - ppc_tb_freq = DEFAULT_TB_FREQ; - if (cpu != 0) - fp = (unsigned int *)get_property(cpu, "timebase-frequency", NULL); - if (fp != NULL) - ppc_tb_freq = *fp; - else - printk(KERN_ERR "WARNING: Estimating decrementer frequency (not found)\n"); - fp = NULL; - ppc_proc_freq = DEFAULT_PROC_FREQ; - if (cpu != 0) - fp = (unsigned int *)get_property(cpu, "clock-frequency", NULL); - if (fp != NULL) - ppc_proc_freq = *fp; - else - printk(KERN_ERR "WARNING: Estimating processor frequency (not found)\n"); - - of_node_put(cpu); - - printk(KERN_INFO "time_init: decrementer frequency = %lu.%.6lu MHz\n", - ppc_tb_freq/1000000, ppc_tb_freq%1000000); - printk(KERN_INFO "time_init: processor frequency = %lu.%.6lu MHz\n", - ppc_proc_freq/1000000, ppc_proc_freq%1000000); - - tb_ticks_per_jiffy = ppc_tb_freq / HZ; - tb_ticks_per_sec = tb_ticks_per_jiffy * HZ; - tb_ticks_per_usec = ppc_tb_freq / 1000000; - tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000); - div128_by_32(1024*1024, 0, tb_ticks_per_sec, &divres); - tb_to_xs = divres.result_low; - - setup_default_decr(); -} diff --git a/arch/ppc64/kernel/pSeries_setup.c b/arch/ppc64/kernel/pSeries_setup.c index 6c0d1d58a552..f9a310c0c8d7 100644 --- a/arch/ppc64/kernel/pSeries_setup.c +++ b/arch/ppc64/kernel/pSeries_setup.c @@ -84,9 +84,6 @@ extern void generic_find_legacy_serial_ports(u64 *physport, int fwnmi_active; /* TRUE if an FWNMI handler is present */ -extern unsigned long ppc_proc_freq; -extern unsigned long ppc_tb_freq; - extern void pSeries_system_reset_exception(struct pt_regs *regs); extern int pSeries_machine_check_exception(struct pt_regs *regs); @@ -482,70 +479,6 @@ static void pSeries_progress(char *s, unsigned short hex) spin_unlock(&progress_lock); } -extern void setup_default_decr(void); - -/* Some sane defaults: 125 MHz timebase, 1GHz processor */ -#define DEFAULT_TB_FREQ 125000000UL -#define DEFAULT_PROC_FREQ (DEFAULT_TB_FREQ * 8) - -static void __init pSeries_calibrate_decr(void) -{ - struct device_node *cpu; - struct div_result divres; - unsigned int *fp; - int node_found; - - /* - * The cpu node should have a timebase-frequency property - * to tell us the rate at which the decrementer counts. 
- */ - cpu = of_find_node_by_type(NULL, "cpu"); - - ppc_tb_freq = DEFAULT_TB_FREQ; /* hardcoded default */ - node_found = 0; - if (cpu != 0) { - fp = (unsigned int *)get_property(cpu, "timebase-frequency", - NULL); - if (fp != 0) { - node_found = 1; - ppc_tb_freq = *fp; - } - } - if (!node_found) - printk(KERN_ERR "WARNING: Estimating decrementer frequency " - "(not found)\n"); - - ppc_proc_freq = DEFAULT_PROC_FREQ; - node_found = 0; - if (cpu != 0) { - fp = (unsigned int *)get_property(cpu, "clock-frequency", - NULL); - if (fp != 0) { - node_found = 1; - ppc_proc_freq = *fp; - } - } - if (!node_found) - printk(KERN_ERR "WARNING: Estimating processor frequency " - "(not found)\n"); - - of_node_put(cpu); - - printk(KERN_INFO "time_init: decrementer frequency = %lu.%.6lu MHz\n", - ppc_tb_freq/1000000, ppc_tb_freq%1000000); - printk(KERN_INFO "time_init: processor frequency = %lu.%.6lu MHz\n", - ppc_proc_freq/1000000, ppc_proc_freq%1000000); - - tb_ticks_per_jiffy = ppc_tb_freq / HZ; - tb_ticks_per_sec = tb_ticks_per_jiffy * HZ; - tb_ticks_per_usec = ppc_tb_freq / 1000000; - tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000); - div128_by_32(1024*1024, 0, tb_ticks_per_sec, &divres); - tb_to_xs = divres.result_low; - - setup_default_decr(); -} - static int pSeries_check_legacy_ioport(unsigned int baseport) { struct device_node *np; @@ -604,7 +537,7 @@ struct machdep_calls __initdata pSeries_md = { .get_boot_time = pSeries_get_boot_time, .get_rtc_time = pSeries_get_rtc_time, .set_rtc_time = pSeries_set_rtc_time, - .calibrate_decr = pSeries_calibrate_decr, + .calibrate_decr = generic_calibrate_decr, .progress = pSeries_progress, .check_legacy_ioport = pSeries_check_legacy_ioport, .system_reset_exception = pSeries_system_reset_exception, diff --git a/arch/ppc64/kernel/pmac_time.c b/arch/ppc64/kernel/pmac_time.c index f24827581dd7..3059edb09cc8 100644 --- a/arch/ppc64/kernel/pmac_time.c +++ b/arch/ppc64/kernel/pmac_time.c @@ -40,11 +40,6 @@ #define DBG(x...) #endif -extern void setup_default_decr(void); - -extern unsigned long ppc_tb_freq; -extern unsigned long ppc_proc_freq; - /* Apparently the RTC stores seconds since 1 Jan 1904 */ #define RTC_OFFSET 2082844800 @@ -161,8 +156,7 @@ void __init pmac_get_boot_time(struct rtc_time *tm) /* * Query the OF and get the decr frequency. - * This was taken from the pmac time_init() when merging the prep/pmac - * time functions. 
+ * FIXME: merge this with generic_calibrate_decr */ void __init pmac_calibrate_decr(void) { diff --git a/arch/ppc64/kernel/setup.c b/arch/ppc64/kernel/setup.c index 8e439a817642..93b0ee88cda1 100644 --- a/arch/ppc64/kernel/setup.c +++ b/arch/ppc64/kernel/setup.c @@ -701,9 +701,6 @@ void machine_halt(void) EXPORT_SYMBOL(machine_halt); -unsigned long ppc_proc_freq; -unsigned long ppc_tb_freq; - static int ppc64_panic_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -1117,7 +1114,7 @@ void ppc64_dump_msg(unsigned int src, const char *msg) } /* This should only be called on processor 0 during calibrate decr */ -void setup_default_decr(void) +void __init setup_default_decr(void) { struct paca_struct *lpaca = get_paca(); diff --git a/arch/ppc64/kernel/time.c b/arch/ppc64/kernel/time.c index 33364a7d2cd2..2348a75e050d 100644 --- a/arch/ppc64/kernel/time.c +++ b/arch/ppc64/kernel/time.c @@ -107,6 +107,9 @@ void ppc_adjtimex(void); static unsigned adjusting_time = 0; +unsigned long ppc_proc_freq; +unsigned long ppc_tb_freq; + static __inline__ void timer_check_rtc(void) { /* @@ -472,6 +475,66 @@ int do_settimeofday(struct timespec *tv) EXPORT_SYMBOL(do_settimeofday); +#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_MAPLE) || defined(CONFIG_PPC_BPA) +void __init generic_calibrate_decr(void) +{ + struct device_node *cpu; + struct div_result divres; + unsigned int *fp; + int node_found; + + /* + * The cpu node should have a timebase-frequency property + * to tell us the rate at which the decrementer counts. + */ + cpu = of_find_node_by_type(NULL, "cpu"); + + ppc_tb_freq = DEFAULT_TB_FREQ; /* hardcoded default */ + node_found = 0; + if (cpu != 0) { + fp = (unsigned int *)get_property(cpu, "timebase-frequency", + NULL); + if (fp != 0) { + node_found = 1; + ppc_tb_freq = *fp; + } + } + if (!node_found) + printk(KERN_ERR "WARNING: Estimating decrementer frequency " + "(not found)\n"); + + ppc_proc_freq = DEFAULT_PROC_FREQ; + node_found = 0; + if (cpu != 0) { + fp = (unsigned int *)get_property(cpu, "clock-frequency", + NULL); + if (fp != 0) { + node_found = 1; + ppc_proc_freq = *fp; + } + } + if (!node_found) + printk(KERN_ERR "WARNING: Estimating processor frequency " + "(not found)\n"); + + of_node_put(cpu); + + printk(KERN_INFO "time_init: decrementer frequency = %lu.%.6lu MHz\n", + ppc_tb_freq/1000000, ppc_tb_freq%1000000); + printk(KERN_INFO "time_init: processor frequency = %lu.%.6lu MHz\n", + ppc_proc_freq/1000000, ppc_proc_freq%1000000); + + tb_ticks_per_jiffy = ppc_tb_freq / HZ; + tb_ticks_per_sec = tb_ticks_per_jiffy * HZ; + tb_ticks_per_usec = ppc_tb_freq / 1000000; + tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000); + div128_by_32(1024*1024, 0, tb_ticks_per_sec, &divres); + tb_to_xs = divres.result_low; + + setup_default_decr(); +} +#endif + void __init time_init(void) { /* This function is only called on the boot processor */ diff --git a/include/asm-ppc64/time.h b/include/asm-ppc64/time.h index 8d6e3760ee10..c6c762cad8b0 100644 --- a/include/asm-ppc64/time.h +++ b/include/asm-ppc64/time.h @@ -34,6 +34,15 @@ struct rtc_time; extern void to_tm(int tim, struct rtc_time * tm); extern time_t last_rtc_update; +void generic_calibrate_decr(void); +void setup_default_decr(void); + +/* Some sane defaults: 125 MHz timebase, 1GHz processor */ +extern unsigned long ppc_proc_freq; +#define DEFAULT_PROC_FREQ (DEFAULT_TB_FREQ * 8) +extern unsigned long ppc_tb_freq; +#define DEFAULT_TB_FREQ 125000000UL + /* * By putting all of this stuff into a single struct we * reduce the 
number of cache lines touched by do_gettimeofday. From 773bf9c469c01f01280c9bd45ec2462dd94d08a0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Jun 2005 09:43:18 +1000 Subject: [PATCH 010/199] [PATCH] ppc64: rename pSeries rtc functions into rtas_* The RTC RTAS functions are not pSeries-specific but can also be used by BPA and other SLOF-based platforms. Signed-off-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- arch/ppc64/kernel/pSeries_setup.c | 9 +++------ arch/ppc64/kernel/rtc.c | 6 +++--- include/asm-ppc64/rtas.h | 5 +++++ 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/ppc64/kernel/pSeries_setup.c b/arch/ppc64/kernel/pSeries_setup.c index f9a310c0c8d7..c5f3ccac7cd1 100644 --- a/arch/ppc64/kernel/pSeries_setup.c +++ b/arch/ppc64/kernel/pSeries_setup.c @@ -73,9 +73,6 @@ extern void pSeries_final_fixup(void); -extern void pSeries_get_boot_time(struct rtc_time *rtc_time); -extern void pSeries_get_rtc_time(struct rtc_time *rtc_time); -extern int pSeries_set_rtc_time(struct rtc_time *rtc_time); extern void find_udbg_vterm(void); extern void system_reset_fwnmi(void); /* from head.S */ extern void machine_check_fwnmi(void); /* from head.S */ @@ -534,9 +531,9 @@ struct machdep_calls __initdata pSeries_md = { .halt = rtas_halt, .panic = rtas_os_term, .cpu_die = pSeries_mach_cpu_die, - .get_boot_time = pSeries_get_boot_time, - .get_rtc_time = pSeries_get_rtc_time, - .set_rtc_time = pSeries_set_rtc_time, + .get_boot_time = rtas_get_boot_time, + .get_rtc_time = rtas_get_rtc_time, + .set_rtc_time = rtas_set_rtc_time, .calibrate_decr = generic_calibrate_decr, .progress = pSeries_progress, .check_legacy_ioport = pSeries_check_legacy_ioport, diff --git a/arch/ppc64/kernel/rtc.c b/arch/ppc64/kernel/rtc.c index de02aedbe080..d729fefa0df5 100644 --- a/arch/ppc64/kernel/rtc.c +++ b/arch/ppc64/kernel/rtc.c @@ -301,7 +301,7 @@ void iSeries_get_boot_time(struct rtc_time *tm) #ifdef CONFIG_PPC_RTAS #define MAX_RTC_WAIT 5000 /* 5 sec */ #define RTAS_CLOCK_BUSY (-2) -void pSeries_get_boot_time(struct rtc_time *rtc_tm) +void rtas_get_boot_time(struct rtc_time *rtc_tm) { int ret[8]; int error, wait_time; @@ -336,7 +336,7 @@ void pSeries_get_boot_time(struct rtc_time *rtc_tm) * and if a delay is needed to read the clock. In this case we just * silently return without updating rtc_tm.
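* (A busy clock is reported as the RTAS_CLOCK_BUSY status, -2, defined
* above; extended busy statuses 9900..9905 carry a hinted retry delay,
* cf. rtas_extended_busy_delay_time() in rtas.h.)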
*/ -void pSeries_get_rtc_time(struct rtc_time *rtc_tm) +void rtas_get_rtc_time(struct rtc_time *rtc_tm) { int ret[8]; int error, wait_time; @@ -371,7 +371,7 @@ void pSeries_get_rtc_time(struct rtc_time *rtc_tm) rtc_tm->tm_year = ret[0] - 1900; } -int pSeries_set_rtc_time(struct rtc_time *tm) +int rtas_set_rtc_time(struct rtc_time *tm) { int error, wait_time; unsigned long max_wait_tb; diff --git a/include/asm-ppc64/rtas.h b/include/asm-ppc64/rtas.h index a8ab0e9db84a..c46ceb2ffc12 100644 --- a/include/asm-ppc64/rtas.h +++ b/include/asm-ppc64/rtas.h @@ -188,6 +188,11 @@ extern int rtas_set_power_level(int powerdomain, int level, int *setlevel); extern int rtas_set_indicator(int indicator, int index, int new_value); extern void rtas_initialize(void); +struct rtc_time; +extern void rtas_get_boot_time(struct rtc_time *rtc_time); +extern void rtas_get_rtc_time(struct rtc_time *rtc_time); +extern int rtas_set_rtc_time(struct rtc_time *rtc_time); + /* Given an RTAS status code of 9900..9905 compute the hinted delay */ unsigned int rtas_extended_busy_delay_time(int status); static inline int rtas_is_extended_busy(int status) From c5a3c2e52af1bcb118022ffac9a0fd1d42d43bd3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Jun 2005 09:43:23 +1000 Subject: [PATCH 011/199] [PATCH] ppc64: Split out generic rtas code from pSeries_pci.c. BPA is using rtas for PCI but should not be confused by pSeries code. This also avoids some #ifdefs. Other platforms that want to use rtas_pci.c could create their own platform_pci.c with platform specific fixups. Signed-off-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- arch/ppc64/kernel/Makefile | 3 +- arch/ppc64/kernel/mpic.h | 3 + arch/ppc64/kernel/pSeries_pci.c | 476 +----------------------------- arch/ppc64/kernel/rtas_pci.c | 495 ++++++++++++++++++++++++++++++++ 4 files changed, 506 insertions(+), 471 deletions(-) create mode 100644 arch/ppc64/kernel/rtas_pci.c diff --git a/arch/ppc64/kernel/Makefile b/arch/ppc64/kernel/Makefile index b5e167cf1a05..f389f2453daa 100644 --- a/arch/ppc64/kernel/Makefile +++ b/arch/ppc64/kernel/Makefile @@ -31,13 +31,14 @@ obj-$(CONFIG_PPC_MULTIPLATFORM) += nvram.o i8259.o prom_init.o prom.o mpic.o obj-$(CONFIG_PPC_PSERIES) += pSeries_pci.o pSeries_lpar.o pSeries_hvCall.o \ pSeries_nvram.o rtasd.o ras.o pSeries_reconfig.o \ - xics.o rtas.o pSeries_setup.o pSeries_iommu.o + xics.o pSeries_setup.o pSeries_iommu.o obj-$(CONFIG_EEH) += eeh.o obj-$(CONFIG_PROC_FS) += proc_ppc64.o obj-$(CONFIG_RTAS_FLASH) += rtas_flash.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_MODULES) += module.o ppc_ksyms.o +obj-$(CONFIG_PPC_RTAS) += rtas.o rtas_pci.o obj-$(CONFIG_RTAS_PROC) += rtas-proc.o obj-$(CONFIG_SCANLOG) += scanlog.o obj-$(CONFIG_VIOPATH) += viopath.o diff --git a/arch/ppc64/kernel/mpic.h b/arch/ppc64/kernel/mpic.h index 571b3c99e062..63e177143eac 100644 --- a/arch/ppc64/kernel/mpic.h +++ b/arch/ppc64/kernel/mpic.h @@ -265,3 +265,6 @@ extern void mpic_send_ipi(unsigned int ipi_no, unsigned int cpu_mask); extern int mpic_get_one_irq(struct mpic *mpic, struct pt_regs *regs); /* This one gets to the primary mpic */ extern int mpic_get_irq(struct pt_regs *regs); + +/* global mpic for pSeries */ +extern struct mpic *pSeries_mpic; diff --git a/arch/ppc64/kernel/pSeries_pci.c b/arch/ppc64/kernel/pSeries_pci.c index 0b1cca281408..dfa6d3d3e9f0 100644 --- a/arch/ppc64/kernel/pSeries_pci.c +++ b/arch/ppc64/kernel/pSeries_pci.c @@ -1,13 +1,11 @@ /* - * pSeries_pci.c + * arch/ppc64/kernel/pSeries_pci.c * * Copyright (C) 2001 Dave Engebretsen, IBM 
Corporation * Copyright (C) 2003 Anton Blanchard , IBM * * pSeries specific routines for PCI. * - * Based on code from pci.c and chrp_pci.c - * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -23,430 +21,18 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include +#include #include -#include #include #include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include +#include -#include "mpic.h" #include "pci.h" -/* RTAS tokens */ -static int read_pci_config; -static int write_pci_config; -static int ibm_read_pci_config; -static int ibm_write_pci_config; - -static int s7a_workaround; - -extern struct mpic *pSeries_mpic; - -static int config_access_valid(struct device_node *dn, int where) -{ - if (where < 256) - return 1; - if (where < 4096 && dn->pci_ext_config_space) - return 1; - - return 0; -} - -static int rtas_read_config(struct device_node *dn, int where, int size, u32 *val) -{ - int returnval = -1; - unsigned long buid, addr; - int ret; - - if (!dn) - return PCIBIOS_DEVICE_NOT_FOUND; - if (!config_access_valid(dn, where)) - return PCIBIOS_BAD_REGISTER_NUMBER; - - addr = ((where & 0xf00) << 20) | (dn->busno << 16) | - (dn->devfn << 8) | (where & 0xff); - buid = dn->phb->buid; - if (buid) { - ret = rtas_call(ibm_read_pci_config, 4, 2, &returnval, - addr, buid >> 32, buid & 0xffffffff, size); - } else { - ret = rtas_call(read_pci_config, 2, 2, &returnval, addr, size); - } - *val = returnval; - - if (ret) - return PCIBIOS_DEVICE_NOT_FOUND; - - if (returnval == EEH_IO_ERROR_VALUE(size) - && eeh_dn_check_failure (dn, NULL)) - return PCIBIOS_DEVICE_NOT_FOUND; - - return PCIBIOS_SUCCESSFUL; -} - -static int rtas_pci_read_config(struct pci_bus *bus, - unsigned int devfn, - int where, int size, u32 *val) -{ - struct device_node *busdn, *dn; - - if (bus->self) - busdn = pci_device_to_OF_node(bus->self); - else - busdn = bus->sysdata; /* must be a phb */ - - /* Search only direct children of the bus */ - for (dn = busdn->child; dn; dn = dn->sibling) - if (dn->devfn == devfn) - return rtas_read_config(dn, where, size, val); - return PCIBIOS_DEVICE_NOT_FOUND; -} - -static int rtas_write_config(struct device_node *dn, int where, int size, u32 val) -{ - unsigned long buid, addr; - int ret; - - if (!dn) - return PCIBIOS_DEVICE_NOT_FOUND; - if (!config_access_valid(dn, where)) - return PCIBIOS_BAD_REGISTER_NUMBER; - - addr = ((where & 0xf00) << 20) | (dn->busno << 16) | - (dn->devfn << 8) | (where & 0xff); - buid = dn->phb->buid; - if (buid) { - ret = rtas_call(ibm_write_pci_config, 5, 1, NULL, addr, buid >> 32, buid & 0xffffffff, size, (ulong) val); - } else { - ret = rtas_call(write_pci_config, 3, 1, NULL, addr, size, (ulong)val); - } - - if (ret) - return PCIBIOS_DEVICE_NOT_FOUND; - - return PCIBIOS_SUCCESSFUL; -} - -static int rtas_pci_write_config(struct pci_bus *bus, - unsigned int devfn, - int where, int size, u32 val) -{ - struct device_node *busdn, *dn; - - if (bus->self) - busdn = pci_device_to_OF_node(bus->self); - else - busdn = bus->sysdata; /* must be a phb */ - - /* Search only direct children of the bus */ - for (dn = busdn->child; dn; dn = dn->sibling) - if (dn->devfn == devfn) - return rtas_write_config(dn, where, size, val); - return PCIBIOS_DEVICE_NOT_FOUND; -} - -struct pci_ops rtas_pci_ops = { - rtas_pci_read_config, - rtas_pci_write_config -}; 
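The packed config-space address that rtas_read_config() and rtas_write_config() above hand to the RTAS call is worth a worked example (the values here are illustrative, not taken from the patch):

        /* Register 0x164 (extended config space) of bus 0x40,
         * devfn 0x08 (device 1, function 0):
         *
         *   ((0x164 & 0xf00) << 20) = 0x10000000   high nibble of the offset
         * | (0x40 << 16)            = 0x00400000   bus number
         * | (0x08 << 8)             = 0x00000800   devfn
         * | (0x164 & 0xff)          = 0x00000064   low byte of the offset
         *
         *   addr                    = 0x10400864
         */

Offsets above 0xff are only accepted when the device node has pci_ext_config_space set, as config_access_valid() checks.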
- -int is_python(struct device_node *dev) -{ - char *model = (char *)get_property(dev, "model", NULL); - - if (model && strstr(model, "Python")) - return 1; - - return 0; -} - -static int get_phb_reg_prop(struct device_node *dev, - unsigned int addr_size_words, - struct reg_property64 *reg) -{ - unsigned int *ui_ptr = NULL, len; - - /* Found a PHB, now figure out where his registers are mapped. */ - ui_ptr = (unsigned int *)get_property(dev, "reg", &len); - if (ui_ptr == NULL) - return 1; - - if (addr_size_words == 1) { - reg->address = ((struct reg_property32 *)ui_ptr)->address; - reg->size = ((struct reg_property32 *)ui_ptr)->size; - } else { - *reg = *((struct reg_property64 *)ui_ptr); - } - - return 0; -} - -static void python_countermeasures(struct device_node *dev, - unsigned int addr_size_words) -{ - struct reg_property64 reg_struct; - void __iomem *chip_regs; - volatile u32 val; - - if (get_phb_reg_prop(dev, addr_size_words, ®_struct)) - return; - - /* Python's register file is 1 MB in size. */ - chip_regs = ioremap(reg_struct.address & ~(0xfffffUL), 0x100000); - - /* - * Firmware doesn't always clear this bit which is critical - * for good performance - Anton - */ - -#define PRG_CL_RESET_VALID 0x00010000 - - val = in_be32(chip_regs + 0xf6030); - if (val & PRG_CL_RESET_VALID) { - printk(KERN_INFO "Python workaround: "); - val &= ~PRG_CL_RESET_VALID; - out_be32(chip_regs + 0xf6030, val); - /* - * We must read it back for changes to - * take effect - */ - val = in_be32(chip_regs + 0xf6030); - printk("reg0: %x\n", val); - } - - iounmap(chip_regs); -} - -void __init init_pci_config_tokens (void) -{ - read_pci_config = rtas_token("read-pci-config"); - write_pci_config = rtas_token("write-pci-config"); - ibm_read_pci_config = rtas_token("ibm,read-pci-config"); - ibm_write_pci_config = rtas_token("ibm,write-pci-config"); -} - -unsigned long __devinit get_phb_buid (struct device_node *phb) -{ - int addr_cells; - unsigned int *buid_vals; - unsigned int len; - unsigned long buid; - - if (ibm_read_pci_config == -1) return 0; - - /* PHB's will always be children of the root node, - * or so it is promised by the current firmware. 
*/ - if (phb->parent == NULL) - return 0; - if (phb->parent->parent) - return 0; - - buid_vals = (unsigned int *) get_property(phb, "reg", &len); - if (buid_vals == NULL) - return 0; - - addr_cells = prom_n_addr_cells(phb); - if (addr_cells == 1) { - buid = (unsigned long) buid_vals[0]; - } else { - buid = (((unsigned long)buid_vals[0]) << 32UL) | - (((unsigned long)buid_vals[1]) & 0xffffffff); - } - return buid; -} - -static int phb_set_bus_ranges(struct device_node *dev, - struct pci_controller *phb) -{ - int *bus_range; - unsigned int len; - - bus_range = (int *) get_property(dev, "bus-range", &len); - if (bus_range == NULL || len < 2 * sizeof(int)) { - return 1; - } - - phb->first_busno = bus_range[0]; - phb->last_busno = bus_range[1]; - - return 0; -} - -static int __devinit setup_phb(struct device_node *dev, - struct pci_controller *phb, - unsigned int addr_size_words) -{ - pci_setup_pci_controller(phb); - - if (is_python(dev)) - python_countermeasures(dev, addr_size_words); - - if (phb_set_bus_ranges(dev, phb)) - return 1; - - phb->arch_data = dev; - phb->ops = &rtas_pci_ops; - phb->buid = get_phb_buid(dev); - - return 0; -} - -static void __devinit add_linux_pci_domain(struct device_node *dev, - struct pci_controller *phb, - struct property *of_prop) -{ - memset(of_prop, 0, sizeof(struct property)); - of_prop->name = "linux,pci-domain"; - of_prop->length = sizeof(phb->global_number); - of_prop->value = (unsigned char *)&of_prop[1]; - memcpy(of_prop->value, &phb->global_number, sizeof(phb->global_number)); - prom_add_property(dev, of_prop); -} - -static struct pci_controller * __init alloc_phb(struct device_node *dev, - unsigned int addr_size_words) -{ - struct pci_controller *phb; - struct property *of_prop; - - phb = alloc_bootmem(sizeof(struct pci_controller)); - if (phb == NULL) - return NULL; - - of_prop = alloc_bootmem(sizeof(struct property) + - sizeof(phb->global_number)); - if (!of_prop) - return NULL; - - if (setup_phb(dev, phb, addr_size_words)) - return NULL; - - add_linux_pci_domain(dev, phb, of_prop); - - return phb; -} - -static struct pci_controller * __devinit alloc_phb_dynamic(struct device_node *dev, unsigned int addr_size_words) -{ - struct pci_controller *phb; - - phb = (struct pci_controller *)kmalloc(sizeof(struct pci_controller), - GFP_KERNEL); - if (phb == NULL) - return NULL; - - if (setup_phb(dev, phb, addr_size_words)) - return NULL; - - phb->is_dynamic = 1; - - /* TODO: linux,pci-domain? 
*/ - - return phb; -} - -unsigned long __init find_and_init_phbs(void) -{ - struct device_node *node; - struct pci_controller *phb; - unsigned int root_size_cells = 0; - unsigned int index; - unsigned int *opprop = NULL; - struct device_node *root = of_find_node_by_path("/"); - - if (ppc64_interrupt_controller == IC_OPEN_PIC) { - opprop = (unsigned int *)get_property(root, - "platform-open-pic", NULL); - } - - root_size_cells = prom_n_size_cells(root); - - index = 0; - - for (node = of_get_next_child(root, NULL); - node != NULL; - node = of_get_next_child(root, node)) { - if (node->type == NULL || strcmp(node->type, "pci") != 0) - continue; - - phb = alloc_phb(node, root_size_cells); - if (!phb) - continue; - - pci_process_bridge_OF_ranges(phb, node); - pci_setup_phb_io(phb, index == 0); - - if (ppc64_interrupt_controller == IC_OPEN_PIC && pSeries_mpic) { - int addr = root_size_cells * (index + 2) - 1; - mpic_assign_isu(pSeries_mpic, index, opprop[addr]); - } - - index++; - } - - of_node_put(root); - pci_devs_phb_init(); - - /* - * pci_probe_only and pci_assign_all_buses can be set via properties - * in chosen. - */ - if (of_chosen) { - int *prop; - - prop = (int *)get_property(of_chosen, "linux,pci-probe-only", - NULL); - if (prop) - pci_probe_only = *prop; - - prop = (int *)get_property(of_chosen, - "linux,pci-assign-all-buses", NULL); - if (prop) - pci_assign_all_buses = *prop; - } - - return 0; -} - -struct pci_controller * __devinit init_phb_dynamic(struct device_node *dn) -{ - struct device_node *root = of_find_node_by_path("/"); - unsigned int root_size_cells = 0; - struct pci_controller *phb; - struct pci_bus *bus; - int primary; - - root_size_cells = prom_n_size_cells(root); - - primary = list_empty(&hose_list); - phb = alloc_phb_dynamic(dn, root_size_cells); - if (!phb) - return NULL; - - pci_process_bridge_OF_ranges(phb, dn); - - pci_setup_phb_io_dynamic(phb, primary); - of_node_put(root); - - pci_devs_phb_init_dynamic(phb); - phb->last_busno = 0xff; - bus = pci_scan_bus(phb->first_busno, phb->ops, phb->arch_data); - phb->bus = bus; - phb->last_busno = bus->subordinate; - - return phb; -} -EXPORT_SYMBOL(init_phb_dynamic); +static int __initdata s7a_workaround; #if 0 void pcibios_name_device(struct pci_dev *dev) @@ -474,7 +60,7 @@ void pcibios_name_device(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pcibios_name_device); #endif -static void check_s7a(void) +static void __init check_s7a(void) { struct device_node *root; char *model; @@ -488,56 +74,6 @@ static void check_s7a(void) } } -/* RPA-specific bits for removing PHBs */ -int pcibios_remove_root_bus(struct pci_controller *phb) -{ - struct pci_bus *b = phb->bus; - struct resource *res; - int rc, i; - - res = b->resource[0]; - if (!res->flags) { - printk(KERN_ERR "%s: no IO resource for PHB %s\n", __FUNCTION__, - b->name); - return 1; - } - - rc = unmap_bus_range(b); - if (rc) { - printk(KERN_ERR "%s: failed to unmap IO on bus %s\n", - __FUNCTION__, b->name); - return 1; - } - - if (release_resource(res)) { - printk(KERN_ERR "%s: failed to release IO on bus %s\n", - __FUNCTION__, b->name); - return 1; - } - - for (i = 1; i < 3; ++i) { - res = b->resource[i]; - if (!res->flags && i == 0) { - printk(KERN_ERR "%s: no MEM resource for PHB %s\n", - __FUNCTION__, b->name); - return 1; - } - if (res->flags && release_resource(res)) { - printk(KERN_ERR - "%s: failed to release IO %d on bus %s\n", - __FUNCTION__, i, b->name); - return 1; - } - } - - list_del(&phb->list_node); - if (phb->is_dynamic) - kfree(phb); 
- - return 0; -} -EXPORT_SYMBOL(pcibios_remove_root_bus); - static void __init pSeries_request_regions(void) { if (!isa_io_base) diff --git a/arch/ppc64/kernel/rtas_pci.c b/arch/ppc64/kernel/rtas_pci.c new file mode 100644 index 000000000000..1048817befb8 --- /dev/null +++ b/arch/ppc64/kernel/rtas_pci.c @@ -0,0 +1,495 @@ +/* + * arch/ppc64/kernel/rtas_pci.c + * + * Copyright (C) 2001 Dave Engebretsen, IBM Corporation + * Copyright (C) 2003 Anton Blanchard , IBM + * + * RTAS specific routines for PCI. + * + * Based on code from pci.c, chrp_pci.c and pSeries_pci.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mpic.h" +#include "pci.h" + +/* RTAS tokens */ +static int read_pci_config; +static int write_pci_config; +static int ibm_read_pci_config; +static int ibm_write_pci_config; + +static int config_access_valid(struct device_node *dn, int where) +{ + if (where < 256) + return 1; + if (where < 4096 && dn->pci_ext_config_space) + return 1; + + return 0; +} + +static int rtas_read_config(struct device_node *dn, int where, int size, u32 *val) +{ + int returnval = -1; + unsigned long buid, addr; + int ret; + + if (!dn) + return PCIBIOS_DEVICE_NOT_FOUND; + if (!config_access_valid(dn, where)) + return PCIBIOS_BAD_REGISTER_NUMBER; + + addr = ((where & 0xf00) << 20) | (dn->busno << 16) | + (dn->devfn << 8) | (where & 0xff); + buid = dn->phb->buid; + if (buid) { + ret = rtas_call(ibm_read_pci_config, 4, 2, &returnval, + addr, buid >> 32, buid & 0xffffffff, size); + } else { + ret = rtas_call(read_pci_config, 2, 2, &returnval, addr, size); + } + *val = returnval; + + if (ret) + return PCIBIOS_DEVICE_NOT_FOUND; + + if (returnval == EEH_IO_ERROR_VALUE(size) + && eeh_dn_check_failure (dn, NULL)) + return PCIBIOS_DEVICE_NOT_FOUND; + + return PCIBIOS_SUCCESSFUL; +} + +static int rtas_pci_read_config(struct pci_bus *bus, + unsigned int devfn, + int where, int size, u32 *val) +{ + struct device_node *busdn, *dn; + + if (bus->self) + busdn = pci_device_to_OF_node(bus->self); + else + busdn = bus->sysdata; /* must be a phb */ + + /* Search only direct children of the bus */ + for (dn = busdn->child; dn; dn = dn->sibling) + if (dn->devfn == devfn) + return rtas_read_config(dn, where, size, val); + return PCIBIOS_DEVICE_NOT_FOUND; +} + +static int rtas_write_config(struct device_node *dn, int where, int size, u32 val) +{ + unsigned long buid, addr; + int ret; + + if (!dn) + return PCIBIOS_DEVICE_NOT_FOUND; + if (!config_access_valid(dn, where)) + return PCIBIOS_BAD_REGISTER_NUMBER; + + addr = ((where & 0xf00) << 20) | (dn->busno << 16) | + (dn->devfn << 8) | (where & 0xff); + buid = dn->phb->buid; + if (buid) { + ret = rtas_call(ibm_write_pci_config, 5, 1, NULL, addr, buid >> 32, buid 
& 0xffffffff, size, (ulong) val); + } else { + ret = rtas_call(write_pci_config, 3, 1, NULL, addr, size, (ulong)val); + } + + if (ret) + return PCIBIOS_DEVICE_NOT_FOUND; + + return PCIBIOS_SUCCESSFUL; +} + +static int rtas_pci_write_config(struct pci_bus *bus, + unsigned int devfn, + int where, int size, u32 val) +{ + struct device_node *busdn, *dn; + + if (bus->self) + busdn = pci_device_to_OF_node(bus->self); + else + busdn = bus->sysdata; /* must be a phb */ + + /* Search only direct children of the bus */ + for (dn = busdn->child; dn; dn = dn->sibling) + if (dn->devfn == devfn) + return rtas_write_config(dn, where, size, val); + return PCIBIOS_DEVICE_NOT_FOUND; +} + +struct pci_ops rtas_pci_ops = { + rtas_pci_read_config, + rtas_pci_write_config +}; + +int is_python(struct device_node *dev) +{ + char *model = (char *)get_property(dev, "model", NULL); + + if (model && strstr(model, "Python")) + return 1; + + return 0; +} + +static int get_phb_reg_prop(struct device_node *dev, + unsigned int addr_size_words, + struct reg_property64 *reg) +{ + unsigned int *ui_ptr = NULL, len; + + /* Found a PHB, now figure out where his registers are mapped. */ + ui_ptr = (unsigned int *)get_property(dev, "reg", &len); + if (ui_ptr == NULL) + return 1; + + if (addr_size_words == 1) { + reg->address = ((struct reg_property32 *)ui_ptr)->address; + reg->size = ((struct reg_property32 *)ui_ptr)->size; + } else { + *reg = *((struct reg_property64 *)ui_ptr); + } + + return 0; +} + +static void python_countermeasures(struct device_node *dev, + unsigned int addr_size_words) +{ + struct reg_property64 reg_struct; + void __iomem *chip_regs; + volatile u32 val; + + if (get_phb_reg_prop(dev, addr_size_words, ®_struct)) + return; + + /* Python's register file is 1 MB in size. */ + chip_regs = ioremap(reg_struct.address & ~(0xfffffUL), 0x100000); + + /* + * Firmware doesn't always clear this bit which is critical + * for good performance - Anton + */ + +#define PRG_CL_RESET_VALID 0x00010000 + + val = in_be32(chip_regs + 0xf6030); + if (val & PRG_CL_RESET_VALID) { + printk(KERN_INFO "Python workaround: "); + val &= ~PRG_CL_RESET_VALID; + out_be32(chip_regs + 0xf6030, val); + /* + * We must read it back for changes to + * take effect + */ + val = in_be32(chip_regs + 0xf6030); + printk("reg0: %x\n", val); + } + + iounmap(chip_regs); +} + +void __init init_pci_config_tokens (void) +{ + read_pci_config = rtas_token("read-pci-config"); + write_pci_config = rtas_token("write-pci-config"); + ibm_read_pci_config = rtas_token("ibm,read-pci-config"); + ibm_write_pci_config = rtas_token("ibm,write-pci-config"); +} + +unsigned long __devinit get_phb_buid (struct device_node *phb) +{ + int addr_cells; + unsigned int *buid_vals; + unsigned int len; + unsigned long buid; + + if (ibm_read_pci_config == -1) return 0; + + /* PHB's will always be children of the root node, + * or so it is promised by the current firmware. 
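+ * (Hence the two checks below: a node with no parent is the root
+ * itself, and a node whose parent has a parent sits too deep in the
+ * tree to be a PHB; both cases yield a zero BUID.)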
*/ + if (phb->parent == NULL) + return 0; + if (phb->parent->parent) + return 0; + + buid_vals = (unsigned int *) get_property(phb, "reg", &len); + if (buid_vals == NULL) + return 0; + + addr_cells = prom_n_addr_cells(phb); + if (addr_cells == 1) { + buid = (unsigned long) buid_vals[0]; + } else { + buid = (((unsigned long)buid_vals[0]) << 32UL) | + (((unsigned long)buid_vals[1]) & 0xffffffff); + } + return buid; +} + +static int phb_set_bus_ranges(struct device_node *dev, + struct pci_controller *phb) +{ + int *bus_range; + unsigned int len; + + bus_range = (int *) get_property(dev, "bus-range", &len); + if (bus_range == NULL || len < 2 * sizeof(int)) { + return 1; + } + + phb->first_busno = bus_range[0]; + phb->last_busno = bus_range[1]; + + return 0; +} + +static int __devinit setup_phb(struct device_node *dev, + struct pci_controller *phb, + unsigned int addr_size_words) +{ + pci_setup_pci_controller(phb); + + if (is_python(dev)) + python_countermeasures(dev, addr_size_words); + + if (phb_set_bus_ranges(dev, phb)) + return 1; + + phb->arch_data = dev; + phb->ops = &rtas_pci_ops; + phb->buid = get_phb_buid(dev); + + return 0; +} + +static void __devinit add_linux_pci_domain(struct device_node *dev, + struct pci_controller *phb, + struct property *of_prop) +{ + memset(of_prop, 0, sizeof(struct property)); + of_prop->name = "linux,pci-domain"; + of_prop->length = sizeof(phb->global_number); + of_prop->value = (unsigned char *)&of_prop[1]; + memcpy(of_prop->value, &phb->global_number, sizeof(phb->global_number)); + prom_add_property(dev, of_prop); +} + +static struct pci_controller * __init alloc_phb(struct device_node *dev, + unsigned int addr_size_words) +{ + struct pci_controller *phb; + struct property *of_prop; + + phb = alloc_bootmem(sizeof(struct pci_controller)); + if (phb == NULL) + return NULL; + + of_prop = alloc_bootmem(sizeof(struct property) + + sizeof(phb->global_number)); + if (!of_prop) + return NULL; + + if (setup_phb(dev, phb, addr_size_words)) + return NULL; + + add_linux_pci_domain(dev, phb, of_prop); + + return phb; +} + +static struct pci_controller * __devinit alloc_phb_dynamic(struct device_node *dev, unsigned int addr_size_words) +{ + struct pci_controller *phb; + + phb = (struct pci_controller *)kmalloc(sizeof(struct pci_controller), + GFP_KERNEL); + if (phb == NULL) + return NULL; + + if (setup_phb(dev, phb, addr_size_words)) + return NULL; + + phb->is_dynamic = 1; + + /* TODO: linux,pci-domain? 
*/ + + return phb; +} + +unsigned long __init find_and_init_phbs(void) +{ + struct device_node *node; + struct pci_controller *phb; + unsigned int root_size_cells = 0; + unsigned int index; + unsigned int *opprop = NULL; + struct device_node *root = of_find_node_by_path("/"); + + if (ppc64_interrupt_controller == IC_OPEN_PIC) { + opprop = (unsigned int *)get_property(root, + "platform-open-pic", NULL); + } + + root_size_cells = prom_n_size_cells(root); + + index = 0; + + for (node = of_get_next_child(root, NULL); + node != NULL; + node = of_get_next_child(root, node)) { + if (node->type == NULL || strcmp(node->type, "pci") != 0) + continue; + + phb = alloc_phb(node, root_size_cells); + if (!phb) + continue; + + pci_process_bridge_OF_ranges(phb, node); + pci_setup_phb_io(phb, index == 0); +#ifdef CONFIG_PPC_PSERIES + if (ppc64_interrupt_controller == IC_OPEN_PIC && pSeries_mpic) { + int addr = root_size_cells * (index + 2) - 1; + mpic_assign_isu(pSeries_mpic, index, opprop[addr]); + } +#endif + index++; + } + + of_node_put(root); + pci_devs_phb_init(); + + /* + * pci_probe_only and pci_assign_all_buses can be set via properties + * in chosen. + */ + if (of_chosen) { + int *prop; + + prop = (int *)get_property(of_chosen, "linux,pci-probe-only", + NULL); + if (prop) + pci_probe_only = *prop; + + prop = (int *)get_property(of_chosen, + "linux,pci-assign-all-buses", NULL); + if (prop) + pci_assign_all_buses = *prop; + } + + return 0; +} + +struct pci_controller * __devinit init_phb_dynamic(struct device_node *dn) +{ + struct device_node *root = of_find_node_by_path("/"); + unsigned int root_size_cells = 0; + struct pci_controller *phb; + struct pci_bus *bus; + int primary; + + root_size_cells = prom_n_size_cells(root); + + primary = list_empty(&hose_list); + phb = alloc_phb_dynamic(dn, root_size_cells); + if (!phb) + return NULL; + + pci_process_bridge_OF_ranges(phb, dn); + + pci_setup_phb_io_dynamic(phb, primary); + of_node_put(root); + + pci_devs_phb_init_dynamic(phb); + phb->last_busno = 0xff; + bus = pci_scan_bus(phb->first_busno, phb->ops, phb->arch_data); + phb->bus = bus; + phb->last_busno = bus->subordinate; + + return phb; +} +EXPORT_SYMBOL(init_phb_dynamic); + +/* RPA-specific bits for removing PHBs */ +int pcibios_remove_root_bus(struct pci_controller *phb) +{ + struct pci_bus *b = phb->bus; + struct resource *res; + int rc, i; + + res = b->resource[0]; + if (!res->flags) { + printk(KERN_ERR "%s: no IO resource for PHB %s\n", __FUNCTION__, + b->name); + return 1; + } + + rc = unmap_bus_range(b); + if (rc) { + printk(KERN_ERR "%s: failed to unmap IO on bus %s\n", + __FUNCTION__, b->name); + return 1; + } + + if (release_resource(res)) { + printk(KERN_ERR "%s: failed to release IO on bus %s\n", + __FUNCTION__, b->name); + return 1; + } + + for (i = 1; i < 3; ++i) { + res = b->resource[i]; + if (!res->flags && i == 0) { + printk(KERN_ERR "%s: no MEM resource for PHB %s\n", + __FUNCTION__, b->name); + return 1; + } + if (res->flags && release_resource(res)) { + printk(KERN_ERR + "%s: failed to release IO %d on bus %s\n", + __FUNCTION__, i, b->name); + return 1; + } + } + + list_del(&phb->list_node); + if (phb->is_dynamic) + kfree(phb); + + return 0; +} +EXPORT_SYMBOL(pcibios_remove_root_bus); From 6566c6f1f18d42affe73ccdd403e290b64d10473 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Jun 2005 09:43:28 +1000 Subject: [PATCH 012/199] [PATCH] ppc64: pSeries_progress -> rtas_progress The pSeries_progress function is called from some places in the rtas code, which may also be used 
by non-pSeries platforms. Though pSeries is currently the only platform type that implements display-character, the code is actually generic enough to be part of the rtas subsystem. I hit a bug here because the generic rtas code tried calling ppc_md.progress, which points to an __init function on most platforms. We could also clear the ppc_md.progress pointer when freeing the init memory to make it more explicit that ppc_md.progress must not be called after bootup. Signed-off-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- arch/ppc64/kernel/pSeries_setup.c | 103 +---------------------------- arch/ppc64/kernel/rtas-proc.c | 4 +- arch/ppc64/kernel/rtas.c | 106 +++++++++++++++++++++++++++++- include/asm-ppc64/rtas.h | 1 + 4 files changed, 108 insertions(+), 106 deletions(-) diff --git a/arch/ppc64/kernel/pSeries_setup.c b/arch/ppc64/kernel/pSeries_setup.c index c5f3ccac7cd1..41e6de2c9158 100644 --- a/arch/ppc64/kernel/pSeries_setup.c +++ b/arch/ppc64/kernel/pSeries_setup.c @@ -375,107 +375,6 @@ static void __init pSeries_init_early(void) } -static void pSeries_progress(char *s, unsigned short hex) -{ - struct device_node *root; - int width, *p; - char *os; - static int display_character, set_indicator; - static int max_width; - static DEFINE_SPINLOCK(progress_lock); - static int pending_newline = 0; /* did last write end with unprinted newline? */ - - if (!rtas.base) - return; - - if (max_width == 0) { - if ((root = find_path_device("/rtas")) && - (p = (unsigned int *)get_property(root, - "ibm,display-line-length", - NULL))) - max_width = *p; - else - max_width = 0x10; - display_character = rtas_token("display-character"); - set_indicator = rtas_token("set-indicator"); - } - - if (display_character == RTAS_UNKNOWN_SERVICE) { - /* use hex display if available */ - if (set_indicator != RTAS_UNKNOWN_SERVICE) - rtas_call(set_indicator, 3, 1, NULL, 6, 0, hex); - return; - } - - spin_lock(&progress_lock); - - /* - * Last write ended with newline, but we didn't print it since - * it would just clear the bottom line of output. Print it now - * instead. - * - * If no newline is pending, print a CR to start output at the - * beginning of the line. - */ - if (pending_newline) { - rtas_call(display_character, 1, 1, NULL, '\r'); - rtas_call(display_character, 1, 1, NULL, '\n'); - pending_newline = 0; - } else { - rtas_call(display_character, 1, 1, NULL, '\r'); - } - - width = max_width; - os = s; - while (*os) { - if (*os == '\n' || *os == '\r') { - /* Blank to end of line. */ - while (width-- > 0) - rtas_call(display_character, 1, 1, NULL, ' '); - - /* If newline is the last character, save it - * until next call to avoid bumping up the - * display output. - */ - if (*os == '\n' && !os[1]) { - pending_newline = 1; - spin_unlock(&progress_lock); - return; - } - - /* RTAS wants CR-LF, not just LF */ - - if (*os == '\n') { - rtas_call(display_character, 1, 1, NULL, '\r'); - rtas_call(display_character, 1, 1, NULL, '\n'); - } else { - /* CR might be used to re-draw a line, so we'll - * leave it alone and not add LF. - */ - rtas_call(display_character, 1, 1, NULL, *os); - } - - width = max_width; - } else { - width--; - rtas_call(display_character, 1, 1, NULL, *os); - } - - os++; - - /* if we overwrite the screen length */ - if (width <= 0) - while ((*os != 0) && (*os != '\n') && (*os != '\r')) - os++; - } - - /* Blank to end of line. 
*/ - while (width-- > 0) - rtas_call(display_character, 1, 1, NULL, ' '); - - spin_unlock(&progress_lock); -} - static int pSeries_check_legacy_ioport(unsigned int baseport) { struct device_node *np; @@ -535,7 +434,7 @@ struct machdep_calls __initdata pSeries_md = { .get_rtc_time = rtas_get_rtc_time, .set_rtc_time = rtas_set_rtc_time, .calibrate_decr = generic_calibrate_decr, - .progress = pSeries_progress, + .progress = rtas_progress, .check_legacy_ioport = pSeries_check_legacy_ioport, .system_reset_exception = pSeries_system_reset_exception, .machine_check_exception = pSeries_machine_check_exception, diff --git a/arch/ppc64/kernel/rtas-proc.c b/arch/ppc64/kernel/rtas-proc.c index 28b1f1521f21..1f3ff860fdf0 100644 --- a/arch/ppc64/kernel/rtas-proc.c +++ b/arch/ppc64/kernel/rtas-proc.c @@ -371,11 +371,11 @@ static ssize_t ppc_rtas_progress_write(struct file *file, /* Lets see if the user passed hexdigits */ hex = simple_strtoul(progress_led, NULL, 10); - ppc_md.progress ((char *)progress_led, hex); + rtas_progress ((char *)progress_led, hex); return count; /* clear the line */ - /* ppc_md.progress(" ", 0xffff);*/ + /* rtas_progress(" ", 0xffff);*/ } /* ****************************************************************** */ static int ppc_rtas_progress_show(struct seq_file *m, void *v) diff --git a/arch/ppc64/kernel/rtas.c b/arch/ppc64/kernel/rtas.c index 5575603def27..43e1518653d5 100644 --- a/arch/ppc64/kernel/rtas.c +++ b/arch/ppc64/kernel/rtas.c @@ -91,6 +91,108 @@ call_rtas_display_status_delay(unsigned char c) } } +void +rtas_progress(char *s, unsigned short hex) +{ + struct device_node *root; + int width, *p; + char *os; + static int display_character, set_indicator; + static int max_width; + static DEFINE_SPINLOCK(progress_lock); + static int pending_newline = 0; /* did last write end with unprinted newline? */ + + if (!rtas.base) + return; + + if (max_width == 0) { + if ((root = find_path_device("/rtas")) && + (p = (unsigned int *)get_property(root, + "ibm,display-line-length", + NULL))) + max_width = *p; + else + max_width = 0x10; + display_character = rtas_token("display-character"); + set_indicator = rtas_token("set-indicator"); + } + + if (display_character == RTAS_UNKNOWN_SERVICE) { + /* use hex display if available */ + if (set_indicator != RTAS_UNKNOWN_SERVICE) + rtas_call(set_indicator, 3, 1, NULL, 6, 0, hex); + return; + } + + spin_lock(&progress_lock); + + /* + * Last write ended with newline, but we didn't print it since + * it would just clear the bottom line of output. Print it now + * instead. + * + * If no newline is pending, print a CR to start output at the + * beginning of the line. + */ + if (pending_newline) { + rtas_call(display_character, 1, 1, NULL, '\r'); + rtas_call(display_character, 1, 1, NULL, '\n'); + pending_newline = 0; + } else { + rtas_call(display_character, 1, 1, NULL, '\r'); + } + + width = max_width; + os = s; + while (*os) { + if (*os == '\n' || *os == '\r') { + /* Blank to end of line. */ + while (width-- > 0) + rtas_call(display_character, 1, 1, NULL, ' '); + + /* If newline is the last character, save it + * until next call to avoid bumping up the + * display output. + */ + if (*os == '\n' && !os[1]) { + pending_newline = 1; + spin_unlock(&progress_lock); + return; + } + + /* RTAS wants CR-LF, not just LF */ + + if (*os == '\n') { + rtas_call(display_character, 1, 1, NULL, '\r'); + rtas_call(display_character, 1, 1, NULL, '\n'); + } else { + /* CR might be used to re-draw a line, so we'll + * leave it alone and not add LF. 
+ */ + rtas_call(display_character, 1, 1, NULL, *os); + } + + width = max_width; + } else { + width--; + rtas_call(display_character, 1, 1, NULL, *os); + } + + os++; + + /* if we overwrite the screen length */ + if (width <= 0) + while ((*os != 0) && (*os != '\n') && (*os != '\r')) + os++; + } + + /* Blank to end of line. */ + while (width-- > 0) + rtas_call(display_character, 1, 1, NULL, ' '); + + spin_unlock(&progress_lock); +} + int rtas_token(const char *service) { @@ -425,8 +527,8 @@ rtas_flash_firmware(void) printk(KERN_ALERT "FLASH: flash image is %ld bytes\n", image_size); printk(KERN_ALERT "FLASH: performing flash and reboot\n"); - ppc_md.progress("Flashing \n", 0x0); - ppc_md.progress("Please Wait... ", 0x0); + rtas_progress("Flashing \n", 0x0); + rtas_progress("Please Wait... ", 0x0); printk(KERN_ALERT "FLASH: this will take several minutes. Do not power off!\n"); status = rtas_call(update_token, 1, 1, NULL, rtas_block_list); switch (status) { /* should only get "bad" status */ diff --git a/include/asm-ppc64/rtas.h b/include/asm-ppc64/rtas.h index c46ceb2ffc12..e7d1b5222802 100644 --- a/include/asm-ppc64/rtas.h +++ b/include/asm-ppc64/rtas.h @@ -186,6 +186,7 @@ extern int rtas_get_sensor(int sensor, int index, int *state); extern int rtas_get_power_level(int powerdomain, int *level); extern int rtas_set_power_level(int powerdomain, int level, int *setlevel); extern int rtas_set_indicator(int indicator, int index, int new_value); +extern void rtas_progress(char *s, unsigned short hex); extern void rtas_initialize(void); struct rtc_time; From 5f5b4e669a59be1cf8fc9d6d04ff1ccad8ab6de0 Mon Sep 17 00:00:00 2001 From: Utz Bacher Date: Thu, 23 Jun 2005 09:43:31 +1000 Subject: [PATCH 013/199] [PATCH] ppc64: add a minimal nvram driver The firmware provides the location and size of the nvram in the device tree, so it does not really contain any hardware specific bits and could be used on other machines as well. Signed-off-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- arch/ppc64/kernel/bpa_nvram.c | 118 ++++++++++++++++++++++++++++++++++ include/asm-ppc64/nvram.h | 1 + 2 files changed, 119 insertions(+) create mode 100644 arch/ppc64/kernel/bpa_nvram.c diff --git a/arch/ppc64/kernel/bpa_nvram.c b/arch/ppc64/kernel/bpa_nvram.c new file mode 100644 index 000000000000..06a119cfceb5 --- /dev/null +++ b/arch/ppc64/kernel/bpa_nvram.c @@ -0,0 +1,118 @@ +/* + * NVRAM for CPBW + * + * (C) Copyright IBM Corp. 2005 + * + * Authors : Utz Bacher + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +static void __iomem *bpa_nvram_start; +static long bpa_nvram_len; +static spinlock_t bpa_nvram_lock = SPIN_LOCK_UNLOCKED; + +static ssize_t bpa_nvram_read(char *buf, size_t count, loff_t *index) +{ + unsigned long flags; + + if (*index >= bpa_nvram_len) + return 0; + if (*index + count > bpa_nvram_len) + count = bpa_nvram_len - *index; + + spin_lock_irqsave(&bpa_nvram_lock, flags); + + memcpy_fromio(buf, bpa_nvram_start + *index, count); + + spin_unlock_irqrestore(&bpa_nvram_lock, flags); + + *index += count; + return count; +} + +static ssize_t bpa_nvram_write(char *buf, size_t count, loff_t *index) +{ + unsigned long flags; + + if (*index >= bpa_nvram_len) + return 0; + if (*index + count > bpa_nvram_len) + count = bpa_nvram_len - *index; + + spin_lock_irqsave(&bpa_nvram_lock, flags); + + memcpy_toio(bpa_nvram_start + *index, buf, count); + + spin_unlock_irqrestore(&bpa_nvram_lock, flags); + + *index += count; + return count; +} + +static ssize_t bpa_nvram_get_size(void) +{ + return bpa_nvram_len; +} + +int __init bpa_nvram_init(void) +{ + struct device_node *nvram_node; + unsigned long *buffer; + int proplen; + unsigned long nvram_addr; + int ret; + + ret = -ENODEV; + nvram_node = of_find_node_by_type(NULL, "nvram"); + if (!nvram_node) + goto out; + + ret = -EIO; + buffer = (unsigned long *)get_property(nvram_node, "reg", &proplen); + if (proplen != 2*sizeof(unsigned long)) + goto out; + + ret = -ENODEV; + nvram_addr = buffer[0]; + bpa_nvram_len = buffer[1]; + if ( (!bpa_nvram_len) || (!nvram_addr) ) + goto out; + + bpa_nvram_start = ioremap(nvram_addr, bpa_nvram_len); + if (!bpa_nvram_start) + goto out; + + printk(KERN_INFO "BPA NVRAM, %luk mapped to %p\n", + bpa_nvram_len >> 10, bpa_nvram_start); + + ppc_md.nvram_read = bpa_nvram_read; + ppc_md.nvram_write = bpa_nvram_write; + ppc_md.nvram_size = bpa_nvram_get_size; + +out: + of_node_put(nvram_node); + return ret; +} diff --git a/include/asm-ppc64/nvram.h b/include/asm-ppc64/nvram.h index 4e6dd370d936..dfaa21566c9a 100644 --- a/include/asm-ppc64/nvram.h +++ b/include/asm-ppc64/nvram.h @@ -70,6 +70,7 @@ extern struct nvram_partition *nvram_find_partition(int sig, const char *name); extern int pSeries_nvram_init(void); extern int pmac_nvram_init(void); +extern int bpa_nvram_init(void); /* PowerMac specific nvram stuffs */ From 031f7edecf46d731673a5dd19ecb0de38f1a2219 Mon Sep 17 00:00:00 2001 From: Utz Bacher Date: Thu, 23 Jun 2005 09:43:34 +1000 Subject: [PATCH 014/199] [PATCH] ppc64: add a watchdog driver for rtas Add a watchdog using the RTAS OS surveillance service. This is provided as a simpler alternative to rtasd. The added value is that it works with standard watchdog client programs and can therefore also do user space monitoring. On BPA, rtasd is not really useful because the hardware does not have much to report with event-scan. The driver should also work on other platforms that support the OS surveillance rtas calls. 
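To illustrate the user-space side (a minimal sketch, not part of this patch; the
timeout and loop count are arbitrary example values), a client only needs the
standard /dev/watchdog protocol: negotiate a timeout, ping the device
periodically, and write the magic character 'V' before closing so the driver is
allowed to disarm the timer:

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/watchdog.h>

	int main(void)
	{
		int i, timeout = 120;	/* seconds; wdrtas rounds this up to full minutes */
		int fd = open("/dev/watchdog", O_WRONLY);

		if (fd < 0)
			return 1;
		ioctl(fd, WDIOC_SETTIMEOUT, &timeout);	/* driver writes back the effective value */
		for (i = 0; i < 10; i++) {
			ioctl(fd, WDIOC_KEEPALIVE, 0);	/* reset the timer */
			sleep(timeout / 2);
		}
		write(fd, "V", 1);	/* magic close; ignored when nowayout is set */
		close(fd);
		return 0;
	}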
Signed-off-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- drivers/char/watchdog/Kconfig | 10 + drivers/char/watchdog/Makefile | 1 + drivers/char/watchdog/wdrtas.c | 696 +++++++++++++++++++++++++++++++++ 3 files changed, 707 insertions(+) create mode 100644 drivers/char/watchdog/wdrtas.c diff --git a/drivers/char/watchdog/Kconfig b/drivers/char/watchdog/Kconfig index 06a31da2381c..b53e2e2b5aee 100644 --- a/drivers/char/watchdog/Kconfig +++ b/drivers/char/watchdog/Kconfig @@ -414,6 +414,16 @@ config WATCHDOG_RIO machines. The watchdog timeout period is normally one minute but can be changed with a boot-time parameter. +# ppc64 RTAS watchdog +config WATCHDOG_RTAS + tristate "RTAS watchdog" + depends on WATCHDOG && PPC_RTAS + help + This driver adds watchdog support for the RTAS watchdog. + + To compile this driver as a module, choose M here. The module + will be called wdrtas. + # # ISA-based Watchdog Cards # diff --git a/drivers/char/watchdog/Makefile b/drivers/char/watchdog/Makefile index 1cd27efa35c1..c1838834ea7f 100644 --- a/drivers/char/watchdog/Makefile +++ b/drivers/char/watchdog/Makefile @@ -33,6 +33,7 @@ obj-$(CONFIG_USBPCWATCHDOG) += pcwd_usb.o obj-$(CONFIG_IXP4XX_WATCHDOG) += ixp4xx_wdt.o obj-$(CONFIG_IXP2000_WATCHDOG) += ixp2000_wdt.o obj-$(CONFIG_8xx_WDT) += mpc8xx_wdt.o +obj-$(CONFIG_WATCHDOG_RTAS) += wdrtas.o # Only one watchdog can succeed. We probe the hardware watchdog # drivers first, then the softdog driver. This means if your hardware diff --git a/drivers/char/watchdog/wdrtas.c b/drivers/char/watchdog/wdrtas.c new file mode 100644 index 000000000000..619e2ffca33f --- /dev/null +++ b/drivers/char/watchdog/wdrtas.c @@ -0,0 +1,696 @@ +/* + * FIXME: add wdrtas_get_status and wdrtas_get_boot_status as soon as + * RTAS calls are available + */ + +/* + * RTAS watchdog driver + * + * (C) Copyright IBM Corp. 2005 + * device driver to exploit watchdog RTAS functions + * + * Authors : Utz Bacher + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define WDRTAS_MAGIC_CHAR 42 +#define WDRTAS_SUPPORTED_MASK (WDIOF_SETTIMEOUT | \ + WDIOF_MAGICCLOSE) + +MODULE_AUTHOR("Utz Bacher "); +MODULE_DESCRIPTION("RTAS watchdog driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR); +MODULE_ALIAS_MISCDEV(TEMP_MINOR); + +#ifdef CONFIG_WATCHDOG_NOWAYOUT +static int wdrtas_nowayout = 1; +#else +static int wdrtas_nowayout = 0; +#endif + +static atomic_t wdrtas_miscdev_open = ATOMIC_INIT(0); +static char wdrtas_expect_close = 0; + +static int wdrtas_interval; + +#define WDRTAS_THERMAL_SENSOR 3 +static int wdrtas_token_get_sensor_state; +#define WDRTAS_SURVEILLANCE_IND 9000 +static int wdrtas_token_set_indicator; +#define WDRTAS_SP_SPI 28 +static int wdrtas_token_get_sp; +static int wdrtas_token_event_scan; + +#define WDRTAS_DEFAULT_INTERVAL 300 + +#define WDRTAS_LOGBUFFER_LEN 128 +static char wdrtas_logbuffer[WDRTAS_LOGBUFFER_LEN]; + + +/*** watchdog access functions */ + +/** + * wdrtas_set_interval - sets the watchdog interval + * @interval: new interval + * + * returns 0 on success, <0 on failures + * + * wdrtas_set_interval sets the watchdog keepalive interval by calling the + * RTAS function set-indicator (surveillance). The unit of interval is + * seconds. + */ +static int +wdrtas_set_interval(int interval) +{ + long result; + static int print_msg = 10; + + /* rtas uses minutes */ + interval = (interval + 59) / 60; + + result = rtas_call(wdrtas_token_set_indicator, 3, 1, NULL, + WDRTAS_SURVEILLANCE_IND, 0, interval); + if ( (result < 0) && (print_msg) ) { + printk(KERN_ERR "wdrtas: setting the watchdog to %i " + "timeout failed: %li\n", interval, result); + print_msg--; + } + + return result; +} + +/** + * wdrtas_get_interval - returns the current watchdog interval + * @fallback_value: value (in seconds) to use, if the RTAS call fails + * + * returns the interval + * + * wdrtas_get_interval returns the current watchdog keepalive interval + * as reported by the RTAS function ibm,get-system-parameter. The unit + * of the return value is seconds. + */ +static int +wdrtas_get_interval(int fallback_value) +{ + long result; + char value[4]; + + result = rtas_call(wdrtas_token_get_sp, 3, 1, NULL, + WDRTAS_SP_SPI, (void *)__pa(&value), 4); + if ( (value[0] != 0) || (value[1] != 2) || (value[3] != 0) || + (result < 0) ) { + printk(KERN_WARNING "wdrtas: could not get sp_spi watchdog " + "timeout (%li). 
Continuing\n", result); + return fallback_value; + } + + /* rtas uses minutes */ + return ((int)value[2]) * 60; +} + +/** + * wdrtas_timer_start - starts watchdog + * + * wdrtas_timer_start starts the watchdog by calling the RTAS function + * set-interval (surveillance) + */ +static void +wdrtas_timer_start(void) +{ + wdrtas_set_interval(wdrtas_interval); +} + +/** + * wdrtas_timer_stop - stops watchdog + * + * wdrtas_timer_stop stops the watchdog timer by calling the RTAS function + * set-interval (surveillance) + */ +static void +wdrtas_timer_stop(void) +{ + wdrtas_set_interval(0); +} + +/** + * wdrtas_log_scanned_event - logs an event we received during keepalive + * + * wdrtas_log_scanned_event prints a message to the log buffer dumping + * the results of the last event-scan call + */ +static void +wdrtas_log_scanned_event(void) +{ + int i; + + for (i = 0; i < WDRTAS_LOGBUFFER_LEN; i += 16) + printk(KERN_INFO "wdrtas: dumping event (line %i/%i), data = " + "%02x %02x %02x %02x %02x %02x %02x %02x " + "%02x %02x %02x %02x %02x %02x %02x %02x\n", + (i / 16) + 1, (WDRTAS_LOGBUFFER_LEN / 16), + wdrtas_logbuffer[i + 0], wdrtas_logbuffer[i + 1], + wdrtas_logbuffer[i + 2], wdrtas_logbuffer[i + 3], + wdrtas_logbuffer[i + 4], wdrtas_logbuffer[i + 5], + wdrtas_logbuffer[i + 6], wdrtas_logbuffer[i + 7], + wdrtas_logbuffer[i + 8], wdrtas_logbuffer[i + 9], + wdrtas_logbuffer[i + 10], wdrtas_logbuffer[i + 11], + wdrtas_logbuffer[i + 12], wdrtas_logbuffer[i + 13], + wdrtas_logbuffer[i + 14], wdrtas_logbuffer[i + 15]); +} + +/** + * wdrtas_timer_keepalive - resets watchdog timer to keep system alive + * + * wdrtas_timer_keepalive restarts the watchdog timer by calling the + * RTAS function event-scan and repeats these calls as long as there are + * events available. All events will be dumped. + */ +static void +wdrtas_timer_keepalive(void) +{ + long result; + + do { + result = rtas_call(wdrtas_token_event_scan, 4, 1, NULL, + RTAS_EVENT_SCAN_ALL_EVENTS, 0, + (void *)__pa(wdrtas_logbuffer), + WDRTAS_LOGBUFFER_LEN); + if (result < 0) + printk(KERN_ERR "wdrtas: event-scan failed: %li\n", + result); + if (result == 0) + wdrtas_log_scanned_event(); + } while (result == 0); +} + +/** + * wdrtas_get_temperature - returns current temperature + * + * returns temperature or <0 on failures + * + * wdrtas_get_temperature returns the current temperature in Fahrenheit. It + * uses the RTAS call get-sensor-state, token 3 to do so + */ +static int +wdrtas_get_temperature(void) +{ + long result; + int temperature = 0; + + result = rtas_call(wdrtas_token_get_sensor_state, 2, 2, + (void *)__pa(&temperature), + WDRTAS_THERMAL_SENSOR, 0); + + if (result < 0) + printk(KERN_WARNING "wdrtas: reading the thermal sensor " + "faild: %li\n", result); + else + temperature = ((temperature * 9) / 5) + 32; /* fahrenheit */ + + return temperature; +} + +/** + * wdrtas_get_status - returns the status of the watchdog + * + * returns a bitmask of defines WDIOF_... as defined in + * include/linux/watchdog.h + */ +static int +wdrtas_get_status(void) +{ + return 0; /* TODO */ +} + +/** + * wdrtas_get_boot_status - returns the reason for the last boot + * + * returns a bitmask of defines WDIOF_... 
as defined in + * include/linux/watchdog.h, indicating why the watchdog rebooted the system + */ +static int +wdrtas_get_boot_status(void) +{ + return 0; /* TODO */ +} + +/*** watchdog API and operations stuff */ + +/* wdrtas_write - called when watchdog device is written to + * @file: file structure + * @buf: user buffer with data + * @len: amount to data written + * @ppos: position in file + * + * returns the number of successfully processed characters, which is always + * the number of bytes passed to this function + * + * wdrtas_write processes all the data given to it and looks for the magic + * character 'V'. This character allows the watchdog device to be closed + * properly. + */ +static ssize_t +wdrtas_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + int i; + char c; + + if (!len) + goto out; + + if (!wdrtas_nowayout) { + wdrtas_expect_close = 0; + /* look for 'V' */ + for (i = 0; i < len; i++) { + if (get_user(c, buf + i)) + return -EFAULT; + /* allow to close device */ + if (c == 'V') + wdrtas_expect_close = WDRTAS_MAGIC_CHAR; + } + } + + wdrtas_timer_keepalive(); + +out: + return len; +} + +/** + * wdrtas_ioctl - ioctl function for the watchdog device + * @inode: inode structure + * @file: file structure + * @cmd: command for ioctl + * @arg: argument pointer + * + * returns 0 on success, <0 on failure + * + * wdrtas_ioctl implements the watchdog API ioctls + */ +static int +wdrtas_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + int __user *argp = (void *)arg; + int i; + static struct watchdog_info wdinfo = { + .options = WDRTAS_SUPPORTED_MASK, + .firmware_version = 0, + .identity = "wdrtas" + }; + + switch (cmd) { + case WDIOC_GETSUPPORT: + if (copy_to_user(argp, &wdinfo, sizeof(wdinfo))) + return -EFAULT; + return 0; + + case WDIOC_GETSTATUS: + i = wdrtas_get_status(); + return put_user(i, argp); + + case WDIOC_GETBOOTSTATUS: + i = wdrtas_get_boot_status(); + return put_user(i, argp); + + case WDIOC_GETTEMP: + if (wdrtas_token_get_sensor_state == RTAS_UNKNOWN_SERVICE) + return -EOPNOTSUPP; + + i = wdrtas_get_temperature(); + return put_user(i, argp); + + case WDIOC_SETOPTIONS: + if (get_user(i, argp)) + return -EFAULT; + if (i & WDIOS_DISABLECARD) + wdrtas_timer_stop(); + if (i & WDIOS_ENABLECARD) { + wdrtas_timer_keepalive(); + wdrtas_timer_start(); + } + if (i & WDIOS_TEMPPANIC) { + /* not implemented. 
Done by H8 */ + } + return 0; + + case WDIOC_KEEPALIVE: + wdrtas_timer_keepalive(); + return 0; + + case WDIOC_SETTIMEOUT: + if (get_user(i, argp)) + return -EFAULT; + + if (wdrtas_set_interval(i)) + return -EINVAL; + + wdrtas_timer_keepalive(); + + if (wdrtas_token_get_sp == RTAS_UNKNOWN_SERVICE) + wdrtas_interval = i; + else + wdrtas_interval = wdrtas_get_interval(i); + /* fallthrough */ + + case WDIOC_GETTIMEOUT: + return put_user(wdrtas_interval, argp); + + default: + return -ENOIOCTLCMD; + } +} + +/** + * wdrtas_open - open function of watchdog device + * @inode: inode structure + * @file: file structure + * + * returns 0 on success, -EBUSY if the file has been opened already, <0 on + * other failures + * + * function called when watchdog device is opened + */ +static int +wdrtas_open(struct inode *inode, struct file *file) +{ + /* only open once */ + if (atomic_inc_return(&wdrtas_miscdev_open) > 1) { + atomic_dec(&wdrtas_miscdev_open); + return -EBUSY; + } + + wdrtas_timer_start(); + wdrtas_timer_keepalive(); + + return nonseekable_open(inode, file); +} + +/** + * wdrtas_close - close function of watchdog device + * @inode: inode structure + * @file: file structure + * + * returns 0 on success + * + * close function. Always succeeds + */ +static int +wdrtas_close(struct inode *inode, struct file *file) +{ + /* only stop watchdog, if this was announced using 'V' before */ + if (wdrtas_expect_close == WDRTAS_MAGIC_CHAR) + wdrtas_timer_stop(); + else { + printk(KERN_WARNING "wdrtas: got unexpected close. Watchdog " + "not stopped.\n"); + wdrtas_timer_keepalive(); + } + + wdrtas_expect_close = 0; + atomic_dec(&wdrtas_miscdev_open); + return 0; +} + +/** + * wdrtas_temp_read - gives back the temperature in fahrenheit + * @file: file structure + * @buf: user buffer + * @count: number of bytes to be read + * @ppos: position in file + * + * returns always 1 or -EFAULT in case of user space copy failures, <0 on + * other failures + * + * wdrtas_temp_read gives the temperature to the users by copying this + * value as one byte into the user space buffer. The unit is Fahrenheit... + */ +static ssize_t +wdrtas_temp_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + int temperature = 0; + + temperature = wdrtas_get_temperature(); + if (temperature < 0) + return temperature; + + if (copy_to_user(buf, &temperature, 1)) + return -EFAULT; + + return 1; +} + +/** + * wdrtas_temp_open - open function of temperature device + * @inode: inode structure + * @file: file structure + * + * returns 0 on success, <0 on failure + * + * function called when temperature device is opened + */ +static int +wdrtas_temp_open(struct inode *inode, struct file *file) +{ + return nonseekable_open(inode, file); +} + +/** + * wdrtas_temp_close - close function of temperature device + * @inode: inode structure + * @file: file structure + * + * returns 0 on success + * + * close function. 
Always succeeds + */ +static int +wdrtas_temp_close(struct inode *inode, struct file *file) +{ + return 0; +} + +/** + * wdrtas_reboot - reboot notifier function + * @nb: notifier block structure + * @code: reboot code + * @ptr: unused + * + * returns NOTIFY_DONE + * + * wdrtas_reboot stops the watchdog in case of a reboot + */ +static int +wdrtas_reboot(struct notifier_block *this, unsigned long code, void *ptr) +{ + if ( (code==SYS_DOWN) || (code==SYS_HALT) ) + wdrtas_timer_stop(); + + return NOTIFY_DONE; +} + +/*** initialization stuff */ + +static struct file_operations wdrtas_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .write = wdrtas_write, + .ioctl = wdrtas_ioctl, + .open = wdrtas_open, + .release = wdrtas_close, +}; + +static struct miscdevice wdrtas_miscdev = { + .minor = WATCHDOG_MINOR, + .name = "watchdog", + .fops = &wdrtas_fops, +}; + +static struct file_operations wdrtas_temp_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = wdrtas_temp_read, + .open = wdrtas_temp_open, + .release = wdrtas_temp_close, +}; + +static struct miscdevice wdrtas_tempdev = { + .minor = TEMP_MINOR, + .name = "temperature", + .fops = &wdrtas_temp_fops, +}; + +static struct notifier_block wdrtas_notifier = { + .notifier_call = wdrtas_reboot, +}; + +/** + * wdrtas_get_tokens - reads in RTAS tokens + * + * returns 0 on succes, <0 on failure + * + * wdrtas_get_tokens reads in the tokens for the RTAS calls used in + * this watchdog driver. It tolerates, if "get-sensor-state" and + * "ibm,get-system-parameter" are not available. + */ +static int +wdrtas_get_tokens(void) +{ + wdrtas_token_get_sensor_state = rtas_token("get-sensor-state"); + if (wdrtas_token_get_sensor_state == RTAS_UNKNOWN_SERVICE) { + printk(KERN_WARNING "wdrtas: couldn't get token for " + "get-sensor-state. Trying to continue without " + "temperature support.\n"); + } + + wdrtas_token_get_sp = rtas_token("ibm,get-system-parameter"); + if (wdrtas_token_get_sp == RTAS_UNKNOWN_SERVICE) { + printk(KERN_WARNING "wdrtas: couldn't get token for " + "ibm,get-system-parameter. Trying to continue with " + "a default timeout value of %i seconds.\n", + WDRTAS_DEFAULT_INTERVAL); + } + + wdrtas_token_set_indicator = rtas_token("set-indicator"); + if (wdrtas_token_set_indicator == RTAS_UNKNOWN_SERVICE) { + printk(KERN_ERR "wdrtas: couldn't get token for " + "set-indicator. Terminating watchdog code.\n"); + return -EIO; + } + + wdrtas_token_event_scan = rtas_token("event-scan"); + if (wdrtas_token_event_scan == RTAS_UNKNOWN_SERVICE) { + printk(KERN_ERR "wdrtas: couldn't get token for event-scan. " + "Terminating watchdog code.\n"); + return -EIO; + } + + return 0; +} + +/** + * wdrtas_unregister_devs - unregisters the misc dev handlers + * + * wdrtas_register_devs unregisters the watchdog and temperature watchdog + * misc devs + */ +static void +wdrtas_unregister_devs(void) +{ + misc_deregister(&wdrtas_miscdev); + if (wdrtas_token_get_sensor_state != RTAS_UNKNOWN_SERVICE) + misc_deregister(&wdrtas_tempdev); +} + +/** + * wdrtas_register_devs - registers the misc dev handlers + * + * returns 0 on succes, <0 on failure + * + * wdrtas_register_devs registers the watchdog and temperature watchdog + * misc devs + */ +static int +wdrtas_register_devs(void) +{ + int result; + + result = misc_register(&wdrtas_miscdev); + if (result) { + printk(KERN_ERR "wdrtas: couldn't register watchdog misc " + "device. 
Terminating watchdog code.\n"); + return result; + } + + if (wdrtas_token_get_sensor_state != RTAS_UNKNOWN_SERVICE) { + result = misc_register(&wdrtas_tempdev); + if (result) { + printk(KERN_WARNING "wdrtas: couldn't register " + "watchdog temperature misc device. Continuing " + "without temperature support.\n"); + wdrtas_token_get_sensor_state = RTAS_UNKNOWN_SERVICE; + } + } + + return 0; +} + +/** + * wdrtas_init - init function of the watchdog driver + * + * returns 0 on succes, <0 on failure + * + * registers the file handlers and the reboot notifier + */ +static int __init +wdrtas_init(void) +{ + if (wdrtas_get_tokens()) + return -ENODEV; + + if (wdrtas_register_devs()) + return -ENODEV; + + if (register_reboot_notifier(&wdrtas_notifier)) { + printk(KERN_ERR "wdrtas: could not register reboot notifier. " + "Terminating watchdog code.\n"); + wdrtas_unregister_devs(); + return -ENODEV; + } + + if (wdrtas_token_get_sp == RTAS_UNKNOWN_SERVICE) + wdrtas_interval = WDRTAS_DEFAULT_INTERVAL; + else + wdrtas_interval = wdrtas_get_interval(WDRTAS_DEFAULT_INTERVAL); + + return 0; +} + +/** + * wdrtas_exit - exit function of the watchdog driver + * + * unregisters the file handlers and the reboot notifier + */ +static void __exit +wdrtas_exit(void) +{ + if (!wdrtas_nowayout) + wdrtas_timer_stop(); + + wdrtas_unregister_devs(); + + unregister_reboot_notifier(&wdrtas_notifier); +} + +module_init(wdrtas_init); +module_exit(wdrtas_exit); From fef1c772fa154c16e0a54577e9ecb5480f7b937e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Jun 2005 09:43:37 +1000 Subject: [PATCH 015/199] [PATCH] ppc64: add BPA platform type This adds the basic support for running on BPA machines. So far, this is only the IBM workstation, and it will not run on others without a little more generalization. It should be possible to configure a kernel for any combination of CONFIG_PPC_BPA with any of the other multiplatform targets. 
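For reference, the multiplatform kernel picks its platform by walking the
machines[] table that this patch extends with &bpa_md: the first entry whose
probe() hook accepts the platform number detected at boot wins, and its
machdep_calls are adopted. In rough outline (a simplified sketch, not code
from this patch; variable names are illustrative):

	struct machdep_calls **mach;

	/* first probe() that accepts the platform number wins;
	 * the new bpa_probe() accepts PLATFORM_BPA */
	for (mach = machines; *mach != NULL; mach++)
		if ((*mach)->probe(platform))
			break;
	if (*mach != NULL)
		ppc_md = **mach;	/* adopt that platform's callbacks */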
Signed-off-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- MAINTAINERS | 7 ++ arch/ppc64/Kconfig | 6 +- arch/ppc64/Makefile | 2 + arch/ppc64/kernel/Makefile | 3 + arch/ppc64/kernel/bpa_setup.c | 135 +++++++++++++++++++++++++++ arch/ppc64/kernel/cpu_setup_power4.S | 16 +++- arch/ppc64/kernel/cputable.c | 11 +++ arch/ppc64/kernel/irq.c | 3 + arch/ppc64/kernel/proc_ppc64.c | 2 +- arch/ppc64/kernel/prom_init.c | 4 +- arch/ppc64/kernel/setup.c | 4 + arch/ppc64/kernel/traps.c | 4 + include/asm-ppc64/mmu.h | 5 +- include/asm-ppc64/processor.h | 15 ++- include/asm-ppc64/smp.h | 8 ++ 15 files changed, 216 insertions(+), 9 deletions(-) create mode 100644 arch/ppc64/kernel/bpa_setup.c diff --git a/MAINTAINERS b/MAINTAINERS index 4d44824884ef..5eaa6807cdb7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -504,6 +504,13 @@ L: bonding-devel@lists.sourceforge.net W: http://sourceforge.net/projects/bonding/ S: Supported +BROADBAND PROCESSOR ARCHITECTURE +P: Arnd Bergmann +M: arnd@arndb.de +L: linuxppc64-dev@ozlabs.org +W: http://linuxppc64.org +S: Supported + BTTV VIDEO4LINUX DRIVER P: Gerd Knorr M: kraxel@bytesex.org diff --git a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig index 0f1fa289744e..c7f2f0a4d856 100644 --- a/arch/ppc64/Kconfig +++ b/arch/ppc64/Kconfig @@ -77,6 +77,10 @@ config PPC_PSERIES bool " IBM pSeries & new iSeries" default y +config PPC_BPA + bool " Broadband Processor Architecture" + depends on PPC_MULTIPLATFORM + config PPC_PMAC depends on PPC_MULTIPLATFORM bool " Apple G5 based machines" @@ -256,7 +260,7 @@ config MSCHUNKS config PPC_RTAS bool - depends on PPC_PSERIES + depends on PPC_PSERIES || PPC_BPA default y config RTAS_PROC diff --git a/arch/ppc64/Makefile b/arch/ppc64/Makefile index 33c752ceca4b..731b84758331 100644 --- a/arch/ppc64/Makefile +++ b/arch/ppc64/Makefile @@ -90,12 +90,14 @@ boot := arch/ppc64/boot boottarget-$(CONFIG_PPC_PSERIES) := zImage zImage.initrd boottarget-$(CONFIG_PPC_MAPLE) := zImage zImage.initrd boottarget-$(CONFIG_PPC_ISERIES) := vmlinux.sminitrd vmlinux.initrd vmlinux.sm +boottarget-$(CONFIG_PPC_BPA) := zImage zImage.initrd $(boottarget-y): vmlinux $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ bootimage-$(CONFIG_PPC_PSERIES) := $(boot)/zImage bootimage-$(CONFIG_PPC_PMAC) := vmlinux bootimage-$(CONFIG_PPC_MAPLE) := $(boot)/zImage +bootimage-$(CONFIG_PPC_BPA) := zImage bootimage-$(CONFIG_PPC_ISERIES) := vmlinux BOOTIMAGE := $(bootimage-y) install: vmlinux diff --git a/arch/ppc64/kernel/Makefile b/arch/ppc64/kernel/Makefile index f389f2453daa..c89983ab9098 100644 --- a/arch/ppc64/kernel/Makefile +++ b/arch/ppc64/kernel/Makefile @@ -33,6 +33,8 @@ obj-$(CONFIG_PPC_PSERIES) += pSeries_pci.o pSeries_lpar.o pSeries_hvCall.o \ pSeries_nvram.o rtasd.o ras.o pSeries_reconfig.o \ xics.o pSeries_setup.o pSeries_iommu.o +obj-$(CONFIG_PPC_BPA) += bpa_setup.o bpa_nvram.o + obj-$(CONFIG_EEH) += eeh.o obj-$(CONFIG_PROC_FS) += proc_ppc64.o obj-$(CONFIG_RTAS_FLASH) += rtas_flash.o @@ -59,6 +61,7 @@ ifdef CONFIG_SMP obj-$(CONFIG_PPC_PMAC) += pmac_smp.o smp-tbsync.o obj-$(CONFIG_PPC_ISERIES) += iSeries_smp.o obj-$(CONFIG_PPC_PSERIES) += pSeries_smp.o +obj-$(CONFIG_PPC_BPA) += pSeries_smp.o obj-$(CONFIG_PPC_MAPLE) += smp-tbsync.o endif diff --git a/arch/ppc64/kernel/bpa_setup.c b/arch/ppc64/kernel/bpa_setup.c new file mode 100644 index 000000000000..d1992dd2d61b --- /dev/null +++ b/arch/ppc64/kernel/bpa_setup.c @@ -0,0 +1,135 @@ +/* + * linux/arch/ppc/kernel/bpa_setup.c + * + * Copyright (C) 1995 Linus Torvalds + * Adapted from 'alpha' version by Gary Thomas + * Modified by Cort 
Dougan (cort@cs.nmt.edu) + * Modified by PPC64 Team, IBM Corp + * Modified by BPA Team, IBM Deutschland Entwicklung GmbH + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pci.h" + +#ifdef DEBUG +#define DBG(fmt...) udbg_printf(fmt) +#else +#define DBG(fmt...) +#endif + +void bpa_get_cpuinfo(struct seq_file *m) +{ + struct device_node *root; + const char *model = ""; + + root = of_find_node_by_path("/"); + if (root) + model = get_property(root, "model", NULL); + seq_printf(m, "machine\t\t: BPA %s\n", model); + of_node_put(root); +} + +static void bpa_progress(char *s, unsigned short hex) +{ + printk("*** %04x : %s\n", hex, s ? s : ""); +} + +static void __init bpa_setup_arch(void) +{ +#ifdef CONFIG_SMP + smp_init_pSeries(); +#endif + + /* init to some ~sane value until calibrate_delay() runs */ + loops_per_jiffy = 50000000; + + if (ROOT_DEV == 0) { + printk("No ramdisk, default root is /dev/hda2\n"); + ROOT_DEV = Root_HDA2; + } + + /* Find and initialize PCI host bridges */ + init_pci_config_tokens(); + find_and_init_phbs(); + +#ifdef CONFIG_DUMMY_CONSOLE + conswitchp = &dummy_con; +#endif + + bpa_nvram_init(); +} + +/* + * Early initialization. Relocation is on but do not reference unbolted pages + */ +static void __init bpa_init_early(void) +{ + DBG(" -> bpa_init_early()\n"); + + hpte_init_native(); + + pci_direct_iommu_init(); + + ppc64_interrupt_controller = IC_BPA_IIC; + + DBG(" <- bpa_init_early()\n"); +} + + +static int __init bpa_probe(int platform) +{ + if (platform != PLATFORM_BPA) + return 0; + + return 1; +} + +struct machdep_calls __initdata bpa_md = { + .probe = bpa_probe, + .setup_arch = bpa_setup_arch, + .init_early = bpa_init_early, + .get_cpuinfo = bpa_get_cpuinfo, + .restart = rtas_restart, + .power_off = rtas_power_off, + .halt = rtas_halt, + .get_boot_time = rtas_get_boot_time, + .get_rtc_time = rtas_get_rtc_time, + .set_rtc_time = rtas_set_rtc_time, + .calibrate_decr = generic_calibrate_decr, + .progress = bpa_progress, +}; diff --git a/arch/ppc64/kernel/cpu_setup_power4.S b/arch/ppc64/kernel/cpu_setup_power4.S index 3bd951820850..42fc08cf87a0 100644 --- a/arch/ppc64/kernel/cpu_setup_power4.S +++ b/arch/ppc64/kernel/cpu_setup_power4.S @@ -73,7 +73,21 @@ _GLOBAL(__970_cpu_preinit) _GLOBAL(__setup_cpu_power4) blr - + +_GLOBAL(__setup_cpu_be) + /* Set large page sizes LP=0: 16MB, LP=1: 64KB */ + addi r3, 0, 0 + ori r3, r3, HID6_LB + sldi r3, r3, 32 + nor r3, r3, r3 + mfspr r4, SPRN_HID6 + and r4, r4, r3 + addi r3, 0, 0x02000 + sldi r3, r3, 32 + or r4, r4, r3 + mtspr SPRN_HID6, r4 + blr + _GLOBAL(__setup_cpu_ppc970) mfspr r0,SPRN_HID0 li r11,5 /* clear DOZE and SLEEP */ diff --git a/arch/ppc64/kernel/cputable.c b/arch/ppc64/kernel/cputable.c index 8644a8648058..1d162c7c59df 100644 --- a/arch/ppc64/kernel/cputable.c +++ b/arch/ppc64/kernel/cputable.c @@ -34,6 +34,7 @@ EXPORT_SYMBOL(cur_cpu_spec); extern void __setup_cpu_power3(unsigned long offset, struct cpu_spec* spec); extern void __setup_cpu_power4(unsigned long offset, struct cpu_spec* spec); extern 
void __setup_cpu_ppc970(unsigned long offset, struct cpu_spec* spec); +extern void __setup_cpu_be(unsigned long offset, struct cpu_spec* spec); /* We only set the altivec features if the kernel was compiled with altivec @@ -162,6 +163,16 @@ struct cpu_spec cpu_specs[] = { __setup_cpu_power4, COMMON_PPC64_FW }, + { /* BE DD1.x */ + 0xffff0000, 0x00700000, "Broadband Engine", + CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE | + CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_ALTIVEC_COMP | + CPU_FTR_SMT, + COMMON_USER_PPC64 | PPC_FEATURE_HAS_ALTIVEC_COMP, + 128, 128, + __setup_cpu_be, + COMMON_PPC64_FW + }, { /* default match */ 0x00000000, 0x00000000, "POWER4 (compatible)", CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE | diff --git a/arch/ppc64/kernel/irq.c b/arch/ppc64/kernel/irq.c index d860467b8f09..3defc8c33adf 100644 --- a/arch/ppc64/kernel/irq.c +++ b/arch/ppc64/kernel/irq.c @@ -395,6 +395,9 @@ int virt_irq_create_mapping(unsigned int real_irq) if (ppc64_interrupt_controller == IC_OPEN_PIC) return real_irq; /* no mapping for openpic (for now) */ + if (ppc64_interrupt_controller == IC_BPA_IIC) + return real_irq; /* no mapping for iic either */ + /* don't map interrupts < MIN_VIRT_IRQ */ if (real_irq < MIN_VIRT_IRQ) { virt_irq_to_real_map[real_irq] = real_irq; diff --git a/arch/ppc64/kernel/proc_ppc64.c b/arch/ppc64/kernel/proc_ppc64.c index 0914b0669b05..a87c66a9652a 100644 --- a/arch/ppc64/kernel/proc_ppc64.c +++ b/arch/ppc64/kernel/proc_ppc64.c @@ -53,7 +53,7 @@ static int __init proc_ppc64_create(void) if (!root) return 1; - if (!(systemcfg->platform & PLATFORM_PSERIES)) + if (!(systemcfg->platform & (PLATFORM_PSERIES | PLATFORM_BPA))) return 0; if (!proc_mkdir("rtas", root)) diff --git a/arch/ppc64/kernel/prom_init.c b/arch/ppc64/kernel/prom_init.c index b7683abfbe6a..e248a7950aeb 100644 --- a/arch/ppc64/kernel/prom_init.c +++ b/arch/ppc64/kernel/prom_init.c @@ -1915,9 +1915,9 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, unsigned long prom_send_capabilities(); /* - * On pSeries, copy the CPU hold code + * On pSeries and BPA, copy the CPU hold code */ - if (RELOC(of_platform) & PLATFORM_PSERIES) + if (RELOC(of_platform) & (PLATFORM_PSERIES | PLATFORM_BPA)) copy_and_flush(0, KERNELBASE - offset, 0x100, 0); /* diff --git a/arch/ppc64/kernel/setup.c b/arch/ppc64/kernel/setup.c index 93b0ee88cda1..10222008fe20 100644 --- a/arch/ppc64/kernel/setup.c +++ b/arch/ppc64/kernel/setup.c @@ -344,6 +344,7 @@ static void __init setup_cpu_maps(void) extern struct machdep_calls pSeries_md; extern struct machdep_calls pmac_md; extern struct machdep_calls maple_md; +extern struct machdep_calls bpa_md; /* Ultimately, stuff them in an elf section like initcalls... 
*/ static struct machdep_calls __initdata *machines[] = { @@ -356,6 +357,9 @@ static struct machdep_calls __initdata *machines[] = { #ifdef CONFIG_PPC_MAPLE &maple_md, #endif /* CONFIG_PPC_MAPLE */ +#ifdef CONFIG_PPC_BPA + &bpa_md, +#endif NULL }; diff --git a/arch/ppc64/kernel/traps.c b/arch/ppc64/kernel/traps.c index 7e52cb2605e0..a8d5e83ee89f 100644 --- a/arch/ppc64/kernel/traps.c +++ b/arch/ppc64/kernel/traps.c @@ -126,6 +126,10 @@ int die(const char *str, struct pt_regs *regs, long err) printk("POWERMAC "); nl = 1; break; + case PLATFORM_BPA: + printk("BPA "); + nl = 1; + break; } if (nl) printk("\n"); diff --git a/include/asm-ppc64/mmu.h b/include/asm-ppc64/mmu.h index c78282a67d8e..9d03a98a4fa3 100644 --- a/include/asm-ppc64/mmu.h +++ b/include/asm-ppc64/mmu.h @@ -47,9 +47,10 @@ #define SLB_VSID_KS ASM_CONST(0x0000000000000800) #define SLB_VSID_KP ASM_CONST(0x0000000000000400) #define SLB_VSID_N ASM_CONST(0x0000000000000200) /* no-execute */ -#define SLB_VSID_L ASM_CONST(0x0000000000000100) /* largepage 16M */ +#define SLB_VSID_L ASM_CONST(0x0000000000000100) /* largepage */ #define SLB_VSID_C ASM_CONST(0x0000000000000080) /* class */ - +#define SLB_VSID_LS ASM_CONST(0x0000000000000070) /* size of largepage */ + #define SLB_VSID_KERNEL (SLB_VSID_KP|SLB_VSID_C) #define SLB_VSID_USER (SLB_VSID_KP|SLB_VSID_KS) diff --git a/include/asm-ppc64/processor.h b/include/asm-ppc64/processor.h index 3084099086a8..af28aa55d8c1 100644 --- a/include/asm-ppc64/processor.h +++ b/include/asm-ppc64/processor.h @@ -138,8 +138,16 @@ #define SPRN_NIADORM 0x3F3 /* Hardware Implementation Register 2 */ #define SPRN_HID4 0x3F4 /* 970 HID4 */ #define SPRN_HID5 0x3F6 /* 970 HID5 */ -#define SPRN_TSC 0x3FD /* Thread switch control */ -#define SPRN_TST 0x3FC /* Thread switch timeout */ +#define SPRN_HID6 0x3F9 /* BE HID 6 */ +#define HID6_LB (0x0F<<12) /* Concurrent Large Page Modes */ +#define HID6_DLP (1<<20) /* Disable all large page modes (4K only) */ +#define SPRN_TSCR 0x399 /* Thread switch control on BE */ +#define SPRN_TTR 0x39A /* Thread switch timeout on BE */ +#define TSCR_DEC_ENABLE 0x200000 /* Decrementer Interrupt */ +#define TSCR_EE_ENABLE 0x100000 /* External Interrupt */ +#define TSCR_EE_BOOST 0x080000 /* External Interrupt Boost */ +#define SPRN_TSC 0x3FD /* Thread switch control on others */ +#define SPRN_TST 0x3FC /* Thread switch timeout on others */ #define SPRN_L2CR 0x3F9 /* Level 2 Cache Control Regsiter */ #define SPRN_LR 0x008 /* Link Register */ #define SPRN_PIR 0x3FF /* Processor Identification Register */ @@ -259,6 +267,7 @@ #define PV_970FX 0x003C #define PV_630 0x0040 #define PV_630p 0x0041 +#define PV_BE 0x0070 /* Platforms supported by PPC64 */ #define PLATFORM_PSERIES 0x0100 @@ -267,6 +276,7 @@ #define PLATFORM_LPAR 0x0001 #define PLATFORM_POWERMAC 0x0400 #define PLATFORM_MAPLE 0x0500 +#define PLATFORM_BPA 0x1000 /* Compatibility with drivers coming from PPC32 world */ #define _machine (systemcfg->platform) @@ -278,6 +288,7 @@ #define IC_INVALID 0 #define IC_OPEN_PIC 1 #define IC_PPC_XIC 2 +#define IC_BPA_IIC 3 #define XGLUE(a,b) a##b #define GLUE(a,b) XGLUE(a,b) diff --git a/include/asm-ppc64/smp.h b/include/asm-ppc64/smp.h index 8115ecb8feee..d86f742e9a21 100644 --- a/include/asm-ppc64/smp.h +++ b/include/asm-ppc64/smp.h @@ -85,6 +85,14 @@ extern void smp_generic_take_timebase(void); extern struct smp_ops_t *smp_ops; +#ifdef CONFIG_PPC_PSERIES +void vpa_init(int cpu); +#else +static inline void vpa_init(int cpu) +{ +} +#endif /* CONFIG_PPC_PSERIES */ + #endif /* __ASSEMBLY__ 
*/ #endif /* !(_PPC64_SMP_H) */ From cebf589c822b5de87098b57644024d16f8dbc1bb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Jun 2005 09:43:43 +1000 Subject: [PATCH 016/199] [PATCH] ppc64: Add driver for BPA interrupt controllers Add support for the integrated interrupt controller on BPA CPUs. There is one of those for each SMT thread. The mapping of interrupt numbers to HW interrupt sources is described in arch/ppc64/kernel/bpa_iic.h. This version hardcodes the 'Spider' chip as the secondary interrupt controller. That is not really generic for the architecture, but at the moment it is the only secondary PIC that exists. A little more work will be needed on this as soon as we have boards with multiple external interrupt controllers. Signed-off-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- arch/ppc64/Kconfig | 15 ++ arch/ppc64/kernel/Makefile | 8 +- arch/ppc64/kernel/bpa_iic.c | 270 ++++++++++++++++++++++++++++++++ arch/ppc64/kernel/bpa_iic.h | 62 ++++++++ arch/ppc64/kernel/bpa_setup.c | 6 +- arch/ppc64/kernel/pSeries_smp.c | 69 +++++++- arch/ppc64/kernel/smp.c | 4 +- arch/ppc64/kernel/spider-pic.c | 191 ++++++++++++++++++++++ 8 files changed, 614 insertions(+), 11 deletions(-) create mode 100644 arch/ppc64/kernel/bpa_iic.c create mode 100644 arch/ppc64/kernel/bpa_iic.h create mode 100644 arch/ppc64/kernel/spider-pic.c diff --git a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig index c7f2f0a4d856..bae56ec76ea7 100644 --- a/arch/ppc64/Kconfig +++ b/arch/ppc64/Kconfig @@ -110,6 +110,21 @@ config PPC_OF bool default y +config XICS + depends on PPC_PSERIES + bool + default y + +config MPIC + depends on PPC_PSERIES || PPC_PMAC || PPC_MAPLE + bool + default y + +config BPA_IIC + depends on PPC_BPA + bool + default y + # VMX is pSeries only for now until somebody writes the iSeries # exception vectors for it config ALTIVEC diff --git a/arch/ppc64/kernel/Makefile b/arch/ppc64/kernel/Makefile index c89983ab9098..bc58d80ee3d0 100644 --- a/arch/ppc64/kernel/Makefile +++ b/arch/ppc64/kernel/Makefile @@ -27,13 +27,13 @@ obj-$(CONFIG_PPC_ISERIES) += HvCall.o HvLpConfig.o LparData.o \ mf.o HvLpEvent.o iSeries_proc.o iSeries_htab.o \ iSeries_iommu.o -obj-$(CONFIG_PPC_MULTIPLATFORM) += nvram.o i8259.o prom_init.o prom.o mpic.o +obj-$(CONFIG_PPC_MULTIPLATFORM) += nvram.o i8259.o prom_init.o prom.o obj-$(CONFIG_PPC_PSERIES) += pSeries_pci.o pSeries_lpar.o pSeries_hvCall.o \ pSeries_nvram.o rtasd.o ras.o pSeries_reconfig.o \ - xics.o pSeries_setup.o pSeries_iommu.o + pSeries_setup.o pSeries_iommu.o -obj-$(CONFIG_PPC_BPA) += bpa_setup.o bpa_nvram.o +obj-$(CONFIG_PPC_BPA) += bpa_setup.o bpa_nvram.o bpa_iic.o spider-pic.o obj-$(CONFIG_EEH) += eeh.o obj-$(CONFIG_PROC_FS) += proc_ppc64.o @@ -49,6 +49,8 @@ obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o obj-$(CONFIG_BOOTX_TEXT) += btext.o obj-$(CONFIG_HVCS) += hvcserver.o obj-$(CONFIG_IBMVIO) += vio.o +obj-$(CONFIG_XICS) += xics.o +obj-$(CONFIG_MPIC) += mpic.o obj-$(CONFIG_PPC_PMAC) += pmac_setup.o pmac_feature.o pmac_pci.o \ pmac_time.o pmac_nvram.o pmac_low_i2c.o diff --git a/arch/ppc64/kernel/bpa_iic.c b/arch/ppc64/kernel/bpa_iic.c new file mode 100644 index 000000000000..c8f3dc3fad70 --- /dev/null +++ b/arch/ppc64/kernel/bpa_iic.c @@ -0,0 +1,270 @@ +/* + * BPA Internal Interrupt Controller + * + * (C) Copyright IBM Deutschland Entwicklung GmbH 2005 + * + * Author: Arnd Bergmann + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; 
either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "bpa_iic.h" + +struct iic_pending_bits { + u32 data; + u8 flags; + u8 class; + u8 source; + u8 prio; +}; + +enum iic_pending_flags { + IIC_VALID = 0x80, + IIC_IPI = 0x40, +}; + +struct iic_regs { + struct iic_pending_bits pending; + struct iic_pending_bits pending_destr; + u64 generate; + u64 prio; +}; + +struct iic { + struct iic_regs __iomem *regs; +}; + +static DEFINE_PER_CPU(struct iic, iic); + +void iic_local_enable(void) +{ + out_be64(&__get_cpu_var(iic).regs->prio, 0xff); +} + +void iic_local_disable(void) +{ + out_be64(&__get_cpu_var(iic).regs->prio, 0x0); +} + +static unsigned int iic_startup(unsigned int irq) +{ + return 0; +} + +static void iic_enable(unsigned int irq) +{ + iic_local_enable(); +} + +static void iic_disable(unsigned int irq) +{ +} + +static void iic_end(unsigned int irq) +{ + iic_local_enable(); +} + +static struct hw_interrupt_type iic_pic = { + .typename = " BPA-IIC ", + .startup = iic_startup, + .enable = iic_enable, + .disable = iic_disable, + .end = iic_end, +}; + +static int iic_external_get_irq(struct iic_pending_bits pending) +{ + int irq; + unsigned char node, unit; + + node = pending.source >> 4; + unit = pending.source & 0xf; + irq = -1; + + /* + * This mapping is specific to the Broadband + * Engine. We might need to get the numbers + * from the device tree to support future CPUs. + */ + switch (unit) { + case 0x00: + case 0x0b: + /* + * One of these units can be connected + * to an external interrupt controller. + */ + if (pending.prio > 0x3f || + pending.class != 2) + break; + irq = IIC_EXT_OFFSET + + spider_get_irq(pending.prio + node * IIC_NODE_STRIDE) + + node * IIC_NODE_STRIDE; + break; + case 0x01 ... 0x04: + case 0x07 ... 
0x0a: + /* + * These units are connected to the SPEs + */ + if (pending.class > 2) + break; + irq = IIC_SPE_OFFSET + + pending.class * IIC_CLASS_STRIDE + + node * IIC_NODE_STRIDE + + unit; + break; + } + if (irq == -1) + printk(KERN_WARNING "Unexpected interrupt class %02x, " + "source %02x, prio %02x, cpu %02x\n", pending.class, + pending.source, pending.prio, smp_processor_id()); + return irq; +} + +/* Get an IRQ number from the pending state register of the IIC */ +int iic_get_irq(struct pt_regs *regs) +{ + struct iic *iic; + int irq; + struct iic_pending_bits pending; + + iic = &__get_cpu_var(iic); + *(unsigned long *) &pending = + in_be64((unsigned long __iomem *) &iic->regs->pending_destr); + + irq = -1; + if (pending.flags & IIC_VALID) { + if (pending.flags & IIC_IPI) { + irq = IIC_IPI_OFFSET + (pending.prio >> 4); +/* + if (irq > 0x80) + printk(KERN_WARNING "Unexpected IPI prio %02x" + "on CPU %02x\n", pending.prio, + smp_processor_id()); +*/ + } else { + irq = iic_external_get_irq(pending); + } + } + return irq; +} + +static struct iic_regs __iomem *find_iic(int cpu) +{ + struct device_node *np; + int nodeid = cpu / 2; + unsigned long regs; + struct iic_regs __iomem *iic_regs; + + for (np = of_find_node_by_type(NULL, "cpu"); + np; + np = of_find_node_by_type(np, "cpu")) { + if (nodeid == *(int *)get_property(np, "node-id", NULL)) + break; + } + + if (!np) { + printk(KERN_WARNING "IIC: CPU %d not found\n", cpu); + iic_regs = NULL; + } else { + regs = *(long *)get_property(np, "iic", NULL); + + /* hack until we have decided on the devtree info */ + regs += 0x400; + if (cpu & 1) + regs += 0x20; + + printk(KERN_DEBUG "IIC for CPU %d at %lx\n", cpu, regs); + iic_regs = __ioremap(regs, sizeof(struct iic_regs), + _PAGE_NO_CACHE); + } + return iic_regs; +} + +#ifdef CONFIG_SMP +void iic_setup_cpu(void) +{ + out_be64(&__get_cpu_var(iic).regs->prio, 0xff); +} + +void iic_cause_IPI(int cpu, int mesg) +{ + out_be64(&per_cpu(iic, cpu).regs->generate, mesg); +} + +static irqreturn_t iic_ipi_action(int irq, void *dev_id, struct pt_regs *regs) +{ + + smp_message_recv(irq - IIC_IPI_OFFSET, regs); + return IRQ_HANDLED; +} + +static void iic_request_ipi(int irq, const char *name) +{ + /* IPIs are marked SA_INTERRUPT as they must run with irqs + * disabled */ + get_irq_desc(irq)->handler = &iic_pic; + get_irq_desc(irq)->status |= IRQ_PER_CPU; + request_irq(irq, iic_ipi_action, SA_INTERRUPT, name, NULL); +} + +void iic_request_IPIs(void) +{ + iic_request_ipi(IIC_IPI_OFFSET + PPC_MSG_CALL_FUNCTION, "IPI-call"); + iic_request_ipi(IIC_IPI_OFFSET + PPC_MSG_RESCHEDULE, "IPI-resched"); +#ifdef CONFIG_DEBUGGER + iic_request_ipi(IIC_IPI_OFFSET + PPC_MSG_DEBUGGER_BREAK, "IPI-debug"); +#endif /* CONFIG_DEBUGGER */ +} +#endif /* CONFIG_SMP */ + +static void iic_setup_spe_handlers(void) +{ + int be, isrc; + + /* Assume two threads per BE are present */ + for (be=0; be < num_present_cpus() / 2; be++) { + for (isrc = 0; isrc < IIC_CLASS_STRIDE * 3; isrc++) { + int irq = IIC_NODE_STRIDE * be + IIC_SPE_OFFSET + isrc; + get_irq_desc(irq)->handler = &iic_pic; + } + } +} + +void iic_init_IRQ(void) +{ + int cpu, irq_offset; + struct iic *iic; + + irq_offset = 0; + for_each_cpu(cpu) { + iic = &per_cpu(iic, cpu); + iic->regs = find_iic(cpu); + if (iic->regs) + out_be64(&iic->regs->prio, 0xff); + } + iic_setup_spe_handlers(); +} diff --git a/arch/ppc64/kernel/bpa_iic.h b/arch/ppc64/kernel/bpa_iic.h new file mode 100644 index 000000000000..6833c3022166 --- /dev/null +++ b/arch/ppc64/kernel/bpa_iic.h @@ -0,0 +1,62 @@ +#ifndef 
ASM_BPA_IIC_H +#define ASM_BPA_IIC_H +#ifdef __KERNEL__ +/* + * Mapping of IIC pending bits into per-node + * interrupt numbers. + * + * IRQ FF CC SS PP FF CC SS PP Description + * + * 00-3f 80 02 +0 00 - 80 02 +0 3f South Bridge + * 00-3f 80 02 +b 00 - 80 02 +b 3f South Bridge + * 41-4a 80 00 +1 ** - 80 00 +a ** SPU Class 0 + * 51-5a 80 01 +1 ** - 80 01 +a ** SPU Class 1 + * 61-6a 80 02 +1 ** - 80 02 +a ** SPU Class 2 + * 70-7f C0 ** ** 00 - C0 ** ** 0f IPI + * + * F flags + * C class + * S source + * P Priority + * + node number + * * don't care + * + * A node consists of a Broadband Engine and an optional + * south bridge device providing a maximum of 64 IRQs. + * The south bridge may be connected to either IOIF0 + * or IOIF1. + * Each SPE is represented as three IRQ lines, one per + * interrupt class. + * 16 IRQ numbers are reserved for inter processor + * interruptions, although these are only used in the + * range of the first node. + * + * This scheme needs 128 IRQ numbers per BIF node ID, + * which means that with the total of 512 lines + * available, we can have a maximum of four nodes. + */ + +enum { + IIC_EXT_OFFSET = 0x00, /* Start of south bridge IRQs */ + IIC_NUM_EXT = 0x40, /* Number of south bridge IRQs */ + IIC_SPE_OFFSET = 0x40, /* Start of SPE interrupts */ + IIC_CLASS_STRIDE = 0x10, /* SPE IRQs per class */ + IIC_IPI_OFFSET = 0x70, /* Start of IPI IRQs */ + IIC_NUM_IPIS = 0x10, /* IRQs reserved for IPI */ + IIC_NODE_STRIDE = 0x80, /* Total IRQs per node */ +}; + +extern void iic_init_IRQ(void); +extern int iic_get_irq(struct pt_regs *regs); +extern void iic_cause_IPI(int cpu, int mesg); +extern void iic_request_IPIs(void); +extern void iic_setup_cpu(void); +extern void iic_local_enable(void); +extern void iic_local_disable(void); + + +extern void spider_init_IRQ(void); +extern int spider_get_irq(unsigned long int_pending); + +#endif +#endif /* ASM_BPA_IIC_H */ diff --git a/arch/ppc64/kernel/bpa_setup.c b/arch/ppc64/kernel/bpa_setup.c index d1992dd2d61b..0a43d8a93d76 100644 --- a/arch/ppc64/kernel/bpa_setup.c +++ b/arch/ppc64/kernel/bpa_setup.c @@ -45,6 +45,7 @@ #include #include "pci.h" +#include "bpa_iic.h" #ifdef DEBUG #define DBG(fmt...) udbg_printf(fmt) @@ -71,6 +72,9 @@ static void bpa_progress(char *s, unsigned short hex) static void __init bpa_setup_arch(void) { + ppc_md.init_IRQ = iic_init_IRQ; + ppc_md.get_irq = iic_get_irq; + #ifdef CONFIG_SMP smp_init_pSeries(); #endif @@ -86,7 +90,7 @@ static void __init bpa_setup_arch(void) /* Find and initialize PCI host bridges */ init_pci_config_tokens(); find_and_init_phbs(); - + spider_init_IRQ(); #ifdef CONFIG_DUMMY_CONSOLE conswitchp = &dummy_con; #endif diff --git a/arch/ppc64/kernel/pSeries_smp.c b/arch/ppc64/kernel/pSeries_smp.c index 4203bd020c82..30154140f7e2 100644 --- a/arch/ppc64/kernel/pSeries_smp.c +++ b/arch/ppc64/kernel/pSeries_smp.c @@ -1,5 +1,5 @@ /* - * SMP support for pSeries machines. + * SMP support for pSeries and BPA machines. * * Dave Engebretsen, Peter Bergner, and * Mike Corrigan {engebret|bergner|mikec}@us.ibm.com @@ -47,6 +47,7 @@ #include #include "mpic.h" +#include "bpa_iic.h" #ifdef DEBUG #define DBG(fmt...) 
udbg_printf(fmt) @@ -286,6 +287,7 @@ static inline int __devinit smp_startup_cpu(unsigned int lcpu) return 1; } +#ifdef CONFIG_XICS static inline void smp_xics_do_message(int cpu, int msg) { set_bit(msg, &xics_ipi_message[cpu].value); @@ -327,6 +329,37 @@ static void __devinit smp_xics_setup_cpu(int cpu) cpu_clear(cpu, of_spin_map); } +#endif /* CONFIG_XICS */ +#ifdef CONFIG_BPA_IIC +static void smp_iic_message_pass(int target, int msg) +{ + unsigned int i; + + if (target < NR_CPUS) { + iic_cause_IPI(target, msg); + } else { + for_each_online_cpu(i) { + if (target == MSG_ALL_BUT_SELF + && i == smp_processor_id()) + continue; + iic_cause_IPI(i, msg); + } + } +} + +static int __init smp_iic_probe(void) +{ + iic_request_IPIs(); + + return cpus_weight(cpu_possible_map); +} + +static void __devinit smp_iic_setup_cpu(int cpu) +{ + if (cpu != boot_cpuid) + iic_setup_cpu(); +} +#endif /* CONFIG_BPA_IIC */ static DEFINE_SPINLOCK(timebase_lock); static unsigned long timebase = 0; @@ -381,14 +414,15 @@ static int smp_pSeries_cpu_bootable(unsigned int nr) return 1; } - +#ifdef CONFIG_MPIC static struct smp_ops_t pSeries_mpic_smp_ops = { .message_pass = smp_mpic_message_pass, .probe = smp_mpic_probe, .kick_cpu = smp_pSeries_kick_cpu, .setup_cpu = smp_mpic_setup_cpu, }; - +#endif +#ifdef CONFIG_XICS static struct smp_ops_t pSeries_xics_smp_ops = { .message_pass = smp_xics_message_pass, .probe = smp_xics_probe, @@ -396,6 +430,16 @@ static struct smp_ops_t pSeries_xics_smp_ops = { .setup_cpu = smp_xics_setup_cpu, .cpu_bootable = smp_pSeries_cpu_bootable, }; +#endif +#ifdef CONFIG_BPA_IIC +static struct smp_ops_t bpa_iic_smp_ops = { + .message_pass = smp_iic_message_pass, + .probe = smp_iic_probe, + .kick_cpu = smp_pSeries_kick_cpu, + .setup_cpu = smp_iic_setup_cpu, + .cpu_bootable = smp_pSeries_cpu_bootable, +}; +#endif /* This is called very early */ void __init smp_init_pSeries(void) @@ -404,10 +448,25 @@ void __init smp_init_pSeries(void) DBG(" -> smp_init_pSeries()\n"); - if (ppc64_interrupt_controller == IC_OPEN_PIC) + switch (ppc64_interrupt_controller) { +#ifdef CONFIG_MPIC + case IC_OPEN_PIC: smp_ops = &pSeries_mpic_smp_ops; - else + break; +#endif +#ifdef CONFIG_XICS + case IC_PPC_XIC: smp_ops = &pSeries_xics_smp_ops; + break; +#endif +#ifdef CONFIG_BPA_IIC + case IC_BPA_IIC: + smp_ops = &bpa_iic_smp_ops; + break; +#endif + default: + panic("Invalid interrupt controller"); + } #ifdef CONFIG_HOTPLUG_CPU smp_ops->cpu_disable = pSeries_cpu_disable; diff --git a/arch/ppc64/kernel/smp.c b/arch/ppc64/kernel/smp.c index 9ef5d36d6b25..2fcddfcb594d 100644 --- a/arch/ppc64/kernel/smp.c +++ b/arch/ppc64/kernel/smp.c @@ -71,7 +71,7 @@ void smp_call_function_interrupt(void); int smt_enabled_at_boot = 1; -#ifdef CONFIG_PPC_MULTIPLATFORM +#ifdef CONFIG_MPIC void smp_mpic_message_pass(int target, int msg) { /* make sure we're sending something that translates to an IPI */ @@ -128,7 +128,7 @@ void __devinit smp_generic_kick_cpu(int nr) smp_mb(); } -#endif /* CONFIG_PPC_MULTIPLATFORM */ +#endif /* CONFIG_MPIC */ static void __init smp_space_timers(unsigned int max_cpus) { diff --git a/arch/ppc64/kernel/spider-pic.c b/arch/ppc64/kernel/spider-pic.c new file mode 100644 index 000000000000..d5c9a02fb119 --- /dev/null +++ b/arch/ppc64/kernel/spider-pic.c @@ -0,0 +1,191 @@ +/* + * External Interrupt Controller on Spider South Bridge + * + * (C) Copyright IBM Deutschland Entwicklung GmbH 2005 + * + * Author: Arnd Bergmann + * + * This program is free software; you can redistribute it and/or modify + * it under the terms 
of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include + +#include +#include +#include + +#include "bpa_iic.h" + +/* register layout taken from Spider spec, table 7.4-4 */ +enum { + TIR_DEN = 0x004, /* Detection Enable Register */ + TIR_MSK = 0x084, /* Mask Level Register */ + TIR_EDC = 0x0c0, /* Edge Detection Clear Register */ + TIR_PNDA = 0x100, /* Pending Register A */ + TIR_PNDB = 0x104, /* Pending Register B */ + TIR_CS = 0x144, /* Current Status Register */ + TIR_LCSA = 0x150, /* Level Current Status Register A */ + TIR_LCSB = 0x154, /* Level Current Status Register B */ + TIR_LCSC = 0x158, /* Level Current Status Register C */ + TIR_LCSD = 0x15c, /* Level Current Status Register D */ + TIR_CFGA = 0x200, /* Setting Register A0 */ + TIR_CFGB = 0x204, /* Setting Register B0 */ + /* 0x208 ... 0x3ff Setting Register An/Bn */ + TIR_PPNDA = 0x400, /* Packet Pending Register A */ + TIR_PPNDB = 0x404, /* Packet Pending Register B */ + TIR_PIERA = 0x408, /* Packet Output Error Register A */ + TIR_PIERB = 0x40c, /* Packet Output Error Register B */ + TIR_PIEN = 0x444, /* Packet Output Enable Register */ + TIR_PIPND = 0x454, /* Packet Output Pending Register */ + TIRDID = 0x484, /* Spider Device ID Register */ + REISTIM = 0x500, /* Reissue Command Timeout Time Setting */ + REISTIMEN = 0x504, /* Reissue Command Timeout Setting */ + REISWAITEN = 0x508, /* Reissue Wait Control*/ +}; + +static void __iomem *spider_pics[4]; + +static void __iomem *spider_get_pic(int irq) +{ + int node = irq / IIC_NODE_STRIDE; + irq %= IIC_NODE_STRIDE; + + if (irq >= IIC_EXT_OFFSET && + irq < IIC_EXT_OFFSET + IIC_NUM_EXT && + spider_pics) + return spider_pics[node]; + return NULL; +} + +static int spider_get_nr(unsigned int irq) +{ + return (irq % IIC_NODE_STRIDE) - IIC_EXT_OFFSET; +} + +static void __iomem *spider_get_irq_config(int irq) +{ + void __iomem *pic; + pic = spider_get_pic(irq); + return pic + TIR_CFGA + 8 * spider_get_nr(irq); +} + +static void spider_enable_irq(unsigned int irq) +{ + void __iomem *cfg = spider_get_irq_config(irq); + irq = spider_get_nr(irq); + + out_be32(cfg, in_be32(cfg) | 0x3107000eu); + out_be32(cfg + 4, in_be32(cfg + 4) | 0x00020000u | irq); +} + +static void spider_disable_irq(unsigned int irq) +{ + void __iomem *cfg = spider_get_irq_config(irq); + irq = spider_get_nr(irq); + + out_be32(cfg, in_be32(cfg) & ~0x30000000u); +} + +static unsigned int spider_startup_irq(unsigned int irq) +{ + spider_enable_irq(irq); + return 0; +} + +static void spider_shutdown_irq(unsigned int irq) +{ + spider_disable_irq(irq); +} + +static void spider_end_irq(unsigned int irq) +{ + spider_enable_irq(irq); +} + +static void spider_ack_irq(unsigned int irq) +{ + spider_disable_irq(irq); + iic_local_enable(); +} + +static struct hw_interrupt_type spider_pic = { + .typename = " SPIDER ", + .startup = spider_startup_irq, + .shutdown = spider_shutdown_irq, + .enable = spider_enable_irq, + .disable = spider_disable_irq, + .ack = spider_ack_irq, + 
.end = spider_end_irq, +}; + + +int spider_get_irq(unsigned long int_pending) +{ + void __iomem *regs = spider_get_pic(int_pending); + unsigned long cs; + int irq; + + cs = in_be32(regs + TIR_CS); + + irq = cs >> 24; + if (irq != 63) + return irq; + + return -1; +} + +void spider_init_IRQ(void) +{ + int node; + struct device_node *dn; + unsigned int *property; + long spiderpic; + int n; + +/* FIXME: detect multiple PICs as soon as the device tree has them */ + for (node = 0; node < 1; node++) { + dn = of_find_node_by_path("/"); + n = prom_n_addr_cells(dn); + property = (unsigned int *) get_property(dn, + "platform-spider-pic", NULL); + + if (!property) + continue; + for (spiderpic = 0; n > 0; --n) + spiderpic = (spiderpic << 32) + *property++; + printk(KERN_DEBUG "SPIDER addr: %lx\n", spiderpic); + spider_pics[node] = __ioremap(spiderpic, 0x800, _PAGE_NO_CACHE); + for (n = 0; n < IIC_NUM_EXT; n++) { + int irq = n + IIC_EXT_OFFSET + node * IIC_NODE_STRIDE; + get_irq_desc(irq)->handler = &spider_pic; + + /* do not mask any interrupts because of level */ + out_be32(spider_pics[node] + TIR_MSK, 0x0); + + /* disable edge detection clear */ + /* out_be32(spider_pics[node] + TIR_EDC, 0x0); */ + + /* enable interrupt packets to be output */ + out_be32(spider_pics[node] + TIR_PIEN, + in_be32(spider_pics[node] + TIR_PIEN) | 0x1); + + /* Enable the interrupt detection enable bit. Do this last! */ + out_be32(spider_pics[node] + TIR_DEN, + in_be32(spider_pics[node] +TIR_DEN) | 0x1); + + } + } +} From ae209cf10086b97e92e39af7cec0f84b21b6fca3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Jun 2005 09:43:54 +1000 Subject: [PATCH 017/199] [PATCH] ppc64: Add driver for BPA iommu Implementation of software load support for the BE iommu. This is very different from other iommu code on ppc64, since we only do a static mapping. The mapping is currently hardcoded; it should really be read from the firmware, but the firmware does not set up the device nodes yet. There is a single 512MB DMA window for PCI, USB and ethernet at 0x20000000 for our RAM. The Cell processor can put the I/O page table either in memory like the hashed page table (hardware load) or have the operating system write the entries into memory-mapped CPU registers (software load). I use the software load mechanism because I know that all I/O page table entries for the amount of installed physical memory fit into the IO TLB cache. At the point when we get machines with more than 4GB of installed memory, we can either use hardware I/O page table access like the other platforms do or dynamically update the I/O TLB entries when a page fault occurs in the I/O subsystem. The software load can then use the macros that I have implemented for the static mapping in order to do the TLB cache updates.
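To illustrate the effect of the static mapping (a sketch only, not part of this patch; all names in it are invented): with a fixed linear window, translating a bus address back to a real address is pure arithmetic, and every I/O page table entry can be computed and loaded once at boot rather than faulted in later.

/* Sketch only -- invented names; linear 512MB window as described above */
#define SKETCH_WINDOW_OFFSET	0x20000000ul	/* bus address where RAM starts */

static unsigned long sketch_bus_to_real(unsigned long bus_addr)
{
	/* real address = bus address minus the constant window offset */
	return bus_addr - SKETCH_WINDOW_OFFSET;
}

/* at boot, for each I/O page in the window:
 *	load_iopt_entry(bus, sketch_bus_to_real(bus));	(hypothetical helper)
 */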
Signed-off-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- arch/ppc64/kernel/Makefile | 3 +- arch/ppc64/kernel/bpa_iommu.c | 377 ++++++++++++++++++++++++++++++++++ arch/ppc64/kernel/bpa_iommu.h | 65 ++++++ arch/ppc64/kernel/bpa_setup.c | 3 +- 4 files changed, 446 insertions(+), 2 deletions(-) create mode 100644 arch/ppc64/kernel/bpa_iommu.c create mode 100644 arch/ppc64/kernel/bpa_iommu.h diff --git a/arch/ppc64/kernel/Makefile b/arch/ppc64/kernel/Makefile index bc58d80ee3d0..dffbfb7ac8d5 100644 --- a/arch/ppc64/kernel/Makefile +++ b/arch/ppc64/kernel/Makefile @@ -33,7 +33,8 @@ obj-$(CONFIG_PPC_PSERIES) += pSeries_pci.o pSeries_lpar.o pSeries_hvCall.o \ pSeries_nvram.o rtasd.o ras.o pSeries_reconfig.o \ pSeries_setup.o pSeries_iommu.o -obj-$(CONFIG_PPC_BPA) += bpa_setup.o bpa_nvram.o bpa_iic.o spider-pic.o +obj-$(CONFIG_PPC_BPA) += bpa_setup.o bpa_iommu.o bpa_nvram.o \ + bpa_iic.o spider-pic.o obj-$(CONFIG_EEH) += eeh.o obj-$(CONFIG_PROC_FS) += proc_ppc64.o diff --git a/arch/ppc64/kernel/bpa_iommu.c b/arch/ppc64/kernel/bpa_iommu.c new file mode 100644 index 000000000000..f33a7bccb0d7 --- /dev/null +++ b/arch/ppc64/kernel/bpa_iommu.c @@ -0,0 +1,377 @@ +/* + * IOMMU implementation for Broadband Processor Architecture + * We just establish a linear mapping at boot by setting all the + * IOPT cache entries in the CPU. + * The mapping functions should be identical to pci_direct_iommu, + * except for the handling of the high order bit that is required + * by the Spider bridge. These should be split into a separate + * file at the point where we get a different bridge chip. + * + * Copyright (C) 2005 IBM Deutschland Entwicklung GmbH, + * Arnd Bergmann + * + * Based on linear mapping + * Copyright (C) 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pci.h" +#include "bpa_iommu.h" + +static inline unsigned long +get_iopt_entry(unsigned long real_address, unsigned long ioid, + unsigned long prot) +{ + return (prot & IOPT_PROT_MASK) + | (IOPT_COHERENT) + | (IOPT_ORDER_VC) + | (real_address & IOPT_RPN_MASK) + | (ioid & IOPT_IOID_MASK); +} + +typedef struct { + unsigned long val; +} ioste; + +static inline ioste +mk_ioste(unsigned long val) +{ + ioste ioste = { .val = val, }; + return ioste; +} + +static inline ioste +get_iost_entry(unsigned long iopt_base, unsigned long io_address, unsigned page_size) +{ + unsigned long ps; + unsigned long iostep; + unsigned long nnpt; + unsigned long shift; + + switch (page_size) { + case 0x1000000: + ps = IOST_PS_16M; + nnpt = 0; /* one page per segment */ + shift = 5; /* segment has 16 iopt entries */ + break; + + case 0x100000: + ps = IOST_PS_1M; + nnpt = 0; /* one page per segment */ + shift = 1; /* segment has 256 iopt entries */ + break; + + case 0x10000: + ps = IOST_PS_64K; + nnpt = 0x07; /* 8 pages per io page table */ + shift = 0; /* all entries are used */ + break; + + case 0x1000: + ps = IOST_PS_4K; + nnpt = 0x7f; /* 128 pages per io page table */ + shift = 0; /* all entries are used */ + break; + + default: /* not a known compile time constant */ + BUILD_BUG_ON(1); + break; + } + + iostep = iopt_base + + /* need 8 bytes per iopte */ + (((io_address / page_size * 8) + /* align io page tables on 4k page boundaries */ + << shift) + /* nnpt+1 pages go into each iopt */ + & ~(nnpt << 12)); + + nnpt++; /* this seems to work, but the documentation is not clear + about wether we put nnpt or nnpt-1 into the ioste bits. + In theory, this can't work for 4k pages. 
*/ + return mk_ioste(IOST_VALID_MASK + | (iostep & IOST_PT_BASE_MASK) + | ((nnpt << 5) & IOST_NNPT_MASK) + | (ps & IOST_PS_MASK)); +} + +/* compute the address of an io pte */ +static inline unsigned long +get_ioptep(ioste iost_entry, unsigned long io_address) +{ + unsigned long iopt_base; + unsigned long page_size; + unsigned long page_number; + unsigned long iopt_offset; + + iopt_base = iost_entry.val & IOST_PT_BASE_MASK; + page_size = iost_entry.val & IOST_PS_MASK; + + /* decode page size to compute page number */ + page_number = (io_address & 0x0fffffff) >> (10 + 2 * page_size); + /* page number is an offset into the io page table */ + iopt_offset = (page_number << 3) & 0x7fff8ul; + return iopt_base + iopt_offset; +} + +/* compute the tag field of the iopt cache entry */ +static inline unsigned long +get_ioc_tag(ioste iost_entry, unsigned long io_address) +{ + unsigned long iopte = get_ioptep(iost_entry, io_address); + + return IOPT_VALID_MASK + | ((iopte & 0x00000000000000ff8ul) >> 3) + | ((iopte & 0x0000003fffffc0000ul) >> 9); +} + +/* compute the hashed 6 bit index for the 4-way associative pte cache */ +static inline unsigned long +get_ioc_hash(ioste iost_entry, unsigned long io_address) +{ + unsigned long iopte = get_ioptep(iost_entry, io_address); + + return ((iopte & 0x000000000000001f8ul) >> 3) + ^ ((iopte & 0x00000000000020000ul) >> 17) + ^ ((iopte & 0x00000000000010000ul) >> 15) + ^ ((iopte & 0x00000000000008000ul) >> 13) + ^ ((iopte & 0x00000000000004000ul) >> 11) + ^ ((iopte & 0x00000000000002000ul) >> 9) + ^ ((iopte & 0x00000000000001000ul) >> 7); +} + +/* same as above, but pretend that we have a simpler 1-way associative + pte cache with an 8 bit index */ +static inline unsigned long +get_ioc_hash_1way(ioste iost_entry, unsigned long io_address) +{ + unsigned long iopte = get_ioptep(iost_entry, io_address); + + return ((iopte & 0x000000000000001f8ul) >> 3) + ^ ((iopte & 0x00000000000020000ul) >> 17) + ^ ((iopte & 0x00000000000010000ul) >> 15) + ^ ((iopte & 0x00000000000008000ul) >> 13) + ^ ((iopte & 0x00000000000004000ul) >> 11) + ^ ((iopte & 0x00000000000002000ul) >> 9) + ^ ((iopte & 0x00000000000001000ul) >> 7) + ^ ((iopte & 0x0000000000000c000ul) >> 8); +} + +static inline ioste +get_iost_cache(void __iomem *base, unsigned long index) +{ + unsigned long __iomem *p = (base + IOC_ST_CACHE_DIR); + return mk_ioste(in_be64(&p[index])); +} + +static inline void +set_iost_cache(void __iomem *base, unsigned long index, ioste ste) +{ + unsigned long __iomem *p = (base + IOC_ST_CACHE_DIR); + pr_debug("ioste %02lx was %016lx, store %016lx", index, + get_iost_cache(base, index).val, ste.val); + out_be64(&p[index], ste.val); + pr_debug(" now %016lx\n", get_iost_cache(base, index).val); +} + +static inline unsigned long +get_iopt_cache(void __iomem *base, unsigned long index, unsigned long *tag) +{ + unsigned long __iomem *tags = (void *)(base + IOC_PT_CACHE_DIR); + unsigned long __iomem *p = (void *)(base + IOC_PT_CACHE_REG); + + *tag = tags[index]; + rmb(); + return *p; +} + +static inline void +set_iopt_cache(void __iomem *base, unsigned long index, + unsigned long tag, unsigned long val) +{ + unsigned long __iomem *tags = base + IOC_PT_CACHE_DIR; + unsigned long __iomem *p = base + IOC_PT_CACHE_REG; + pr_debug("iopt %02lx was v%016lx/t%016lx, store v%016lx/t%016lx\n", + index, get_iopt_cache(base, index, &oldtag), oldtag, val, tag); + + out_be64(p, val); + out_be64(&tags[index], tag); +} + +static inline void +set_iost_origin(void __iomem *base) +{ + unsigned long __iomem *p = 
base + IOC_ST_ORIGIN; + unsigned long origin = IOSTO_ENABLE | IOSTO_SW; + + pr_debug("iost_origin %016lx, now %016lx\n", in_be64(p), origin); + out_be64(p, origin); +} + +static inline void +set_iocmd_config(void __iomem *base) +{ + unsigned long __iomem *p = base + 0xc00; + unsigned long conf; + + conf = in_be64(p); + pr_debug("iost_conf %016lx, now %016lx\n", conf, conf | IOCMD_CONF_TE); + out_be64(p, conf | IOCMD_CONF_TE); +} + +/* FIXME: get these from the device tree */ +#define ioc_base 0x20000511000ull +#define ioc_mmio_base 0x20000510000ull +#define ioid 0x48a +#define iopt_phys_offset (- 0x20000000) /* We have a 512MB offset from the SB */ +#define io_page_size 0x1000000 + +static unsigned long map_iopt_entry(unsigned long address) +{ + switch (address >> 20) { + case 0x600: + address = 0x24020000000ull; /* spider i/o */ + break; + default: + address += iopt_phys_offset; + break; + } + + return get_iopt_entry(address, ioid, IOPT_PROT_RW); +} + +static void iommu_bus_setup_null(struct pci_bus *b) { } +static void iommu_dev_setup_null(struct pci_dev *d) { } + +/* initialize the iommu to support a simple linear mapping + * for each DMA window used by any device. For now, we + * happen to know that there is only one DMA window in use, + * starting at iopt_phys_offset. */ +static void bpa_map_iommu(void) +{ + unsigned long address; + void __iomem *base; + ioste ioste; + unsigned long index; + + base = __ioremap(ioc_base, 0x1000, _PAGE_NO_CACHE); + pr_debug("%lx mapped to %p\n", ioc_base, base); + set_iocmd_config(base); + iounmap(base); + + base = __ioremap(ioc_mmio_base, 0x1000, _PAGE_NO_CACHE); + pr_debug("%lx mapped to %p\n", ioc_mmio_base, base); + + set_iost_origin(base); + + for (address = 0; address < 0x100000000ul; address += io_page_size) { + ioste = get_iost_entry(0x10000000000ul, address, io_page_size); + if ((address & 0xfffffff) == 0) /* segment start */ + set_iost_cache(base, address >> 28, ioste); + index = get_ioc_hash_1way(ioste, address); + pr_debug("addr %08lx, index %02lx, ioste %016lx\n", + address, index, ioste.val); + set_iopt_cache(base, + get_ioc_hash_1way(ioste, address), + get_ioc_tag(ioste, address), + map_iopt_entry(address)); + } + iounmap(base); +} + + +static void *bpa_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, unsigned int __nocast flag) +{ + void *ret; + + ret = (void *)__get_free_pages(flag, get_order(size)); + if (ret != NULL) { + memset(ret, 0, size); + *dma_handle = virt_to_abs(ret) | BPA_DMA_VALID; + } + return ret; +} + +static void bpa_free_coherent(struct device *hwdev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + free_pages((unsigned long)vaddr, get_order(size)); +} + +static dma_addr_t bpa_map_single(struct device *hwdev, void *ptr, + size_t size, enum dma_data_direction direction) +{ + return virt_to_abs(ptr) | BPA_DMA_VALID; +} + +static void bpa_unmap_single(struct device *hwdev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction direction) +{ +} + +static int bpa_map_sg(struct device *hwdev, struct scatterlist *sg, + int nents, enum dma_data_direction direction) +{ + int i; + + for (i = 0; i < nents; i++, sg++) { + sg->dma_address = (page_to_phys(sg->page) + sg->offset) + | BPA_DMA_VALID; + sg->dma_length = sg->length; + } + + return nents; +} + +static void bpa_unmap_sg(struct device *hwdev, struct scatterlist *sg, + int nents, enum dma_data_direction direction) +{ +} + +static int bpa_dma_supported(struct device *dev, u64 mask) +{ + return mask < 0x100000000ull; +} + +void 
bpa_init_iommu(void) +{ + bpa_map_iommu(); + + /* Direct I/O, IOMMU off */ + ppc_md.iommu_dev_setup = iommu_dev_setup_null; + ppc_md.iommu_bus_setup = iommu_bus_setup_null; + + pci_dma_ops.alloc_coherent = bpa_alloc_coherent; + pci_dma_ops.free_coherent = bpa_free_coherent; + pci_dma_ops.map_single = bpa_map_single; + pci_dma_ops.unmap_single = bpa_unmap_single; + pci_dma_ops.map_sg = bpa_map_sg; + pci_dma_ops.unmap_sg = bpa_unmap_sg; + pci_dma_ops.dma_supported = bpa_dma_supported; +} diff --git a/arch/ppc64/kernel/bpa_iommu.h b/arch/ppc64/kernel/bpa_iommu.h new file mode 100644 index 000000000000..e547d77dfa04 --- /dev/null +++ b/arch/ppc64/kernel/bpa_iommu.h @@ -0,0 +1,65 @@ +#ifndef BPA_IOMMU_H +#define BPA_IOMMU_H + +/* some constants */ +enum { + /* segment table entries */ + IOST_VALID_MASK = 0x8000000000000000ul, + IOST_TAG_MASK = 0x3000000000000000ul, + IOST_PT_BASE_MASK = 0x000003fffffff000ul, + IOST_NNPT_MASK = 0x0000000000000fe0ul, + IOST_PS_MASK = 0x000000000000000ful, + + IOST_PS_4K = 0x1, + IOST_PS_64K = 0x3, + IOST_PS_1M = 0x5, + IOST_PS_16M = 0x7, + + /* iopt tag register */ + IOPT_VALID_MASK = 0x0000000200000000ul, + IOPT_TAG_MASK = 0x00000001fffffffful, + + /* iopt cache register */ + IOPT_PROT_MASK = 0xc000000000000000ul, + IOPT_PROT_NONE = 0x0000000000000000ul, + IOPT_PROT_READ = 0x4000000000000000ul, + IOPT_PROT_WRITE = 0x8000000000000000ul, + IOPT_PROT_RW = 0xc000000000000000ul, + IOPT_COHERENT = 0x2000000000000000ul, + + IOPT_ORDER_MASK = 0x1800000000000000ul, + /* order access to same IOID/VC on same address */ + IOPT_ORDER_ADDR = 0x0800000000000000ul, + /* similar, but only after a write access */ + IOPT_ORDER_WRITES = 0x1000000000000000ul, + /* Order all accesses to same IOID/VC */ + IOPT_ORDER_VC = 0x1800000000000000ul, + + IOPT_RPN_MASK = 0x000003fffffff000ul, + IOPT_HINT_MASK = 0x0000000000000800ul, + IOPT_IOID_MASK = 0x00000000000007fful, + + IOSTO_ENABLE = 0x8000000000000000ul, + IOSTO_ORIGIN = 0x000003fffffff000ul, + IOSTO_HW = 0x0000000000000800ul, + IOSTO_SW = 0x0000000000000400ul, + + IOCMD_CONF_TE = 0x0000800000000000ul, + + /* memory mapped registers */ + IOC_PT_CACHE_DIR = 0x000, + IOC_ST_CACHE_DIR = 0x800, + IOC_PT_CACHE_REG = 0x910, + IOC_ST_ORIGIN = 0x918, + IOC_CONF = 0x930, + + /* The high bit needs to be set on every DMA address, + only 2GB are addressable */ + BPA_DMA_VALID = 0x80000000, + BPA_DMA_MASK = 0x7fffffff, +}; + + +void bpa_init_iommu(void); + +#endif diff --git a/arch/ppc64/kernel/bpa_setup.c b/arch/ppc64/kernel/bpa_setup.c index 0a43d8a93d76..57b3db66f458 100644 --- a/arch/ppc64/kernel/bpa_setup.c +++ b/arch/ppc64/kernel/bpa_setup.c @@ -46,6 +46,7 @@ #include "pci.h" #include "bpa_iic.h" +#include "bpa_iommu.h" #ifdef DEBUG #define DBG(fmt...) udbg_printf(fmt) @@ -107,7 +108,7 @@ static void __init bpa_init_early(void) hpte_init_native(); - pci_direct_iommu_init(); + bpa_init_iommu(); ppc64_interrupt_controller = IC_BPA_IIC; From b7c84c6ada2be942eca6722edb2cfaad412cd5de Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 22 Jun 2005 20:26:07 -0700 Subject: [PATCH 018/199] [PATCH] boot_pageset must not be freed. The boot_pageset needs to be preserved for hotplugging and for offline processors and nodes. Otherwise pointers will point into memory that now has a different use. /proc/zoneinfo is currently showing strange results if processors / nodes are not present.
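The underlying rule can be shown with a minimal sketch (illustration only, not code from this patch; the names are made up): anything placed in an __initdata section is reclaimed when boot completes, so no pointer that survives init may reference it.

/* Sketch of the general __initdata lifetime bug this patch fixes */
static int early_value __initdata = 42;	/* memory reclaimed after boot */
static int *keeper;

static int __init grab_pointer(void)
{
	keeper = &early_value;	/* BUG: dangles once init memory is freed */
	return 0;
}

The boot pagesets hit exactly this pattern: pageset pointers for processors that never come online keep pointing at them after the init sections are freed, hence the removal of __initdata below.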
Signed-off-by: Christoph Lameter Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 206920796f5f..559336de9687 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1742,10 +1742,17 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) * with interrupts disabled. * * Some NUMA counter updates may also be caught by the boot pagesets. - * These will be discarded when bootup is complete. + * + * The boot_pagesets must be kept even after bootup is complete for + * unused processors and/or zones. They do play a role for bootstrapping + * hotplugged processors. + * + * zoneinfo_show() and maybe other functions do + * not check if the processor is online before following the pageset pointer. + * Other parts of the kernel may not check if the zone is available. */ static struct per_cpu_pageset - boot_pageset[NR_CPUS] __initdata; + boot_pageset[NR_CPUS]; /* * Dynamically allocate memory for the From 6ca4f65e6b390d09e1de7280cf9fd4f5d8e4b48b Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 22 Jun 2005 22:04:55 -0700 Subject: [PATCH 019/199] [NETPOLL]: Set poll_owner to -1 before unlocking in netpoll_poll_unlock() This trivial patch moves the assignment of poll_owner to -1 inside of the lock. This fixes a potential SMP race in the code. Signed-off-by: Jeff Moyer Signed-off-by: David S. Miller --- include/linux/netpoll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index c0d8b90c5202..449a4fde6587 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -53,8 +53,8 @@ static inline void netpoll_poll_lock(struct net_device *dev) static inline void netpoll_poll_unlock(struct net_device *dev) { if (dev->np) { - spin_unlock(&dev->np->poll_lock); dev->np->poll_owner = -1; + spin_unlock(&dev->np->poll_lock); } } From 115c1d6e61b70851d9a363328c3b8d4c2559a1d3 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 22 Jun 2005 22:05:31 -0700 Subject: [PATCH 020/199] [NETPOLL]: Introduce a netpoll_info struct This patch introduces a netpoll_info structure, which the struct net_device will now point to instead of pointing to a struct netpoll. The reason for this is two-fold: 1) fields such as the rx_flags, poll_owner, and poll_lock should be maintained per net_device, not per netpoll; and 2) this is a first step in providing support for multiple netpoll clients to register against the same net_device. The struct netpoll is now pointed to by the netpoll_info structure. As such, the previous behaviour of the code is preserved. Signed-off-by: Jeff Moyer Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 4 +-- include/linux/netpoll.h | 25 +++++++++++------ net/core/netpoll.c | 57 ++++++++++++++++++++++++++------------- 3 files changed, 57 insertions(+), 29 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ba5d1236aa17..d6afd440cf7b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -41,7 +41,7 @@ struct divert_blk; struct vlan_group; struct ethtool_ops; -struct netpoll; +struct netpoll_info; /* source back-compat hooks */ #define SET_ETHTOOL_OPS(netdev,ops) \ ( (netdev)->ethtool_ops = (ops) ) @@ -468,7 +468,7 @@ struct net_device unsigned char *haddr); int (*neigh_setup)(struct net_device *dev, struct neigh_parms *); #ifdef CONFIG_NETPOLL - struct netpoll *np; + struct netpoll_info *npinfo; #endif #ifdef CONFIG_NET_POLL_CONTROLLER void (*poll_controller)(struct net_device *dev); diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 449a4fde6587..388cd91bc7a6 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -16,14 +16,18 @@ struct netpoll; struct netpoll { struct net_device *dev; char dev_name[16], *name; - int rx_flags; void (*rx_hook)(struct netpoll *, int, char *, int); void (*drop)(struct sk_buff *skb); u32 local_ip, remote_ip; u16 local_port, remote_port; unsigned char local_mac[6], remote_mac[6]; +}; + +struct netpoll_info { spinlock_t poll_lock; int poll_owner; + int rx_flags; + struct netpoll *np; }; void netpoll_poll(struct netpoll *np); @@ -39,22 +43,27 @@ void netpoll_queue(struct sk_buff *skb); #ifdef CONFIG_NETPOLL static inline int netpoll_rx(struct sk_buff *skb) { - return skb->dev->np && skb->dev->np->rx_flags && __netpoll_rx(skb); + struct netpoll_info *npinfo = skb->dev->npinfo; + + if (!npinfo || !npinfo->rx_flags) + return 0; + + return npinfo->np && __netpoll_rx(skb); } static inline void netpoll_poll_lock(struct net_device *dev) { - if (dev->np) { - spin_lock(&dev->np->poll_lock); - dev->np->poll_owner = smp_processor_id(); + if (dev->npinfo) { + spin_lock(&dev->npinfo->poll_lock); + dev->npinfo->poll_owner = smp_processor_id(); } } static inline void netpoll_poll_unlock(struct net_device *dev) { - if (dev->np) { - dev->np->poll_owner = -1; - spin_unlock(&dev->np->poll_lock); + if (dev->npinfo) { + dev->npinfo->poll_owner = -1; + spin_unlock(&dev->npinfo->poll_lock); } } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index a119696d5521..ab3c0c9713b0 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -130,19 +130,20 @@ static int checksum_udp(struct sk_buff *skb, struct udphdr *uh, */ static void poll_napi(struct netpoll *np) { + struct netpoll_info *npinfo = np->dev->npinfo; int budget = 16; if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) && - np->poll_owner != smp_processor_id() && - spin_trylock(&np->poll_lock)) { - np->rx_flags |= NETPOLL_RX_DROP; + npinfo->poll_owner != smp_processor_id() && + spin_trylock(&npinfo->poll_lock)) { + npinfo->rx_flags |= NETPOLL_RX_DROP; atomic_inc(&trapped); np->dev->poll(np->dev, &budget); atomic_dec(&trapped); - np->rx_flags &= ~NETPOLL_RX_DROP; - spin_unlock(&np->poll_lock); + npinfo->rx_flags &= ~NETPOLL_RX_DROP; + spin_unlock(&npinfo->poll_lock); } } @@ -245,6 +246,7 @@ static struct sk_buff * find_skb(struct netpoll *np, int len, int reserve) static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { int status; + struct netpoll_info *npinfo; repeat: if(!np || !np->dev || !netif_running(np->dev)) { @@ -253,8 +255,9 @@ static void netpoll_send_skb(struct netpoll 
*np, struct sk_buff *skb) } /* avoid recursion */ - if(np->poll_owner == smp_processor_id() || - np->dev->xmit_lock_owner == smp_processor_id()) { + npinfo = np->dev->npinfo; + if (npinfo->poll_owner == smp_processor_id() || + np->dev->xmit_lock_owner == smp_processor_id()) { if (np->drop) np->drop(skb); else @@ -341,14 +344,18 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) static void arp_reply(struct sk_buff *skb) { + struct netpoll_info *npinfo = skb->dev->npinfo; struct arphdr *arp; unsigned char *arp_ptr; int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; u32 sip, tip; struct sk_buff *send_skb; - struct netpoll *np = skb->dev->np; + struct netpoll *np = NULL; - if (!np) return; + if (npinfo) + np = npinfo->np; + if (!np) + return; /* No arp on this interface */ if (skb->dev->flags & IFF_NOARP) @@ -429,7 +436,7 @@ int __netpoll_rx(struct sk_buff *skb) int proto, len, ulen; struct iphdr *iph; struct udphdr *uh; - struct netpoll *np = skb->dev->np; + struct netpoll *np = skb->dev->npinfo->np; if (!np->rx_hook) goto out; @@ -611,9 +618,7 @@ int netpoll_setup(struct netpoll *np) { struct net_device *ndev = NULL; struct in_device *in_dev; - - np->poll_lock = SPIN_LOCK_UNLOCKED; - np->poll_owner = -1; + struct netpoll_info *npinfo; if (np->dev_name) ndev = dev_get_by_name(np->dev_name); @@ -624,7 +629,16 @@ int netpoll_setup(struct netpoll *np) } np->dev = ndev; - ndev->np = np; + if (!ndev->npinfo) { + npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); + if (!npinfo) + goto release; + + npinfo->np = NULL; + npinfo->poll_lock = SPIN_LOCK_UNLOCKED; + npinfo->poll_owner = -1; + } else + npinfo = ndev->npinfo; if (!ndev->poll_controller) { printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n", @@ -693,12 +707,15 @@ int netpoll_setup(struct netpoll *np) } if(np->rx_hook) - np->rx_flags = NETPOLL_RX_ENABLED; + npinfo->rx_flags = NETPOLL_RX_ENABLED; + npinfo->np = np; + ndev->npinfo = npinfo; return 0; release: - ndev->np = NULL; + if (!ndev->npinfo) + kfree(npinfo); np->dev = NULL; dev_put(ndev); return -1; @@ -706,9 +723,11 @@ int netpoll_setup(struct netpoll *np) void netpoll_cleanup(struct netpoll *np) { - if (np->dev) - np->dev->np = NULL; - dev_put(np->dev); + if (np->dev) { + if (np->dev->npinfo) + np->dev->npinfo->np = NULL; + dev_put(np->dev); + } np->dev = NULL; } From fbeec2e1552949002065435c9829dc244ad85407 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 22 Jun 2005 22:05:59 -0700 Subject: [PATCH 021/199] [NETPOLL]: allow multiple netpoll_clients to register against one interface This patch provides support for registering multiple netpoll clients to the same network device. Only one of these clients may register an rx_hook, however. In practice, this restriction has not been problematic. It is worth mentioning, though, that the current design can be easily extended to allow for the registration of multiple rx_hooks. The basic idea of the patch is that the rx_np pointer in the netpoll_info structure points to the struct netpoll that has rx_hook filled in. Aside from this one case, there is no need for a pointer from the struct net_device to an individual struct netpoll. A lock is introduced to protect the setting and clearing of the np_rx pointer. The pointer will only be cleared upon netpoll client module removal, and the lock should be uncontested. Signed-off-by: Jeff Moyer Signed-off-by: David S. 
Miller --- include/linux/netpoll.h | 15 ++++++++++++--- net/core/netpoll.c | 39 +++++++++++++++++++++++++++++---------- 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 388cd91bc7a6..bcd0ac33f592 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -27,7 +27,8 @@ struct netpoll_info { spinlock_t poll_lock; int poll_owner; int rx_flags; - struct netpoll *np; + spinlock_t rx_lock; + struct netpoll *rx_np; /* netpoll that registered an rx_hook */ }; void netpoll_poll(struct netpoll *np); @@ -44,11 +45,19 @@ void netpoll_queue(struct sk_buff *skb); static inline int netpoll_rx(struct sk_buff *skb) { struct netpoll_info *npinfo = skb->dev->npinfo; + unsigned long flags; + int ret = 0; - if (!npinfo || !npinfo->rx_flags) + if (!npinfo || (!npinfo->rx_np && !npinfo->rx_flags)) return 0; - return npinfo->np && __netpoll_rx(skb); + spin_lock_irqsave(&npinfo->rx_lock, flags); + /* check rx_flags again with the lock held */ + if (npinfo->rx_flags && __netpoll_rx(skb)) + ret = 1; + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + + return ret; } static inline void netpoll_poll_lock(struct net_device *dev) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index ab3c0c9713b0..c327c9edadc5 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -349,11 +349,15 @@ static void arp_reply(struct sk_buff *skb) unsigned char *arp_ptr; int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; u32 sip, tip; + unsigned long flags; struct sk_buff *send_skb; struct netpoll *np = NULL; - if (npinfo) - np = npinfo->np; + spin_lock_irqsave(&npinfo->rx_lock, flags); + if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev) + np = npinfo->rx_np; + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + if (!np) return; @@ -436,9 +440,9 @@ int __netpoll_rx(struct sk_buff *skb) int proto, len, ulen; struct iphdr *iph; struct udphdr *uh; - struct netpoll *np = skb->dev->npinfo->np; + struct netpoll *np = skb->dev->npinfo->rx_np; - if (!np->rx_hook) + if (!np) goto out; if (skb->dev->type != ARPHRD_ETHER) goto out; @@ -619,6 +623,7 @@ int netpoll_setup(struct netpoll *np) struct net_device *ndev = NULL; struct in_device *in_dev; struct netpoll_info *npinfo; + unsigned long flags; if (np->dev_name) ndev = dev_get_by_name(np->dev_name); @@ -634,9 +639,10 @@ int netpoll_setup(struct netpoll *np) if (!npinfo) goto release; - npinfo->np = NULL; + npinfo->rx_np = NULL; npinfo->poll_lock = SPIN_LOCK_UNLOCKED; npinfo->poll_owner = -1; + npinfo->rx_lock = SPIN_LOCK_UNLOCKED; } else npinfo = ndev->npinfo; @@ -706,9 +712,13 @@ int netpoll_setup(struct netpoll *np) np->name, HIPQUAD(np->local_ip)); } - if(np->rx_hook) - npinfo->rx_flags = NETPOLL_RX_ENABLED; - npinfo->np = np; + if (np->rx_hook) { + spin_lock_irqsave(&npinfo->rx_lock, flags); + npinfo->rx_flags |= NETPOLL_RX_ENABLED; + npinfo->rx_np = np; + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + } + /* last thing to do is link it to the net device structure */ ndev->npinfo = npinfo; return 0; @@ -723,11 +733,20 @@ int netpoll_setup(struct netpoll *np) void netpoll_cleanup(struct netpoll *np) { + struct netpoll_info *npinfo; + unsigned long flags; + if (np->dev) { - if (np->dev->npinfo) - np->dev->npinfo->np = NULL; + npinfo = np->dev->npinfo; + if (npinfo && npinfo->rx_np == np) { + spin_lock_irqsave(&npinfo->rx_lock, flags); + npinfo->rx_np = NULL; + npinfo->rx_flags &= ~NETPOLL_RX_ENABLED; + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + } dev_put(np->dev); } + np->dev = NULL; } From 
7abaa27c1c54208bd76fa8bae55839c034aebfb2 Mon Sep 17 00:00:00 2001 From: Chuck Short Date: Wed, 22 Jun 2005 22:10:23 -0700 Subject: [PATCH 022/199] [IPV4]: Fix route.c gcc4 warnings Signed-off by: Chuck Short Signed-off-by: David S. Miller --- net/ipv4/route.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f4d53c919869..80cf633d9f4a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1767,7 +1767,7 @@ static inline int ip_mkroute_input_def(struct sk_buff *skb, struct in_device *in_dev, u32 daddr, u32 saddr, u32 tos) { - struct rtable* rth; + struct rtable* rth = NULL; int err; unsigned hash; @@ -1794,7 +1794,7 @@ static inline int ip_mkroute_input(struct sk_buff *skb, u32 daddr, u32 saddr, u32 tos) { #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED - struct rtable* rth; + struct rtable* rth = NULL; unsigned char hop, hopcount, lasthop; int err = -EINVAL; unsigned int hash; @@ -2239,7 +2239,7 @@ static inline int ip_mkroute_output_def(struct rtable **rp, struct net_device *dev_out, unsigned flags) { - struct rtable *rth; + struct rtable *rth = NULL; int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); unsigned hash; if (err == 0) { @@ -2267,7 +2267,7 @@ static inline int ip_mkroute_output(struct rtable** rp, unsigned char hop; unsigned hash; int err = -EINVAL; - struct rtable *rth; + struct rtable *rth = NULL; if (res->fi && res->fi->fib_nhs > 1) { unsigned char hopcount = res->fi->fib_nhs; From 285b3afefacff14bc98e5754b8b48a0a2b42f0df Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Wed, 22 Jun 2005 22:11:44 -0700 Subject: [PATCH 023/199] [ATALK] aarp: replace schedule_timeout() with msleep() From: Nishanth Aravamudan Use msleep() instead of schedule_timeout() to guarantee the task delays as expected. The current code is not wrong, but it does not account for early return due to signals, so I think msleep() should be appropriate. Signed-off-by: Nishanth Aravamudan Signed-off-by: Domen Puncer Signed-off-by: David S. Miller --- net/appletalk/aarp.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index 10d040461021..c34614ea5fce 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -462,8 +463,7 @@ void aarp_probe_network(struct atalk_iface *atif) aarp_send_probe(atif->dev, &atif->address); /* Defer 1/10th */ - current->state = TASK_INTERRUPTIBLE; - schedule_timeout(HZ / 10); + msleep(100); if (atif->status & ATIF_PROBE_FAIL) break; @@ -510,9 +510,8 @@ int aarp_proxy_probe_network(struct atalk_iface *atif, struct atalk_addr *sa) aarp_send_probe(atif->dev, sa); /* Defer 1/10th */ - current->state = TASK_INTERRUPTIBLE; write_unlock_bh(&aarp_lock); - schedule_timeout(HZ / 10); + msleep(100); write_lock_bh(&aarp_lock); if (entry->status & ATIF_PROBE_FAIL) From 68d318720052154bc6b2513b0f15d0d947cc53c9 Mon Sep 17 00:00:00 2001 From: James Lamanna Date: Wed, 22 Jun 2005 22:12:57 -0700 Subject: [PATCH 024/199] [EBTABLES]: vfree() checking cleanups From: jlamanna@gmail.com ebtables.c vfree() checking cleanups. Signed-off by: James Lamanna Signed-off-by: Domen Puncer Signed-off-by: David S. 
Miller --- net/bridge/netfilter/ebtables.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 18ebc664769b..c4540144f0f4 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -859,8 +859,7 @@ static int translate_table(struct ebt_replace *repl, if (repl->valid_hooks & (1 << i)) if (check_chainloops(newinfo->hook_entry[i], cl_s, udc_cnt, i, newinfo->entries)) { - if (cl_s) - vfree(cl_s); + vfree(cl_s); return -EINVAL; } @@ -883,8 +882,7 @@ static int translate_table(struct ebt_replace *repl, EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, ebt_cleanup_entry, &i); } - if (cl_s) - vfree(cl_s); + vfree(cl_s); return ret; } @@ -1030,8 +1028,7 @@ static int do_replace(void __user *user, unsigned int len) } vfree(table); - if (counterstmp) - vfree(counterstmp); + vfree(counterstmp); return ret; free_unlock: @@ -1040,8 +1037,7 @@ static int do_replace(void __user *user, unsigned int len) EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, ebt_cleanup_entry, NULL); free_counterstmp: - if (counterstmp) - vfree(counterstmp); + vfree(counterstmp); /* can be initialized in translate_table() */ if (newinfo->chainstack) { for (i = 0; i < num_possible_cpus(); i++) @@ -1049,11 +1045,9 @@ static int do_replace(void __user *user, unsigned int len) vfree(newinfo->chainstack); } free_entries: - if (newinfo->entries) - vfree(newinfo->entries); + vfree(newinfo->entries); free_newinfo: - if (newinfo) - vfree(newinfo); + vfree(newinfo); return ret; } @@ -1213,8 +1207,7 @@ void ebt_unregister_table(struct ebt_table *table) down(&ebt_mutex); LIST_DELETE(&ebt_tables, table); up(&ebt_mutex); - if (table->private->entries) - vfree(table->private->entries); + vfree(table->private->entries); if (table->private->chainstack) { for (i = 0; i < num_possible_cpus(); i++) vfree(table->private->chainstack[i]); From cb65d506c34c86df5bcef939ce5a8666a451bd8b Mon Sep 17 00:00:00 2001 From: Shaun Pereira Date: Wed, 22 Jun 2005 22:15:01 -0700 Subject: [PATCH 025/199] [X25]: Selective sub-address matching with call user data. From: Shaun Pereira This is the first (independent of the second) patch of two that I am working on with x25 on linux (tested with xot on a cisco router). Details are as follows. Current state of module: A server using the current implementation (2.6.11.7) of the x25 module will accept a call request/ incoming call packet at the listening x.25 address, from all callers to that address, as long as NO call user data is present in the packet header. If the server needs to choose to accept a particular call request/ incoming call packet arriving at its listening x25 address, then the kernel has to allow a match of call user data present in the call request packet with its own. This is required when multiple servers listen at the same x25 address and device interface. The kernel currently matches ALL call user data, if present. Current Changes: This patch is a follow up to the patch submitted previously by Andrew Hendry, and allows the user to selectively control the number of octets of call user data in the call request packet, that the kernel will match. By default no call user data is matched, even if call user data is present. To allow call user data matching, a cudmatchlength > 0 has to be passed into the kernel after which the passed number of octets will be matched. Otherwise the kernel behavior is exactly as the original implementation. 
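As an illustration of the new interface (a sketch based on the ioctl added by this patch, not code contained in it; assume sock is an already created and bound AF_X25 socket): a server that wants the kernel to compare the first four octets of incoming call user data sets the match length while the socket is still closed, i.e. before calling listen():

struct x25_subaddr sub_addr;

sub_addr.cudmatchlength = 4;	/* match first 4 octets of call user data */
if (ioctl(sock, SIOCX25SCUDMATCHLEN, &sub_addr) < 0)
	perror("SIOCX25SCUDMATCHLEN");	/* EINVAL if > X25_MAX_CUD_LEN */
listen(sock, 4);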
This patch also ensures that as is normally the case, no call user data will be present in the Call accepted / call connected packet sent back to the caller Future Changes on next patch: There are cases however when call user data may be present in the call accepted packet. According to the X.25 recommendation (ITU-T 10/96) section 5.2.3.2 call user data may be present in the call accepted packet provided the fast select facility is used. My next patch will include this fast select utility and the ability to send up to 128 octets call user data in the call accepted packet provided the fast select facility is used. I am currently testing this, again with xot on linux and cisco. Signed-off-by: Shaun Pereira (With a fix from Alexey Dobriyan ) Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- include/linux/x25.h | 10 +++++++ include/net/x25.h | 3 +- net/x25/af_x25.c | 73 +++++++++++++++++++++++++++++---------------- net/x25/x25_subr.c | 18 ----------- 4 files changed, 59 insertions(+), 45 deletions(-) diff --git a/include/linux/x25.h b/include/linux/x25.h index 7531cfed5885..6f43b3d20248 100644 --- a/include/linux/x25.h +++ b/include/linux/x25.h @@ -4,6 +4,8 @@ * History * mar/20/00 Daniela Squassoni Disabling/enabling of facilities * negotiation. + * apr/02/05 Shaun Pereira Selective sub address matching with + * call user data */ #ifndef X25_KERNEL_H @@ -16,6 +18,7 @@ #define SIOCX25GCALLUSERDATA (SIOCPROTOPRIVATE + 4) #define SIOCX25SCALLUSERDATA (SIOCPROTOPRIVATE + 5) #define SIOCX25GCAUSEDIAG (SIOCPROTOPRIVATE + 6) +#define SIOCX25SCUDMATCHLEN (SIOCPROTOPRIVATE + 7) /* * Values for {get,set}sockopt. @@ -109,4 +112,11 @@ struct x25_causediag { unsigned char diagnostic; }; +/* + * Further optional call user data match length selection + */ +struct x25_subaddr { + unsigned int cudmatchlength; +}; + #endif diff --git a/include/net/x25.h b/include/net/x25.h index 7a1ba5bbb868..9dd70dd4a9b7 100644 --- a/include/net/x25.h +++ b/include/net/x25.h @@ -134,7 +134,7 @@ struct x25_sock { struct sock sk; struct x25_address source_addr, dest_addr; struct x25_neigh *neighbour; - unsigned int lci; + unsigned int lci, cudmatchlength; unsigned char state, condition, qbitincl, intflag; unsigned short vs, vr, va, vl; unsigned long t2, t21, t22, t23; @@ -242,7 +242,6 @@ extern int x25_validate_nr(struct sock *, unsigned short); extern void x25_write_internal(struct sock *, int); extern int x25_decode(struct sock *, struct sk_buff *, int *, int *, int *, int *, int *); extern void x25_disconnect(struct sock *, int, unsigned char, unsigned char); -extern int x25_check_calluserdata(struct x25_calluserdata *,struct x25_calluserdata *); /* x25_timer.c */ extern void x25_start_heartbeat(struct sock *); diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 2a24b243b841..e17d84a55d5e 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -29,6 +29,8 @@ * 2000-11-14 Henner Eisen Closing datalink from NETDEV_GOING_DOWN * 2002-10-06 Arnaldo C. Melo Get rid of cli/sti, move proc stuff to * x25_proc.c, using seq_file + * 2005-04-02 Shaun Pereira Selective sub address matching + * with call user data */ #include @@ -219,7 +221,8 @@ static void x25_insert_socket(struct sock *sk) * Note: if a listening socket has cud set it must only get calls * with matching cud. 
*/ -static struct sock *x25_find_listener(struct x25_address *addr, struct x25_calluserdata *calluserdata) +static struct sock *x25_find_listener(struct x25_address *addr, + struct sk_buff *skb) { struct sock *s; struct sock *next_best; @@ -230,22 +233,23 @@ static struct sock *x25_find_listener(struct x25_address *addr, struct x25_callu sk_for_each(s, node, &x25_list) if ((!strcmp(addr->x25_addr, - x25_sk(s)->source_addr.x25_addr) || - !strcmp(addr->x25_addr, - null_x25_address.x25_addr)) && - s->sk_state == TCP_LISTEN) { - + x25_sk(s)->source_addr.x25_addr) || + !strcmp(addr->x25_addr, + null_x25_address.x25_addr)) && + s->sk_state == TCP_LISTEN) { /* * Found a listening socket, now check the incoming * call user data vs this sockets call user data */ - if (x25_check_calluserdata(&x25_sk(s)->calluserdata, calluserdata)) { - sock_hold(s); - goto found; - } - if (x25_sk(s)->calluserdata.cudlength == 0) { + if(skb->len > 0 && x25_sk(s)->cudmatchlength > 0) { + if((memcmp(x25_sk(s)->calluserdata.cuddata, + skb->data, + x25_sk(s)->cudmatchlength)) == 0) { + sock_hold(s); + goto found; + } + } else next_best = s; - } } if (next_best) { s = next_best; @@ -497,6 +501,7 @@ static int x25_create(struct socket *sock, int protocol) x25->t23 = sysctl_x25_clear_request_timeout; x25->t2 = sysctl_x25_ack_holdback_timeout; x25->state = X25_STATE_0; + x25->cudmatchlength = 0; x25->facilities.winsize_in = X25_DEFAULT_WINDOW_SIZE; x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE; @@ -545,6 +550,7 @@ static struct sock *x25_make_new(struct sock *osk) x25->t2 = ox25->t2; x25->facilities = ox25->facilities; x25->qbitincl = ox25->qbitincl; + x25->cudmatchlength = ox25->cudmatchlength; x25_init_timers(sk); out: @@ -822,7 +828,6 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, struct x25_sock *makex25; struct x25_address source_addr, dest_addr; struct x25_facilities facilities; - struct x25_calluserdata calluserdata; int len, rc; /* @@ -844,20 +849,11 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, len = skb->data[0] + 1; skb_pull(skb,len); - /* - * Incoming Call User Data. - */ - if (skb->len >= 0) { - memcpy(calluserdata.cuddata, skb->data, skb->len); - calluserdata.cudlength = skb->len; - } - - skb_push(skb,len); - /* * Find a listener for the particular address/cud pair. */ - sk = x25_find_listener(&source_addr,&calluserdata); + sk = x25_find_listener(&source_addr,skb); + skb_push(skb,len); /* * We can't accept the Call Request. @@ -900,12 +896,22 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, makex25->neighbour = nb; makex25->facilities = facilities; makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask; - makex25->calluserdata = calluserdata; + /* ensure no reverse facil on accept */ + makex25->vc_facil_mask &= ~X25_MASK_REVERSE; + makex25->cudmatchlength = x25_sk(sk)->cudmatchlength; x25_write_internal(make, X25_CALL_ACCEPTED); makex25->state = X25_STATE_3; + /* + * Incoming Call User Data. 
+ */ + if (skb->len >= 0) { + memcpy(makex25->calluserdata.cuddata, skb->data, skb->len); + makex25->calluserdata.cudlength = skb->len; + } + sk->sk_ack_backlog++; x25_insert_socket(make); @@ -1325,6 +1331,23 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; } + case SIOCX25SCUDMATCHLEN: { + struct x25_subaddr sub_addr; + rc = -EINVAL; + if(sk->sk_state != TCP_CLOSE) + break; + rc = -EFAULT; + if (copy_from_user(&sub_addr, argp, + sizeof(sub_addr))) + break; + rc = -EINVAL; + if(sub_addr.cudmatchlength > X25_MAX_CUD_LEN) + break; + x25->cudmatchlength = sub_addr.cudmatchlength; + rc = 0; + break; + } + default: rc = dev_ioctl(cmd, argp); break; diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index 183fea3bba67..c349bbd61684 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -354,21 +354,3 @@ void x25_check_rbuf(struct sock *sk) } } -/* - * Compare 2 calluserdata structures, used to find correct listening sockets - * when call user data is used. - */ -int x25_check_calluserdata(struct x25_calluserdata *ours, struct x25_calluserdata *theirs) -{ - int i; - if (ours->cudlength != theirs->cudlength) - return 0; - - for (i=0;i<ours->cudlength;i++) { - if (ours->cuddata[i] != theirs->cuddata[i]) { - return 0; - } - } - return 1; -} - From ebc3f64b864fc16a594c2e63bf55a55c7d42084b Mon Sep 17 00:00:00 2001 From: Shaun Pereira Date: Wed, 22 Jun 2005 22:16:17 -0700 Subject: [PATCH 026/199] [X25]: Fast select with no restriction on response This patch is a follow up to patch 1 regarding "Selective Sub Address matching with call user data". It allows use of the Fast-Select-Acceptance optional user facility for X.25. This patch just implements fast select with no restriction on response (NRR). What this means (according to ITU-T Recommendation 10/96 section 6.16) is that if, in an incoming call packet, the relevant facility bits are set for fast-select-NRR, then the called DTE can issue a direct response to the incoming packet using a call-accepted packet that contains call-user-data. This patch allows such a response. The called DTE can also respond with a clear-request packet that contains call-user-data. However, this feature is currently not implemented by the patch. How is Fast Select Acceptance used? By default, the system does not allow fast select acceptance (as before). To enable a response to fast select acceptance, after a listen socket is created and bound as follows: socket(AF_X25, SOCK_SEQPACKET, 0); bind(call_soc, (struct sockaddr *)&locl_addr, sizeof(locl_addr)); but before a listen system call is made, the following ioctl should be used: ioctl(call_soc,SIOCX25CALLACCPTAPPRV); Now the listen system call can be made: listen(call_soc, 4); After this, an incoming-call packet will be accepted, but no call-accepted packet will be sent back until the following system call is made on the socket that accepts the call: ioctl(vc_soc,SIOCX25SENDCALLACCPT); The network (or cisco xot router used for testing here) will allow the application server's call-user-data in the call-accepted packet, provided the call-request was made with Fast-select NRR. Signed-off-by: Shaun Pereira Signed-off-by: Andrew Morton Signed-off-by: David S.
Miller --- include/linux/x25.h | 2 ++ include/net/x25.h | 6 ++++-- net/x25/af_x25.c | 37 +++++++++++++++++++++++++++++++++---- net/x25/x25_facilities.c | 34 +++++++++++++++++++++++++++++----- net/x25/x25_subr.c | 23 ++++++++++++++++++----- 5 files changed, 86 insertions(+), 16 deletions(-) diff --git a/include/linux/x25.h b/include/linux/x25.h index 6f43b3d20248..16d44931afa0 100644 --- a/include/linux/x25.h +++ b/include/linux/x25.h @@ -19,6 +19,8 @@ #define SIOCX25SCALLUSERDATA (SIOCPROTOPRIVATE + 5) #define SIOCX25GCAUSEDIAG (SIOCPROTOPRIVATE + 6) #define SIOCX25SCUDMATCHLEN (SIOCPROTOPRIVATE + 7) +#define SIOCX25CALLACCPTAPPRV (SIOCPROTOPRIVATE + 8) +#define SIOCX25SENDCALLACCPT (SIOCPROTOPRIVATE + 9) /* * Values for {get,set}sockopt. diff --git a/include/net/x25.h b/include/net/x25.h index 9dd70dd4a9b7..8b39b98876e8 100644 --- a/include/net/x25.h +++ b/include/net/x25.h @@ -79,6 +79,8 @@ enum { #define X25_DEFAULT_PACKET_SIZE X25_PS128 /* Default Packet Size */ #define X25_DEFAULT_THROUGHPUT 0x0A /* Deafult Throughput */ #define X25_DEFAULT_REVERSE 0x00 /* Default Reverse Charging */ +#define X25_DENY_ACCPT_APPRV 0x01 /* Default value */ +#define X25_ALLOW_ACCPT_APPRV 0x00 /* Control enabled */ #define X25_SMODULUS 8 #define X25_EMODULUS 128 @@ -94,7 +96,7 @@ enum { #define X25_FAC_CLASS_C 0x80 #define X25_FAC_CLASS_D 0xC0 -#define X25_FAC_REVERSE 0x01 +#define X25_FAC_REVERSE 0x01 /* also fast select */ #define X25_FAC_THROUGHPUT 0x02 #define X25_FAC_PACKET_SIZE 0x42 #define X25_FAC_WINDOW_SIZE 0x43 @@ -135,7 +137,7 @@ struct x25_sock { struct x25_address source_addr, dest_addr; struct x25_neigh *neighbour; unsigned int lci, cudmatchlength; - unsigned char state, condition, qbitincl, intflag; + unsigned char state, condition, qbitincl, intflag, accptapprv; unsigned short vs, vr, va, vl; unsigned long t2, t21, t22, t23; unsigned short fraglen; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index e17d84a55d5e..04bec047fa9a 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -31,6 +31,8 @@ * x25_proc.c, using seq_file * 2005-04-02 Shaun Pereira Selective sub address matching * with call user data + * 2005-04-15 Shaun Pereira Fast select with no restriction on + * response */ #include @@ -502,6 +504,8 @@ static int x25_create(struct socket *sock, int protocol) x25->t2 = sysctl_x25_ack_holdback_timeout; x25->state = X25_STATE_0; x25->cudmatchlength = 0; + x25->accptapprv = X25_DENY_ACCPT_APPRV; /* normally no cud */ + /* on call accept */ x25->facilities.winsize_in = X25_DEFAULT_WINDOW_SIZE; x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE; @@ -551,6 +555,7 @@ static struct sock *x25_make_new(struct sock *osk) x25->facilities = ox25->facilities; x25->qbitincl = ox25->qbitincl; x25->cudmatchlength = ox25->cudmatchlength; + x25->accptapprv = ox25->accptapprv; x25_init_timers(sk); out: @@ -900,9 +905,11 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, makex25->vc_facil_mask &= ~X25_MASK_REVERSE; makex25->cudmatchlength = x25_sk(sk)->cudmatchlength; - x25_write_internal(make, X25_CALL_ACCEPTED); - - makex25->state = X25_STATE_3; + /* Normally all calls are accepted immediatly */ + if(makex25->accptapprv & X25_DENY_ACCPT_APPRV) { + x25_write_internal(make, X25_CALL_ACCEPTED); + makex25->state = X25_STATE_3; + } /* * Incoming Call User Data. 
@@ -1294,7 +1301,8 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) if (facilities.throughput < 0x03 || facilities.throughput > 0xDD) break; - if (facilities.reverse && facilities.reverse != 1) + if (facilities.reverse && + (facilities.reverse | 0x81)!= 0x81) break; x25->facilities = facilities; rc = 0; @@ -1348,6 +1356,27 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; } + case SIOCX25CALLACCPTAPPRV: { + rc = -EINVAL; + if (sk->sk_state != TCP_CLOSE) + break; + x25->accptapprv = X25_ALLOW_ACCPT_APPRV; + rc = 0; + break; + } + + case SIOCX25SENDCALLACCPT: { + rc = -EINVAL; + if (sk->sk_state != TCP_ESTABLISHED) + break; + if (x25->accptapprv) /* must call accptapprv above */ + break; + x25_write_internal(sk, X25_CALL_ACCEPTED); + x25->state = X25_STATE_3; + rc = 0; + break; + } + default: rc = dev_ioctl(cmd, argp); break; diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c index a21bdb95f9a8..54278b962f4c 100644 --- a/net/x25/x25_facilities.c +++ b/net/x25/x25_facilities.c @@ -17,6 +17,8 @@ * X.25 001 Split from x25_subr.c * mar/20/00 Daniela Squassoni Disabling/enabling of facilities * negotiation. + * apr/14/05 Shaun Pereira - Allow fast select with no restriction + * on response. */ #include @@ -43,9 +45,31 @@ int x25_parse_facilities(struct sk_buff *skb, case X25_FAC_CLASS_A: switch (*p) { case X25_FAC_REVERSE: - facilities->reverse = p[1] & 0x01; - *vc_fac_mask |= X25_MASK_REVERSE; - break; + if((p[1] & 0x81) == 0x81) { + facilities->reverse = p[1] & 0x81; + *vc_fac_mask |= X25_MASK_REVERSE; + break; + } + + if((p[1] & 0x01) == 0x01) { + facilities->reverse = p[1] & 0x01; + *vc_fac_mask |= X25_MASK_REVERSE; + break; + } + + if((p[1] & 0x80) == 0x80) { + facilities->reverse = p[1] & 0x80; + *vc_fac_mask |= X25_MASK_REVERSE; + break; + } + + if(p[1] == 0x00) { + facilities->reverse + = X25_DEFAULT_REVERSE; + *vc_fac_mask |= X25_MASK_REVERSE; + break; + } + case X25_FAC_THROUGHPUT: facilities->throughput = p[1]; *vc_fac_mask |= X25_MASK_THROUGHPUT; @@ -122,7 +146,7 @@ int x25_create_facilities(unsigned char *buffer, if (facilities->reverse && (facil_mask & X25_MASK_REVERSE)) { *p++ = X25_FAC_REVERSE; - *p++ = !!facilities->reverse; + *p++ = facilities->reverse; } if (facilities->throughput && (facil_mask & X25_MASK_THROUGHPUT)) { @@ -171,7 +195,7 @@ int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk, /* * They want reverse charging, we won't accept it. */ - if (theirs.reverse && ours->reverse) { + if ((theirs.reverse & 0x01 ) && (ours->reverse & 0x01)) { SOCK_DEBUG(sk, "X.25: rejecting reverse charging request"); return -1; } diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index c349bbd61684..7fd872ad0c20 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -19,6 +19,8 @@ * mar/20/00 Daniela Squassoni Disabling/enabling of facilities * negotiation. * jun/24/01 Arnaldo C. Melo use skb_queue_purge, cleanups + * apr/04/15 Shaun Pereira Fast select with no + * restriction on response. 
*/ #include @@ -127,8 +129,12 @@ void x25_write_internal(struct sock *sk, int frametype) len += 1 + X25_ADDR_LEN + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN; break; - case X25_CALL_ACCEPTED: - len += 1 + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN; + case X25_CALL_ACCEPTED: /* fast sel with no restr on resp */ + if(x25->facilities.reverse & 0x80) { + len += 1 + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN; + } else { + len += 1 + X25_MAX_FAC_LEN; + } break; case X25_CLEAR_REQUEST: case X25_RESET_REQUEST: @@ -203,9 +209,16 @@ void x25_write_internal(struct sock *sk, int frametype) x25->vc_facil_mask); dptr = skb_put(skb, len); memcpy(dptr, facilities, len); - dptr = skb_put(skb, x25->calluserdata.cudlength); - memcpy(dptr, x25->calluserdata.cuddata, - x25->calluserdata.cudlength); + + /* fast select with no restriction on response + allows call user data. Userland must + ensure it is ours and not theirs */ + if(x25->facilities.reverse & 0x80) { + dptr = skb_put(skb, + x25->calluserdata.cudlength); + memcpy(dptr, x25->calluserdata.cuddata, + x25->calluserdata.cudlength); + } x25->calluserdata.cudlength = 0; break; From 2c4ee8f907fc4a3c69273a958f853bf4b358eb49 Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Wed, 22 Jun 2005 22:19:52 -0700 Subject: [PATCH 027/199] [LTPC]: Replace schedule_timeout() with ssleep()/msleep() Use ssleep() / msleep() [as appropriate] instead of schedule_timeout() to guarantee the task delays as expected. Signed-off-by: Nishanth Aravamudan Acked-by: Arnaldo Carvalho de Melo Signed-off-by: Maximilian Attems Signed-off-by: Domen Puncer Signed-off-by: David S. Miller --- drivers/net/appletalk/ltpc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/appletalk/ltpc.c b/drivers/net/appletalk/ltpc.c index db4f369637b6..d5666c37cb0d 100644 --- a/drivers/net/appletalk/ltpc.c +++ b/drivers/net/appletalk/ltpc.c @@ -1109,8 +1109,7 @@ struct net_device * __init ltpc_probe(void) inb_p(io+1); inb_p(io+3); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(2*HZ/100); + msleep(20); inb_p(io+0); inb_p(io+2); @@ -1120,8 +1119,7 @@ struct net_device * __init ltpc_probe(void) inb_p(io+5); /* enable dma */ inb_p(io+6); /* tri-state interrupt line */ - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); + ssleep(1); /* now, figure out which dma channel we're using, unless it's already been specified */ From 479f6ea85e513551510ad52f37e69e1c596ad356 Mon Sep 17 00:00:00 2001 From: Stelian Pop Date: Wed, 22 Jun 2005 17:53:28 +0200 Subject: [PATCH 028/199] [PATCH] USB: fix hid core to return proper error code from probe Drivers need to return -ENODEV when they can't bind to a device. Anything else stops the "bind a device to a driver" search. 
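To make the convention concrete, here is a minimal, hypothetical probe sketch; it is not code from this patch, and example_probe(), example_device_is_supported() and struct example_state are illustrative names:

static int example_probe(struct usb_interface *intf,
			 const struct usb_device_id *id)
{
	struct example_state *state;

	/* Not a device we handle: returning -ENODEV lets the core keep
	 * searching for another driver instead of aborting the search. */
	if (!example_device_is_supported(intf))
		return -ENODEV;

	/* A genuine failure is different: any other error code stops
	 * the "bind a device to a driver" search for this device. */
	state = kmalloc(sizeof(*state), GFP_KERNEL);
	if (!state)
		return -ENOMEM;

	usb_set_intfdata(intf, state);
	return 0;
}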
From: Stelian Pop Signed-off-by: Greg Kroah-Hartman --- drivers/usb/input/hid-core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/input/hid-core.c b/drivers/usb/input/hid-core.c index 2d8bd9dcc6ed..740dec1f521d 100644 --- a/drivers/usb/input/hid-core.c +++ b/drivers/usb/input/hid-core.c @@ -1762,7 +1762,7 @@ static int hid_probe(struct usb_interface *intf, const struct usb_device_id *id) intf->altsetting->desc.bInterfaceNumber); if (!(hid = usb_hid_configure(intf))) - return -EIO; + return -ENODEV; hid_init_reports(hid); hid_dump_device(hid); @@ -1777,7 +1777,7 @@ static int hid_probe(struct usb_interface *intf, const struct usb_device_id *id) if (!hid->claimed) { printk ("HID device not claimed by input or hiddev\n"); hid_disconnect(intf); - return -EIO; + return -ENODEV; } printk(KERN_INFO); From d377e85b537a5e166272f937da6ba84350676b6e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 22 Jun 2005 16:09:05 -0700 Subject: [PATCH 029/199] [PATCH] driver core: Fix up the device_attach() error handling in bus_add_device() Don't error out if something "bad" happens when trying to bind a driver to a device. We want the sysfs attributes to be present for later when we try to tear down the device. Signed-off-by: Greg Kroah-Hartman --- drivers/base/bus.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 43722af90bdd..c3fac7fd555e 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -270,10 +270,9 @@ int bus_add_device(struct device * dev) if (bus) { pr_debug("bus %s: add device %s\n", bus->name, dev->bus_id); - error = device_attach(dev); + device_attach(dev); klist_add_tail(&bus->klist_devices, &dev->knode_bus); - if (error >= 0) - error = device_add_attrs(bus, dev); + error = device_add_attrs(bus, dev); if (!error) { sysfs_create_link(&bus->devices.kobj, &dev->kobj, dev->bus_id); sysfs_create_link(&dev->kobj, &dev->bus->subsys.kset.kobj, "bus"); From 8f586b2243198194240626fd9695da5564ffa7ee Mon Sep 17 00:00:00 2001 From: Mike Strosaker Date: Thu, 23 Jun 2005 16:09:41 +1000 Subject: [PATCH 030/199] [PATCH] correct printing to operator panel This patch corrects the printing of progress indicators to the op panel on p/iSeries ppc64 systems. Each discrete reference code should begin with a form feed char to clear the op panel, and the first and second lines should be separated with a CR/LF sequence. Padding with spaces is not necessary. Also, capitalize the hex value printed on the first line, to be consistent with the values printed by firmware, service processor, etc. It turns out that there's an ibm,form-feed property; this patch uses it in the pSeries-specific progress routine. This patch also checks the number of rows and the specific width of each row (the second row on power5 systems can actually hold 80 characters). If the displayed text is too wide for the physical display, it can be viewed in the ASM menus, or by selecting option 14 on the op panel. 
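Condensed into a sketch, the caller-side effect of these conventions looks like the following (this mirrors the setup.c hunk further below; the helper name and buffer size are illustrative):

static void example_progress_msg(unsigned int src, const char *msg)
{
	char buf[128];

	/* Row 1: the reference code in capitalized hex, terminated by a
	 * newline.  The progress routine clears the panel first, using a
	 * form feed when the ibm,form-feed property exists, or a CR
	 * otherwise. */
	sprintf(buf, "%08X\n", src);
	ppc_md.progress(buf, 0);

	/* Row 2: the message text, with no space padding.  Text wider
	 * than the physical row is limited by the per-row widths from
	 * ibm,display-truncation-length. */
	snprintf(buf, sizeof(buf), "%s", msg);
	ppc_md.progress(buf, 0);
}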
Signed-off-by: Mike Strosaker Signed-off-by: Paul Mackerras --- arch/ppc64/kernel/rtas.c | 59 ++++++++++++++++++++++++--------------- arch/ppc64/kernel/setup.c | 6 ++-- 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/arch/ppc64/kernel/rtas.c b/arch/ppc64/kernel/rtas.c index 43e1518653d5..5e8eb33b8e54 100644 --- a/arch/ppc64/kernel/rtas.c +++ b/arch/ppc64/kernel/rtas.c @@ -98,21 +98,29 @@ rtas_progress(char *s, unsigned short hex) int width, *p; char *os; static int display_character, set_indicator; - static int max_width; + static int display_width, display_lines, *row_width, form_feed; static DEFINE_SPINLOCK(progress_lock); + static int current_line; static int pending_newline = 0; /* did last write end with unprinted newline? */ if (!rtas.base) return; - if (max_width == 0) { - if ((root = find_path_device("/rtas")) && - (p = (unsigned int *)get_property(root, - "ibm,display-line-length", - NULL))) - max_width = *p; - else - max_width = 0x10; + if (display_width == 0) { + display_width = 0x10; + if ((root = find_path_device("/rtas"))) { + if ((p = (unsigned int *)get_property(root, + "ibm,display-line-length", NULL))) + display_width = *p; + if ((p = (unsigned int *)get_property(root, + "ibm,form-feed", NULL))) + form_feed = *p; + if ((p = (unsigned int *)get_property(root, + "ibm,display-number-of-lines", NULL))) + display_lines = *p; + row_width = (unsigned int *)get_property(root, + "ibm,display-truncation-length", NULL); + } display_character = rtas_token("display-character"); set_indicator = rtas_token("set-indicator"); } @@ -131,31 +139,39 @@ rtas_progress(char *s, unsigned short hex) * it would just clear the bottom line of output. Print it now * instead. * - * If no newline is pending, print a CR to start output at the - * beginning of the line. + * If no newline is pending and form feed is supported, clear the + * display with a form feed; otherwise, print a CR to start output + * at the beginning of the line. */ if (pending_newline) { rtas_call(display_character, 1, 1, NULL, '\r'); rtas_call(display_character, 1, 1, NULL, '\n'); pending_newline = 0; } else { - rtas_call(display_character, 1, 1, NULL, '\r'); + current_line = 0; + if (form_feed) + rtas_call(display_character, 1, 1, NULL, + (char)form_feed); + else + rtas_call(display_character, 1, 1, NULL, '\r'); } - width = max_width; + if (row_width) + width = row_width[current_line]; + else + width = display_width; os = s; while (*os) { if (*os == '\n' || *os == '\r') { - /* Blank to end of line. */ - while (width-- > 0) - rtas_call(display_character, 1, 1, NULL, ' '); - /* If newline is the last character, save it * until next call to avoid bumping up the * display output. */ if (*os == '\n' && !os[1]) { pending_newline = 1; + current_line++; + if (current_line > display_lines-1) + current_line = display_lines-1; spin_unlock(&progress_lock); return; } @@ -172,7 +188,10 @@ rtas_progress(char *s, unsigned short hex) rtas_call(display_character, 1, 1, NULL, *os); } - width = max_width; + if (row_width) + width = row_width[current_line]; + else + width = display_width; } else { width--; rtas_call(display_character, 1, 1, NULL, *os); @@ -186,10 +205,6 @@ rtas_progress(char *s, unsigned short hex) os++; } - /* Blank to end of line. 
*/ - while (width-- > 0) - rtas_call(display_character, 1, 1, NULL, ' '); - spin_unlock(&progress_lock); } diff --git a/arch/ppc64/kernel/setup.c b/arch/ppc64/kernel/setup.c index 10222008fe20..8a1ca695f8a7 100644 --- a/arch/ppc64/kernel/setup.c +++ b/arch/ppc64/kernel/setup.c @@ -1080,11 +1080,11 @@ void __init setup_arch(char **cmdline_p) static void ppc64_do_msg(unsigned int src, const char *msg) { if (ppc_md.progress) { - char buf[32]; + char buf[128]; - sprintf(buf, "%08x \n", src); + sprintf(buf, "%08X\n", src); ppc_md.progress(buf, 0); - sprintf(buf, "%-16s", msg); + snprintf(buf, 128, "%s", msg); ppc_md.progress(buf, 0); } } From dad32bbf43b496bcd32a83f73a1e7fd0a02cfd3e Mon Sep 17 00:00:00 2001 From: John Rose Date: Thu, 23 Jun 2005 17:09:54 +1000 Subject: [PATCH 031/199] [PATCH] pSeries - read irqs dynamically For I/O DLPAR to work properly, the kernel needs to allow for dynamic assignment of the irq field of the pci_dev structure upon dynamic bus addition. This patch moves the assignment of that field from pSeries_final_fixup() to pcibios_fixup_bus(), which enables dynamic assignment for the children of a newly added bus. Currently, pci_devs receive their irq numbers in one of two ways. The irq line is either read at boot for all pci_devs, or read by the rpaphp module at slot enable time. The latter is no longer sufficient for DLPAR addition of slots that don't qualify as PCI-hotplug capable. This solution handles the cases of boot and dynamic add. Signed-off-by: John Rose Signed-off-by: Paul Mackerras --- arch/ppc64/kernel/pSeries_pci.c | 35 ++++++++++++++++++------------- arch/ppc64/kernel/pSeries_setup.c | 3 +-- arch/ppc64/kernel/pci.c | 3 +++ arch/ppc64/kernel/pci.h | 6 +++++- include/asm-ppc64/machdep.h | 1 + 5 files changed, 30 insertions(+), 18 deletions(-) diff --git a/arch/ppc64/kernel/pSeries_pci.c b/arch/ppc64/kernel/pSeries_pci.c index dfa6d3d3e9f0..1f5f141fb7a1 100644 --- a/arch/ppc64/kernel/pSeries_pci.c +++ b/arch/ppc64/kernel/pSeries_pci.c @@ -32,7 +32,7 @@ #include "pci.h" -static int __initdata s7a_workaround; +static int __initdata s7a_workaround = -1; #if 0 void pcibios_name_device(struct pci_dev *dev) @@ -65,6 +65,7 @@ static void __init check_s7a(void) struct device_node *root; char *model; + s7a_workaround = 0; root = of_find_node_by_path("/"); if (root) { model = get_property(root, "model", NULL); @@ -74,6 +75,24 @@ static void __init check_s7a(void) } } +void __devinit pSeries_irq_bus_setup(struct pci_bus *bus) +{ + struct pci_dev *dev; + + if (s7a_workaround < 0) + check_s7a(); + list_for_each_entry(dev, &bus->devices, bus_list) { + pci_read_irq_line(dev); + if (s7a_workaround) { + if (dev->irq > 16) { + dev->irq -= 3; + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, + dev->irq); + } + } + } +} + static void __init pSeries_request_regions(void) { if (!isa_io_base) @@ -89,20 +108,6 @@ static void __init pSeries_request_regions(void) void __init pSeries_final_fixup(void) { - struct pci_dev *dev = NULL; - - check_s7a(); - - for_each_pci_dev(dev) { - pci_read_irq_line(dev); - if (s7a_workaround) { - if (dev->irq > 16) { - dev->irq -= 3; - pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq); - } - } - } - phbs_remap_io(); pSeries_request_regions(); diff --git a/arch/ppc64/kernel/pSeries_setup.c b/arch/ppc64/kernel/pSeries_setup.c index 41e6de2c9158..f2b41243342c 100644 --- a/arch/ppc64/kernel/pSeries_setup.c +++ b/arch/ppc64/kernel/pSeries_setup.c @@ -71,8 +71,6 @@ #define DBG(fmt...) 
#endif -extern void pSeries_final_fixup(void); extern void find_udbg_vterm(void); extern void system_reset_fwnmi(void); /* from head.S */ extern void machine_check_fwnmi(void); /* from head.S */ @@ -425,6 +423,7 @@ struct machdep_calls __initdata pSeries_md = { .get_cpuinfo = pSeries_get_cpuinfo, .log_error = pSeries_log_error, .pcibios_fixup = pSeries_final_fixup, + .irq_bus_setup = pSeries_irq_bus_setup, .restart = rtas_restart, .power_off = rtas_power_off, .halt = rtas_halt, diff --git a/arch/ppc64/kernel/pci.c b/arch/ppc64/kernel/pci.c index 2bf0513f3eca..580676f87d23 100644 --- a/arch/ppc64/kernel/pci.c +++ b/arch/ppc64/kernel/pci.c @@ -902,6 +902,9 @@ void __devinit pcibios_fixup_bus(struct pci_bus *bus) list_for_each_entry(dev, &bus->devices, bus_list) ppc_md.iommu_dev_setup(dev); + if (ppc_md.irq_bus_setup) + ppc_md.irq_bus_setup(bus); + if (!pci_probe_only) return; diff --git a/arch/ppc64/kernel/pci.h b/arch/ppc64/kernel/pci.h index 0fd7d849aa77..26be78b13af1 100644 --- a/arch/ppc64/kernel/pci.h +++ b/arch/ppc64/kernel/pci.h @@ -40,10 +40,14 @@ struct device_node *fetch_dev_dn(struct pci_dev *dev); void pci_addr_cache_insert_device(struct pci_dev *dev); void pci_addr_cache_remove_device(struct pci_dev *dev); -/* From pSeries_pci.h */ +/* From rtas_pci.h */ void init_pci_config_tokens (void); unsigned long get_phb_buid (struct device_node *); +/* From pSeries_pci.h */ +extern void pSeries_final_fixup(void); +extern void pSeries_irq_bus_setup(struct pci_bus *bus); + extern unsigned long pci_probe_only; extern unsigned long pci_assign_all_buses; extern int pci_read_irq_line(struct pci_dev *pci_dev); diff --git a/include/asm-ppc64/machdep.h b/include/asm-ppc64/machdep.h index 5d3cd9d042e2..553b2ea23bed 100644 --- a/include/asm-ppc64/machdep.h +++ b/include/asm-ppc64/machdep.h @@ -76,6 +76,7 @@ struct machdep_calls { void (*tce_flush)(struct iommu_table *tbl); void (*iommu_dev_setup)(struct pci_dev *dev); void (*iommu_bus_setup)(struct pci_bus *bus); + void (*irq_bus_setup)(struct pci_bus *bus); int (*probe)(int platform); void (*setup_arch)(void); From d7152fe14cad075d6dd4ee4194acd131aed0244e Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 23 Jun 2005 17:14:39 +1000 Subject: [PATCH 032/199] [PATCH] Maple powerdown patch Currently reset and powerdown are not implemented on the Maple board, and attempting to do so will (incorrectly) simply return. This implements the proper communication with the service processor, allowing correct reset and powerdown on the Maple board. If somehow it's unable to communicate with the service processor, it will loop forever instead. Note that powerdown on the Maple will power down the CPUs, but not the fans or other board components, due to hardware and firmware limitations.
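Stripped of its error reporting, the mechanism shared by both paths reduces to a sketch like this (the full versions, with NULL checks and messages, are in the maple_setup.c hunks below; the helper name is illustrative):

static void example_sp_command(const char *addr_prop, const char *val_prop)
{
	struct device_node *np;
	unsigned int base, offset, command;

	/* The AMD8111 NVRAM holds the mailbox the service processor
	 * watches; its base address comes from the device tree. */
	np = find_compatible_devices("nvram", "AMD8111");
	base = np->addrs[0].address;

	/* The service-processor node supplies the offset to poke and
	 * the command value to write (restart-addr/restart-value or
	 * power-off-addr/power-off-value). */
	np = find_devices("service-processor");
	offset = *(unsigned int *) get_property(np, addr_prop, NULL);
	command = *(unsigned int *) get_property(np, val_prop, NULL);

	/* Hand the command to the service processor, then spin until
	 * it resets or powers off the machine. */
	outb_p(command, base + offset);
	for (;;)
		;
}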
Signed-off-by: David Gibson Signed-off-by: Frank Rowand Signed-off-by: Paul Mackerras --- arch/ppc64/kernel/maple_setup.c | 62 ++++++++++++++++++++++++++++++++- arch/ppc64/kernel/setup.c | 18 ++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) diff --git a/arch/ppc64/kernel/maple_setup.c b/arch/ppc64/kernel/maple_setup.c index 3dabedb8553b..da8900b51f40 100644 --- a/arch/ppc64/kernel/maple_setup.c +++ b/arch/ppc64/kernel/maple_setup.c @@ -78,17 +78,77 @@ extern int maple_pci_get_legacy_ide_irq(struct pci_dev *dev, int channel); extern void generic_find_legacy_serial_ports(u64 *physport, unsigned int *default_speed); - static void maple_restart(char *cmd) { + unsigned int maple_nvram_base; + unsigned int maple_nvram_offset; + unsigned int maple_nvram_command; + struct device_node *rtcs; + + /* find NVRAM device */ + rtcs = find_compatible_devices("nvram", "AMD8111"); + if (rtcs && rtcs->addrs) { + maple_nvram_base = rtcs->addrs[0].address; + } else { + printk(KERN_EMERG "Maple: Unable to find NVRAM\n"); + printk(KERN_EMERG "Maple: Manual Restart Required\n"); + return; + } + + /* find service processor device */ + rtcs = find_devices("service-processor"); + if (!rtcs) { + printk(KERN_EMERG "Maple: Unable to find Service Processor\n"); + printk(KERN_EMERG "Maple: Manual Restart Required\n"); + return; + } + maple_nvram_offset = *(unsigned int*) get_property(rtcs, + "restart-addr", NULL); + maple_nvram_command = *(unsigned int*) get_property(rtcs, + "restart-value", NULL); + + /* send command */ + outb_p(maple_nvram_command, maple_nvram_base + maple_nvram_offset); + for (;;) ; } static void maple_power_off(void) { + unsigned int maple_nvram_base; + unsigned int maple_nvram_offset; + unsigned int maple_nvram_command; + struct device_node *rtcs; + + /* find NVRAM device */ + rtcs = find_compatible_devices("nvram", "AMD8111"); + if (rtcs && rtcs->addrs) { + maple_nvram_base = rtcs->addrs[0].address; + } else { + printk(KERN_EMERG "Maple: Unable to find NVRAM\n"); + printk(KERN_EMERG "Maple: Manual Power-Down Required\n"); + return; + } + + /* find service processor device */ + rtcs = find_devices("service-processor"); + if (!rtcs) { + printk(KERN_EMERG "Maple: Unable to find Service Processor\n"); + printk(KERN_EMERG "Maple: Manual Power-Down Required\n"); + return; + } + maple_nvram_offset = *(unsigned int*) get_property(rtcs, + "power-off-addr", NULL); + maple_nvram_command = *(unsigned int*) get_property(rtcs, + "power-off-value", NULL); + + /* send command */ + outb_p(maple_nvram_command, maple_nvram_base + maple_nvram_offset); + for (;;) ; } static void maple_halt(void) { + maple_power_off(); } #ifdef CONFIG_SMP diff --git a/arch/ppc64/kernel/setup.c b/arch/ppc64/kernel/setup.c index 8a1ca695f8a7..7d060ddb5e93 100644 --- a/arch/ppc64/kernel/setup.c +++ b/arch/ppc64/kernel/setup.c @@ -683,6 +683,12 @@ void machine_restart(char *cmd) if (ppc_md.nvram_sync) ppc_md.nvram_sync(); ppc_md.restart(cmd); +#ifdef CONFIG_SMP + smp_send_stop(); +#endif + printk(KERN_EMERG "System Halted, OK to turn off power\n"); + local_irq_disable(); + while (1) ; } EXPORT_SYMBOL(machine_restart); @@ -692,6 +698,12 @@ void machine_power_off(void) if (ppc_md.nvram_sync) ppc_md.nvram_sync(); ppc_md.power_off(); +#ifdef CONFIG_SMP + smp_send_stop(); +#endif + printk(KERN_EMERG "System Halted, OK to turn off power\n"); + local_irq_disable(); + while (1) ; } EXPORT_SYMBOL(machine_power_off); @@ -701,6 +713,12 @@ void machine_halt(void) if (ppc_md.nvram_sync) ppc_md.nvram_sync(); ppc_md.halt(); +#ifdef CONFIG_SMP + 
smp_send_stop(); +#endif + printk(KERN_EMERG "System Halted, OK to turn off power\n"); + local_irq_disable(); + while (1) ; } EXPORT_SYMBOL(machine_halt); From b2b3d8247951f298897b395599849957ee271c55 Mon Sep 17 00:00:00 2001 From: Mitch Williams Date: Thu, 23 Jun 2005 03:41:00 -0400 Subject: [PATCH 033/199] e1000: fix spinlock bug This patch fixes an obvious and nasty bug where we could exit the transmit routine while holding tx_lock. Signed-off-by: Mitch Williams --- drivers/net/e1000/e1000_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index 325495b8b60c..137226d98d47 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -2307,6 +2307,7 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev) tso = e1000_tso(adapter, skb); if (tso < 0) { dev_kfree_skb_any(skb); + spin_unlock_irqrestore(&adapter->tx_lock, flags); return NETDEV_TX_OK; } From 4ba5e35daa90871fcb9b01f5ad1e5723343cc0a9 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 23 Jun 2005 10:43:04 +0100 Subject: [PATCH 034/199] [PATCH] Serial: Convert 8250 revision-based bug fixes to bug bitmask For some 8250 port types, we used to check the type of the port, and then determine whether the chip revision means the device is buggy. Instead, introduce a bit array, and set the appropriate bit(s) when we discover a buggy device. Signed-off-by: Russell King --- drivers/serial/8250.c | 23 +++++++++++++---------- drivers/serial/8250.h | 2 ++ 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c index 30e8beb71430..27cc288e91d0 100644 --- a/drivers/serial/8250.c +++ b/drivers/serial/8250.c @@ -132,9 +132,9 @@ struct uart_8250_port { struct uart_port port; struct timer_list timer; /* "no irq" timer */ struct list_head list; /* ports on this IRQ */ - unsigned int capabilities; /* port capabilities */ + unsigned short capabilities; /* port capabilities */ + unsigned short bugs; /* port bugs */ unsigned int tx_loadsz; /* transmit fifo load size */ - unsigned short rev; unsigned char acr; unsigned char ier; unsigned char lcr; @@ -560,7 +560,14 @@ static void autoconfig_has_efr(struct uart_8250_port *up) if (id1 == 0x16 && id2 == 0xC9 && (id3 == 0x50 || id3 == 0x52 || id3 == 0x54)) { up->port.type = PORT_16C950; - up->rev = rev | (id3 << 8); + + /* + * Enable work around for the Oxford Semiconductor 952 rev B + * chip which causes it to seriously miscalculate baud rates + * when DLL is 0. + */ + if (id3 == 0x52 && rev == 0x01) + up->bugs |= UART_BUG_QUOT; return; } @@ -577,8 +584,6 @@ static void autoconfig_has_efr(struct uart_8250_port *up) id2 = id1 >> 8; if (id2 == 0x10 || id2 == 0x12 || id2 == 0x14) { - if (id2 == 0x10) - up->rev = id1 & 255; up->port.type = PORT_16850; return; } @@ -809,6 +814,7 @@ static void autoconfig(struct uart_8250_port *up, unsigned int probeflags) // save_flags(flags); cli(); up->capabilities = 0; + up->bugs = 0; if (!(up->port.flags & UPF_BUGGY_UART)) { /* @@ -1677,12 +1683,9 @@ serial8250_set_termios(struct uart_port *port, struct termios *termios, quot = serial8250_get_divisor(port, baud); /* - * Work around a bug in the Oxford Semiconductor 952 rev B - * chip which causes it to seriously miscalculate baud rates - * when DLL is 0. 
+ * Oxford Semi 952 rev B workaround */ - if ((quot & 0xff) == 0 && up->port.type == PORT_16C950 && - up->rev == 0x5201) + if (up->bugs & UART_BUG_QUOT && (quot & 0xff) == 0) quot ++; if (up->capabilities & UART_CAP_FIFO && up->port.fifosize > 1) { diff --git a/drivers/serial/8250.h b/drivers/serial/8250.h index 4f3d62f222f4..cd5c3dd2d910 100644 --- a/drivers/serial/8250.h +++ b/drivers/serial/8250.h @@ -51,6 +51,8 @@ struct serial8250_config { #define UART_CAP_AFE (1 << 11) /* MCR-based hw flow control */ #define UART_CAP_UUE (1 << 12) /* UART needs IER bit 6 set (Xscale) */ +#define UART_BUG_QUOT (1 << 0) /* UART has buggy quot LSB */ + #if defined(__i386__) && (defined(CONFIG_M386) || defined(CONFIG_M486)) #define _INLINE_ inline #else From 55d3b282b90620e02e825304a9433732a84c58a5 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 23 Jun 2005 15:05:41 +0100 Subject: [PATCH 035/199] [PATCH] Serial: Mobility's 16550A ports need a helping hand The Mobility 16550A serial ports don't behave the same as standard 16550A ports, and need a helping hand to get them going once the transmitter has drained and been disabled. Signed-off-by: Russell King --- drivers/serial/8250.c | 31 +++++++++++++++++++++++++++++++ drivers/serial/8250.h | 1 + 2 files changed, 32 insertions(+) diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c index 27cc288e91d0..341c644591ae 100644 --- a/drivers/serial/8250.c +++ b/drivers/serial/8250.c @@ -1027,6 +1027,8 @@ static void serial8250_stop_tx(struct uart_port *port, unsigned int tty_stop) } } +static void transmit_chars(struct uart_8250_port *up); + static void serial8250_start_tx(struct uart_port *port, unsigned int tty_start) { struct uart_8250_port *up = (struct uart_8250_port *)port; @@ -1034,6 +1036,14 @@ static void serial8250_start_tx(struct uart_port *port, unsigned int tty_start) if (!(up->ier & UART_IER_THRI)) { up->ier |= UART_IER_THRI; serial_out(up, UART_IER, up->ier); + + if (up->capabilities & UART_BUG_TXEN) { + unsigned char lsr, iir; + lsr = serial_in(up, UART_LSR); + iir = serial_in(up, UART_IIR); + if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) + transmit_chars(up); + } } /* * We only do this from uart_start @@ -1439,6 +1449,7 @@ static int serial8250_startup(struct uart_port *port) { struct uart_8250_port *up = (struct uart_8250_port *)port; unsigned long flags; + unsigned char lsr, iir; int retval; up->capabilities = uart_config[up->port.type].flags; @@ -1542,6 +1553,26 @@ static int serial8250_startup(struct uart_port *port) up->port.mctrl |= TIOCM_OUT2; serial8250_set_mctrl(&up->port, up->port.mctrl); + + /* + * Do a quick test to see if we receive an + * interrupt when we enable the TX irq. 
+ */ + serial_outp(up, UART_IER, UART_IER_THRI); + lsr = serial_in(up, UART_LSR); + iir = serial_in(up, UART_IIR); + serial_outp(up, UART_IER, 0); + + if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) { + if (!(up->capabilities & UART_BUG_TXEN)) { + up->capabilities |= UART_BUG_TXEN; + pr_debug("ttyS%d - enabling bad tx status workarounds\n", + port->line); + } + } else { + up->capabilities &= ~UART_BUG_TXEN; + } + spin_unlock_irqrestore(&up->port.lock, flags); /* diff --git a/drivers/serial/8250.h b/drivers/serial/8250.h index cd5c3dd2d910..9225c82faeb8 100644 --- a/drivers/serial/8250.h +++ b/drivers/serial/8250.h @@ -52,6 +52,7 @@ struct serial8250_config { #define UART_CAP_UUE (1 << 12) /* UART needs IER bit 6 set (Xscale) */ #define UART_BUG_QUOT (1 << 0) /* UART has buggy quot LSB */ +#define UART_BUG_TXEN (1 << 1) /* UART has buggy TX IIR status */ #if defined(__i386__) && (defined(CONFIG_M386) || defined(CONFIG_M486)) #define _INLINE_ inline From 408fde81c1bff15c875a3618481e93a01dcc79ea Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:37 -0700 Subject: [PATCH 036/199] [PATCH] remove non-DISCONTIG use of pgdat->node_mem_map This patch effectively eliminates direct use of pgdat->node_mem_map outside of the DISCONTIG code. On a flat memory system, these fields aren't currently used, neither are they on a sparsemem system. There was also a node_mem_map(nid) macro on many architectures. Its use along with the use of ->node_mem_map itself was not consistent. It has been removed in favor of two new, more explicit, arch-independent macros: pgdat_page_nr(pgdat, pagenr) nid_page_nr(nid, pagenr) I called them "pgdat" and "nid" because we overload the term "node" to mean "NUMA node", "DISCONTIG node" or "pg_data_t" in very confusing ways. I believe the newer names are much clearer. These macros can be overridden in the sparsemem case with a theoretically slower operation using node_start_pfn and pfn_to_page(), instead. We could make this the only behavior if people want, but I don't want to change too much at once. One thing at a time. This patch removes more code than it adds. Compile tested on alpha, alpha discontig, arm, arm-discontig, i386, i386 generic, NUMAQ, Summit, ppc64, ppc64 discontig, and x86_64. Full list here: http://sr71.net/patches/2.6.12/2.6.12-rc1-mhp2/configs/ Boot tested on NUMAQ, x86 SMP and ppc64 power4/5 LPARs. Signed-off-by: Dave Hansen Signed-off-by: Martin J. 
Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/mm/numa.c | 16 +++++++--------- arch/i386/mm/pgtable.c | 2 +- arch/ia64/mm/discontig.c | 9 +++++---- arch/m32r/mm/init.c | 4 ++-- arch/mips/sgi-ip27/ip27-memory.c | 5 ++--- arch/parisc/mm/init.c | 2 +- arch/ppc64/mm/init.c | 4 ++-- include/asm-alpha/mmzone.h | 3 +-- include/asm-i386/mmzone.h | 3 +-- include/asm-m32r/mmzone.h | 3 +-- include/asm-parisc/mmzone.h | 3 +-- include/asm-ppc64/mmzone.h | 3 +-- include/asm-x86_64/mmzone.h | 5 +---- include/linux/mmzone.h | 2 ++ 14 files changed, 28 insertions(+), 36 deletions(-) diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c index ba81c4422aaf..c7481d59b6df 100644 --- a/arch/alpha/mm/numa.c +++ b/arch/alpha/mm/numa.c @@ -327,8 +327,6 @@ void __init mem_init(void) extern char _text, _etext, _data, _edata; extern char __init_begin, __init_end; unsigned long nid, i; - struct page * lmem_map; - high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT); reservedpages = 0; @@ -338,10 +336,10 @@ void __init mem_init(void) */ totalram_pages += free_all_bootmem_node(NODE_DATA(nid)); - lmem_map = node_mem_map(nid); pfn = NODE_DATA(nid)->node_start_pfn; for (i = 0; i < node_spanned_pages(nid); i++, pfn++) - if (page_is_ram(pfn) && PageReserved(lmem_map+i)) + if (page_is_ram(pfn) && + PageReserved(nid_page_nr(nid, i))) reservedpages++; } @@ -373,18 +371,18 @@ show_mem(void) show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_online_node(nid) { - struct page * lmem_map = node_mem_map(nid); i = node_spanned_pages(nid); while (i-- > 0) { + struct page *page = nid_page_nr(nid, i); total++; - if (PageReserved(lmem_map+i)) + if (PageReserved(page)) reserved++; - else if (PageSwapCache(lmem_map+i)) + else if (PageSwapCache(page)) cached++; - else if (!page_count(lmem_map+i)) + else if (!page_count(page)) free++; else - shared += page_count(lmem_map + i) - 1; + shared += page_count(page) - 1; } } printk("%ld pages of RAM\n",total); diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index dd81479ff88a..80c84cdf22ef 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -36,7 +36,7 @@ void show_mem(void) printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; ++i) { - page = pgdat->node_mem_map + i; + page = pgdat_page_nr(pgdat, i); total++; if (PageHighMem(page)) highmem++; diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index c00710929390..f3fd528ead3b 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -560,14 +560,15 @@ void show_mem(void) int shared = 0, cached = 0, reserved = 0; printk("Node ID: %d\n", pgdat->node_id); for(i = 0; i < pgdat->node_spanned_pages; i++) { + struct page *page = pgdat_page_nr(pgdat, i); if (!ia64_pfn_valid(pgdat->node_start_pfn+i)) continue; - if (PageReserved(pgdat->node_mem_map+i)) + if (PageReserved(page)) reserved++; - else if (PageSwapCache(pgdat->node_mem_map+i)) + else if (PageSwapCache(page)) cached++; - else if (page_count(pgdat->node_mem_map+i)) - shared += page_count(pgdat->node_mem_map+i)-1; + else if (page_count(page)) + shared += page_count(page)-1; } total_present += present; total_reserved += reserved; diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c index bc423d838fb8..d9a40b1fe8ba 100644 --- a/arch/m32r/mm/init.c +++ b/arch/m32r/mm/init.c @@ -49,7 +49,7 @@ void show_mem(void) printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) 
{ for (i = 0; i < pgdat->node_spanned_pages; ++i) { - page = pgdat->node_mem_map + i; + page = pgdat_page_nr(pgdat, i); total++; if (PageHighMem(page)) highmem++; @@ -152,7 +152,7 @@ int __init reservedpages_count(void) reservedpages = 0; for_each_online_node(nid) for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++) - if (PageReserved(NODE_DATA(nid)->node_mem_map + i)) + if (PageReserved(nid_page_nr(nid, i))) reservedpages++; return reservedpages; diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index 0a44a98d7adc..a160d04f7dbe 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -549,9 +549,8 @@ void __init mem_init(void) */ numslots = node_getlastslot(node); for (slot = 1; slot <= numslots; slot++) { - p = NODE_DATA(node)->node_mem_map + - (slot_getbasepfn(node, slot) - - slot_getbasepfn(node, 0)); + p = nid_page_nr(node, slot_getbasepfn(node, slot) - + slot_getbasepfn(node, 0)); /* * Free valid memory in current slot. diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index cac37589e35c..2886ad70db48 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -506,7 +506,7 @@ void show_mem(void) for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { struct page *p; - p = node_mem_map(i) + j - node_start_pfn(i); + p = nid_page_nr(i, j) - node_start_pfn(i); total++; if (PageReserved(p)) diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c index 6fa1e6490b57..29dbe084c21f 100644 --- a/arch/ppc64/mm/init.c +++ b/arch/ppc64/mm/init.c @@ -98,7 +98,7 @@ void show_mem(void) printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; i++) { - page = pgdat->node_mem_map + i; + page = pgdat_page_nr(pgdat, i); total++; if (PageReserved(page)) reserved++; @@ -654,7 +654,7 @@ void __init mem_init(void) for_each_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; i++) { - page = pgdat->node_mem_map + i; + page = pgdat_page_nr(pgdat, i); if (PageReserved(page)) reservedpages++; } diff --git a/include/asm-alpha/mmzone.h b/include/asm-alpha/mmzone.h index 726c150dcbe4..a011ef4cf3d3 100644 --- a/include/asm-alpha/mmzone.h +++ b/include/asm-alpha/mmzone.h @@ -57,7 +57,6 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n) * Given a kernel address, find the home node of the underlying memory. 
*/ #define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define local_mapnr(kvaddr) \ @@ -108,7 +107,7 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n) #define pfn_to_page(pfn) \ ({ \ unsigned long kaddr = (unsigned long)__va((pfn) << PAGE_SHIFT); \ - (node_mem_map(kvaddr_to_nid(kaddr)) + local_mapnr(kaddr)); \ + (NODE_DATA(kvaddr_to_nid(kaddr))->node_mem_map + local_mapnr(kaddr)); \ }) #define page_to_pfn(page) \ diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h index 13830ae67cac..9cec191f462c 100644 --- a/include/asm-i386/mmzone.h +++ b/include/asm-i386/mmzone.h @@ -79,7 +79,6 @@ static inline int pfn_to_nid(unsigned long pfn) */ #define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) \ ({ \ @@ -100,7 +99,7 @@ static inline int pfn_to_nid(unsigned long pfn) ({ \ unsigned long __pfn = pfn; \ int __node = pfn_to_nid(__pfn); \ - &node_mem_map(__node)[node_localnr(__pfn,__node)]; \ + &NODE_DATA(__node)->node_mem_map[node_localnr(__pfn,__node)]; \ }) #define page_to_pfn(pg) \ diff --git a/include/asm-m32r/mmzone.h b/include/asm-m32r/mmzone.h index ebf0228fec42..d58878ec899e 100644 --- a/include/asm-m32r/mmzone.h +++ b/include/asm-m32r/mmzone.h @@ -14,7 +14,6 @@ extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) #define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) \ ({ \ @@ -32,7 +31,7 @@ extern struct pglist_data *node_data[]; ({ \ unsigned long __pfn = pfn; \ int __node = pfn_to_nid(__pfn); \ - &node_mem_map(__node)[node_localnr(__pfn,__node)]; \ + &NODE_DATA(__node)->node_mem_map[node_localnr(__pfn,__node)]; \ }) #define page_to_pfn(pg) \ diff --git a/include/asm-parisc/mmzone.h b/include/asm-parisc/mmzone.h index 928bf50c4693..595d3dce120a 100644 --- a/include/asm-parisc/mmzone.h +++ b/include/asm-parisc/mmzone.h @@ -19,7 +19,6 @@ extern struct node_map_data node_data[]; */ #define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) \ ({ \ @@ -38,7 +37,7 @@ extern struct node_map_data node_data[]; ({ \ unsigned long __pfn = (pfn); \ int __node = pfn_to_nid(__pfn); \ - &node_mem_map(__node)[node_localnr(__pfn,__node)]; \ + &NODE_DATA(__node)->node_mem_map[node_localnr(__pfn,__node)]; \ }) #define page_to_pfn(pg) \ diff --git a/include/asm-ppc64/mmzone.h b/include/asm-ppc64/mmzone.h index 0619a41a3c9d..cbfc5ecfe875 100644 --- a/include/asm-ppc64/mmzone.h +++ b/include/asm-ppc64/mmzone.h @@ -65,7 +65,6 @@ static inline int pa_to_nid(unsigned long pa) */ #define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_end_pfn) @@ -76,7 +75,7 @@ static inline int pa_to_nid(unsigned long pa) #define discontigmem_pfn_to_page(pfn) \ ({ \ unsigned long __tmp = pfn; \ - (node_mem_map(pfn_to_nid(__tmp)) + \ + (NODE_DATA(pfn_to_nid(__tmp))->node_mem_map + \ node_localnr(__tmp, pfn_to_nid(__tmp))); \ }) diff --git 
a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h index d95b7c240831..ca4fc3fe0dee 100644 --- a/include/asm-x86_64/mmzone.h +++ b/include/asm-x86_64/mmzone.h @@ -35,9 +35,6 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) #define kvaddr_to_nid(kaddr) phys_to_nid(__pa(kaddr)) #define NODE_DATA(nid) (node_data[nid]) -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) - -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ NODE_DATA(nid)->node_spanned_pages) @@ -50,7 +47,7 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) (2.4 used to). */ #define pfn_to_page(pfn) ({ \ int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT); \ - ((pfn) - node_start_pfn(nid)) + node_mem_map(nid); \ + ((pfn) - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map; \ }) #define page_to_pfn(page) \ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4733d35d8223..b79633d3a97b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -284,6 +284,8 @@ typedef struct pglist_data { #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) +#define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) +#define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) extern struct pglist_data *pgdat_list; From c2ebaa425e6630adcbf757b004d257dd4204925b Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:38 -0700 Subject: [PATCH 037/199] [PATCH] sparsemem base: early_pfn_to_nid() (works before sparse is initialized) The following four patches provide the last needed changes before the introduction of sparsemem. For a more complete description of what this will do, please see this patch: http://www.sr71.net/patches/2.6.11/2.6.11-bk7-mhp1/broken-out/B-sparse-150-sparsemem.patch or previous posts on the subject: http://marc.theaimsgroup.com/?t=110868540700001&r=1&w=2 http://marc.theaimsgroup.com/?l=linux-mm&m=109897373315016&w=2 Three of these are i386-only, but one of them reorganizes the macros used to manage the space in page->flags, and will affect all platforms. There are analogous patches to the i386 ones for ppc64, ia64, and x86_64, but those will be submitted by the normal arch maintainers. The combination of the four patches has been test-booted on a variety of i386 hardware, and compiled for ppc64, i386, and x86-64 with about 17 different .configs. It's also been runtime-tested on ia64 configs (with more patches on top). This patch: We _know_ which node pages in general belong to, at least at a very gross level in node_{start,end}_pfn[]. Use those to target the allocations of pages. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/discontig.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c index 1726b4096b10..85d2fcbe1079 100644 --- a/arch/i386/mm/discontig.c +++ b/arch/i386/mm/discontig.c @@ -146,6 +146,21 @@ static void __init find_max_pfn_node(int nid) BUG(); } +/* Find the owning node for a pfn. 
*/ +int early_pfn_to_nid(unsigned long pfn) +{ + int nid; + + for_each_node(nid) { + if (node_end_pfn[nid] == 0) + break; + if (node_start_pfn[nid] <= pfn && node_end_pfn[nid] >= pfn) + return nid; + } + + return 0; +} + /* * Allocate memory for the pg_data_t for this node via a crude pre-bootmem * method. For node zero take this from the bottom of memory, for From 6f167ec721108c9282d54424516a12c805e3c306 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:39 -0700 Subject: [PATCH 038/199] [PATCH] sparsemem base: simple NUMA remap space allocator Introduce a simple allocator for the NUMA remap space. This space is very scarce, used for structures which are best allocated node local. This mechanism is also used on non-NUMA ia64 systems with a vmem_map to keep the pgdat->node_mem_map initialized in a consistent place for all architectures. Issues: o alloc_remap takes a node_id where we might expect a pgdat which was intended to allow us to allocate the pgdat's using this mechanism; which we do not yet do. Could have alloc_remap_node() and alloc_remap_nid() for this purpose. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig | 5 ++++ arch/i386/mm/discontig.c | 59 +++++++++++++++++++++------------------- include/linux/bootmem.h | 9 ++++++ mm/page_alloc.c | 6 +++- 4 files changed, 50 insertions(+), 29 deletions(-) diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index dfd904f6883b..35ca3a17ed20 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -803,6 +803,11 @@ config NEED_NODE_MEMMAP_SIZE depends on DISCONTIGMEM default y +config HAVE_ARCH_ALLOC_REMAP + bool + depends on NUMA + default y + config HIGHPTE bool "Allocate 3rd-level pagetables from highmem" depends on HIGHMEM4G || HIGHMEM64G diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c index 85d2fcbe1079..dcc71f969b01 100644 --- a/arch/i386/mm/discontig.c +++ b/arch/i386/mm/discontig.c @@ -108,6 +108,9 @@ unsigned long node_remap_offset[MAX_NUMNODES]; void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); +void *node_remap_end_vaddr[MAX_NUMNODES]; +void *node_remap_alloc_vaddr[MAX_NUMNODES]; + /* * FLAT - support for basic PC memory model with discontig enabled, essentially * a single node with all available processors in it with a flat @@ -178,6 +181,21 @@ static void __init allocate_pgdat(int nid) } } +void *alloc_remap(int nid, unsigned long size) +{ + void *allocation = node_remap_alloc_vaddr[nid]; + + size = ALIGN(size, L1_CACHE_BYTES); + + if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) + return 0; + + node_remap_alloc_vaddr[nid] += size; + memset(allocation, 0, size); + + return allocation; +} + void __init remap_numa_kva(void) { void *vaddr; @@ -185,8 +203,6 @@ void __init remap_numa_kva(void) int node; for_each_online_node(node) { - if (node == 0) - continue; for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { vaddr = node_remap_start_vaddr[node]+(pfn<node_mem_map = (struct page *)lmem_map; - free_area_init_node(nid, NODE_DATA(nid), zones_size, - start, zholes_size); - } + + free_area_init_node(nid, NODE_DATA(nid), zones_size, start, + zholes_size); } return; } diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 0dd8ca1a3d5a..500f451ce0c0 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -67,6 +67,15 @@ extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned 
long size, __alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0) #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ +#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP +extern void *alloc_remap(int nid, unsigned long size); +#else +static inline void *alloc_remap(int nid, unsigned long size) +{ + return NULL; +} +#endif + extern unsigned long __initdata nr_kernel_pages; extern unsigned long __initdata nr_all_pages; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 559336de9687..bf1dd8819097 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1936,6 +1936,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat, static void __init alloc_node_mem_map(struct pglist_data *pgdat) { unsigned long size; + struct page *map; /* Skip empty nodes */ if (!pgdat->node_spanned_pages) @@ -1944,7 +1945,10 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) /* ia64 gets its own node_mem_map, before this, without bootmem */ if (!pgdat->node_mem_map) { size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); - pgdat->node_mem_map = alloc_bootmem_node(pgdat, size); + map = alloc_remap(pgdat->node_id, size); + if (!map) + map = alloc_bootmem_node(pgdat, size); + pgdat->node_mem_map = map; } #ifndef CONFIG_DISCONTIGMEM /* From 348f8b6c4837a07304d2f72b11ce8d96588065e0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:40 -0700 Subject: [PATCH 039/199] [PATCH] sparsemem base: reorganize page->flags bit operations Generify the value fields in the page_flags. The aim is to allow the location and size of these fields to be varied. Additionally we want to move away from fixed allocations per field whilst still enforcing the overall bit utilisation limits. We rely on the compiler to spot and optimise the accessor functions. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 53 +++++++++++++++++++++++++++++++++++------- include/linux/mmzone.h | 19 ++++++--------- mm/page_alloc.c | 2 +- 3 files changed, 52 insertions(+), 22 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 1813b162b0a8..57b2ead51dba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -395,19 +395,41 @@ static inline void put_page(struct page *page) /* * The zone field is never updated after free_area_init_core() * sets it, so none of the operations on it need to be atomic. - * We'll have up to (MAX_NUMNODES * MAX_NR_ZONES) zones total, - * so we use (MAX_NODES_SHIFT + MAX_ZONES_SHIFT) here to get enough bits. */ -#define NODEZONE_SHIFT (sizeof(page_flags_t)*8 - MAX_NODES_SHIFT - MAX_ZONES_SHIFT) + +/* Page flags: | NODE | ZONE | ... | FLAGS | */ +#define NODES_PGOFF ((sizeof(page_flags_t)*8) - NODES_SHIFT) +#define ZONES_PGOFF (NODES_PGOFF - ZONES_SHIFT) + +/* + * Define the bit shifts to access each section. For non-existant + * sections we define the shift as 0; that plus a 0 mask ensures + * the compiler will optimise away reference to them. + */ +#define NODES_PGSHIFT (NODES_PGOFF * (NODES_SHIFT != 0)) +#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_SHIFT != 0)) + +/* NODE:ZONE is used to lookup the zone from a page. 
*/ +#define ZONETABLE_SHIFT (NODES_SHIFT + ZONES_SHIFT) +#define ZONETABLE_PGSHIFT ZONES_PGSHIFT + +#if NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED +#error NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED +#endif + #define NODEZONE(node, zone) ((node << ZONES_SHIFT) | zone) +#define ZONES_MASK ((1UL << ZONES_SHIFT) - 1) +#define NODES_MASK ((1UL << NODES_SHIFT) - 1) +#define ZONETABLE_MASK ((1UL << ZONETABLE_SHIFT) - 1) + static inline unsigned long page_zonenum(struct page *page) { - return (page->flags >> NODEZONE_SHIFT) & (~(~0UL << ZONES_SHIFT)); + return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } static inline unsigned long page_to_nid(struct page *page) { - return (page->flags >> (NODEZONE_SHIFT + ZONES_SHIFT)); + return (page->flags >> NODES_PGSHIFT) & NODES_MASK; } struct zone; @@ -415,13 +437,26 @@ extern struct zone *zone_table[]; static inline struct zone *page_zone(struct page *page) { - return zone_table[page->flags >> NODEZONE_SHIFT]; + return zone_table[(page->flags >> ZONETABLE_PGSHIFT) & + ZONETABLE_MASK]; +} + +static inline void set_page_zone(struct page *page, unsigned long zone) +{ + page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); + page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; +} +static inline void set_page_node(struct page *page, unsigned long node) +{ + page->flags &= ~(NODES_MASK << NODES_PGSHIFT); + page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; } -static inline void set_page_zone(struct page *page, unsigned long nodezone_num) +static inline void set_page_links(struct page *page, unsigned long zone, + unsigned long node) { - page->flags &= ~(~0UL << NODEZONE_SHIFT); - page->flags |= nodezone_num << NODEZONE_SHIFT; + set_page_zone(page, zone); + set_page_node(page, node); } #ifndef CONFIG_DISCONTIGMEM diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b79633d3a97b..39e912708e2a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -414,30 +414,25 @@ extern struct pglist_data contig_page_data; #include +#endif /* !CONFIG_DISCONTIGMEM */ + #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) /* * with 32 bit page->flags field, we reserve 8 bits for node/zone info. * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes. */ -#define MAX_NODES_SHIFT 6 +#define FLAGS_RESERVED 8 + #elif BITS_PER_LONG == 64 /* * with 64 bit flags field, there's plenty of room. 
*/ -#define MAX_NODES_SHIFT 10 -#endif +#define FLAGS_RESERVED 32 -#endif /* !CONFIG_DISCONTIGMEM */ - -#if NODES_SHIFT > MAX_NODES_SHIFT -#error NODES_SHIFT > MAX_NODES_SHIFT -#endif +#else -/* There are currently 3 zones: DMA, Normal & Highmem, thus we need 2 bits */ -#define MAX_ZONES_SHIFT 2 +#error BITS_PER_LONG not defined -#if ZONES_SHIFT > MAX_ZONES_SHIFT -#error ZONES_SHIFT > MAX_ZONES_SHIFT #endif #endif /* !__ASSEMBLY__ */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bf1dd8819097..1958358e29b0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1653,7 +1653,7 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, struct page *page; for (page = start; page < (start + size); page++) { - set_page_zone(page, NODEZONE(nid, zone)); + set_page_links(page, zone, nid); set_page_count(page, 0); reset_page_mapcount(page); SetPageReserved(page); From 5b505b90b2d54e526cc8d123bdef3b98c9f0bbc6 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:41 -0700 Subject: [PATCH 040/199] [PATCH] sparsemem base: teach discontig about sparse ranges discontig.c has some assumptions that mem_map[]s inside of a node are contiguous. Teach it to make sure that each region that it's bringing online is actually made up of valid ranges of ram. Written-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/discontig.c | 14 ++++++++++++++ arch/i386/mm/init.c | 2 +- include/asm-i386/page.h | 2 ++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c index dcc71f969b01..088ca4722183 100644 --- a/arch/i386/mm/discontig.c +++ b/arch/i386/mm/discontig.c @@ -216,6 +216,7 @@ static unsigned long calculate_numa_remap_pages(void) { int nid; unsigned long size, reserve_pages = 0; + unsigned long pfn; for_each_online_node(nid) { /* @@ -234,6 +235,19 @@ static unsigned long calculate_numa_remap_pages(void) size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; /* now the roundup is correct, convert to PAGE_SIZE pages */ size = size * PTRS_PER_PTE; + + /* + * Validate the region we are allocating only contains valid + * pages. 
+ */ + for (pfn = node_end_pfn[nid] - size; + pfn < node_end_pfn[nid]; pfn++) + if (!page_is_ram(pfn)) + break; + + if (pfn != node_end_pfn[nid]) + size = 0; + printk("Reserving %ld pages of KVA for lmem_map of node %d\n", size, nid); node_remap_size[nid] = size; diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 8766c771bb45..666ca79fb50a 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -191,7 +191,7 @@ static inline int page_kills_ppro(unsigned long pagenr) extern int is_available_memory(efi_memory_desc_t *); -static inline int page_is_ram(unsigned long pagenr) +int page_is_ram(unsigned long pagenr) { int i; unsigned long addr, end; diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index 41400d342d44..8f3dded01bff 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -120,6 +120,8 @@ static __inline__ int get_order(unsigned long size) extern int sysctl_legacy_va_layout; +extern int page_is_ram(unsigned long pagenr); + #endif /* __ASSEMBLY__ */ #ifdef __ASSEMBLY__ From 3a9da7655d2d5b7f790a370328cf093440c80496 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:42 -0700 Subject: [PATCH 041/199] [PATCH] create mm/Kconfig for arch-independent memory options With sparsemem being introduced, we need a central place for new memory-related .config options: mm/Kconfig. This allows us to remove many of the duplicated arch-specific options. The new option, CONFIG_FLATMEM, is there to enable us to detangle NUMA and DISCONTIGMEM. This is a requirement for sparsemem because sparsemem uses the NUMA code without the presence of DISCONTIGMEM. The sparsemem patches use CONFIG_FLATMEM in generic code, so this patch is a requirement before applying them. Almost all places that used to do '#ifndef CONFIG_DISCONTIGMEM' should use '#ifdef CONFIG_FLATMEM' instead. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 mm/Kconfig diff --git a/mm/Kconfig b/mm/Kconfig new file mode 100644 index 000000000000..69caa9d8674e --- /dev/null +++ b/mm/Kconfig @@ -0,0 +1,25 @@ +choice + prompt "Memory model" + default FLATMEM + default SPARSEMEM if ARCH_SPARSEMEM_DEFAULT + default DISCONTIGMEM if ARCH_DISCONTIGMEM_DEFAULT + +config FLATMEM + bool "Flat Memory" + depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE + help + This option allows you to change some of the ways that + Linux manages its memory internally. Most users will + only have one option here: FLATMEM. This is normal + and a correct option. + + If unsure, choose this option over any other. + +config DISCONTIGMEM + bool "Discontigious Memory" + depends on ARCH_DISCONTIGMEM_ENABLE + help + If unsure, choose "Flat Memory" over this option. + +endchoice + From 3f22ab276b931b72ea04b184c155b34d0362bfc3 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:43 -0700 Subject: [PATCH 042/199] [PATCH] make each arch use mm/Kconfig For all architectures, this just means that you'll see a "Memory Model" choice in your architecture menu. For those that implement DISCONTIGMEM, you may eventually want to make your ARCH_DISCONTIGMEM_ENABLE a "def_bool y" and make your users select DISCONTIGMEM right out of the new choice menu. The only disadvantage might be if you have some specific things that you need in your help option to explain something about DISCONTIGMEM. 
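To make the new wiring concrete, here is a minimal sketch (editorial illustration, not part of this commit; the helper name is invented) of the compile-time dispatch that patch 041's commit message describes: generic code keying off CONFIG_FLATMEM instead of open-coding !CONFIG_DISCONTIGMEM. The flat branch assumes pfn 0 maps to mem_map[0], which is the common case on these architectures:

	/* Hypothetical helper, for illustration only. */
	#ifdef CONFIG_FLATMEM
	static inline struct page *example_pfn_to_page(unsigned long pfn)
	{
		return mem_map + pfn;		/* one global mem_map[] */
	}
	#else
	static inline struct page *example_pfn_to_page(unsigned long pfn)
	{
		/* classic discontig: per-node mem_map plus local offset */
		pg_data_t *pgdat = NODE_DATA(pfn_to_nid(pfn));

		return pgdat->node_mem_map + (pfn - pgdat->node_start_pfn);
	}
	#endif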
Signed-off-by: Dave Hansen Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/Kconfig | 4 +++- arch/arm/Kconfig | 4 +++- arch/arm26/Kconfig | 2 ++ arch/cris/Kconfig | 2 ++ arch/frv/Kconfig | 2 ++ arch/h8300/Kconfig.cpu | 3 +++ arch/i386/Kconfig | 4 +++- arch/ia64/Kconfig | 4 +++- arch/m32r/Kconfig | 4 +++- arch/m68k/Kconfig | 2 ++ arch/m68knommu/Kconfig | 2 ++ arch/mips/Kconfig | 6 +++++- arch/parisc/Kconfig | 8 +++++++- arch/ppc/Kconfig | 2 ++ arch/ppc64/Kconfig | 4 +++- arch/s390/Kconfig | 2 ++ arch/sh/Kconfig | 8 +++++++- arch/sh64/Kconfig | 2 ++ arch/sparc/Kconfig | 2 ++ arch/sparc64/Kconfig | 2 ++ arch/um/Kconfig | 1 + arch/v850/Kconfig | 2 ++ arch/x86_64/Kconfig | 4 +++- 23 files changed, 66 insertions(+), 10 deletions(-) diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index f7c96635d3b4..c5739d6309df 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -509,7 +509,7 @@ config NR_CPUS depends on SMP default "64" -config DISCONTIGMEM +config ARCH_DISCONTIGMEM_ENABLE bool "Discontiguous Memory Support (EXPERIMENTAL)" depends on EXPERIMENTAL help @@ -518,6 +518,8 @@ config DISCONTIGMEM or have huge holes in the physical address space for other reasons. See for more. +source "mm/Kconfig" + config NUMA bool "NUMA Support (EXPERIMENTAL)" depends on DISCONTIGMEM diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index ee8a9ad7bbd9..07ba77c19f6c 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -346,7 +346,7 @@ config PREEMPT Say Y here if you are building a kernel for a desktop, embedded or real-time system. Say N if you are unsure. -config DISCONTIGMEM +config ARCH_DISCONTIGMEM_ENABLE bool default (ARCH_LH7A40X && !LH7A40X_CONTIGMEM) help @@ -355,6 +355,8 @@ config DISCONTIGMEM or have huge holes in the physical address space for other reasons. See for more. +source "mm/Kconfig" + config LEDS bool "Timer and CPU usage LEDs" depends on ARCH_CDB89712 || ARCH_CO285 || ARCH_EBSA110 || \ diff --git a/arch/arm26/Kconfig b/arch/arm26/Kconfig index 6caed90661fc..dc0c1936969b 100644 --- a/arch/arm26/Kconfig +++ b/arch/arm26/Kconfig @@ -179,6 +179,8 @@ config CMDLINE time by entering them here. As a minimum, you should specify the memory size and the root device (e.g., mem=64M root=/dev/nfs). +source "mm/Kconfig" + endmenu source "drivers/base/Kconfig" diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 4332ca348d51..f848e3761491 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -74,6 +74,8 @@ config PREEMPT Say Y here if you are building a kernel for a desktop, embedded or real-time system. Say N if you are unsure. +source mm/Kconfig + endmenu menu "Hardware setup" diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index 2b19372767eb..c93f95146cc2 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -74,6 +74,8 @@ config HIGHPTE with a lot of RAM, this can be wasteful of precious low memory. Setting this option will put user-space page tables in high memory. 
+source "mm/Kconfig" + choice prompt "uClinux kernel load address" depends on !MMU diff --git a/arch/h8300/Kconfig.cpu b/arch/h8300/Kconfig.cpu index d9dd62a565a9..a380167a13cf 100644 --- a/arch/h8300/Kconfig.cpu +++ b/arch/h8300/Kconfig.cpu @@ -180,4 +180,7 @@ config CPU_H8S config PREEMPT bool "Preemptible Kernel" default n + +source "mm/Kconfig" + endmenu diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 35ca3a17ed20..8e5242c8e09d 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -783,7 +783,7 @@ comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support" comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI) -config DISCONTIGMEM +config ARCH_DISCONTIGMEM_ENABLE bool depends on NUMA default y @@ -808,6 +808,8 @@ config HAVE_ARCH_ALLOC_REMAP depends on NUMA default y +source "mm/Kconfig" + config HIGHPTE bool "Allocate 3rd-level pagetables from highmem" depends on HIGHMEM4G || HIGHMEM64G diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index ce4dfa8b834d..9f6a46caba9b 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -197,7 +197,7 @@ config HOLES_IN_ZONE bool default y if VIRTUAL_MEM_MAP -config DISCONTIGMEM +config ARCH_DISCONTIGMEM_ENABLE bool "Discontiguous memory support" depends on (IA64_DIG || IA64_SGI_SN2 || IA64_GENERIC || IA64_HP_ZX1 || IA64_HP_ZX1_SWIOTLB) && NUMA && VIRTUAL_MEM_MAP default y if (IA64_SGI_SN2 || IA64_GENERIC) && NUMA @@ -300,6 +300,8 @@ config PREEMPT Say Y here if you are building a kernel for a desktop, embedded or real-time system. Say N if you are unsure. +source "mm/Kconfig" + config HAVE_DEC_LOCK bool depends on (SMP || PREEMPT) diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 64c133344afe..42ca8a39798d 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -172,11 +172,13 @@ config NOHIGHMEM bool default y -config DISCONTIGMEM +config ARCH_DISCONTIGMEM_ENABLE bool "Internal RAM Support" depends on CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP default y +source "mm/Kconfig" + config IRAM_START hex "Internal memory start address (hex)" default "00f00000" diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index d0713c7d9f0a..691a2469ff36 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -357,6 +357,8 @@ config 060_WRITETHROUGH is hardwired on. The 53c710 SCSI driver is known to suffer from this problem. +source "mm/Kconfig" + endmenu menu "General setup" diff --git a/arch/m68knommu/Kconfig b/arch/m68knommu/Kconfig index e729bd280623..dbfcdc8e6087 100644 --- a/arch/m68knommu/Kconfig +++ b/arch/m68knommu/Kconfig @@ -532,6 +532,8 @@ config ROMKERNEL endchoice +source "mm/Kconfig" + endmenu config ISA_DMA_API diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index ab9944693f1f..2710018ac809 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -492,7 +492,7 @@ config SGI_SN0_N_MODE which allows for more memory. Your system is most probably running in M-Mode, so you should say N here. -config DISCONTIGMEM +config ARCH_DISCONTIGMEM_ENABLE bool default y if SGI_IP27 help @@ -501,6 +501,10 @@ config DISCONTIGMEM or have huge holes in the physical address space for other reasons. See for more. 
+config ARCH_FLATMEM_DISABLE + def_bool y + depends on ARCH_DISCONTIGMEM_ENABLE + config NUMA bool "NUMA Support" depends on SGI_IP27 diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index e7e7c56fc212..ab7cc4eb7fc1 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -148,7 +148,7 @@ config HOTPLUG_CPU default y if SMP select HOTPLUG -config DISCONTIGMEM +config ARCH_DISCONTIGMEM_ENABLE bool "Discontiguous memory support (EXPERIMENTAL)" depends on EXPERIMENTAL help @@ -157,6 +157,12 @@ config DISCONTIGMEM or have huge holes in the physical address space for other reasons. See for more. +config ARCH_FLATMEM_DISABLE + def_bool y + depends on ARCH_DISCONTIGMEM_ENABLE + +source "mm/Kconfig" + config PREEMPT bool # bool "Preemptible Kernel" diff --git a/arch/ppc/Kconfig b/arch/ppc/Kconfig index 10162b187bcf..848f43970a4b 100644 --- a/arch/ppc/Kconfig +++ b/arch/ppc/Kconfig @@ -905,6 +905,8 @@ config PREEMPT config HIGHMEM bool "High memory support" +source "mm/Kconfig" + source "fs/Kconfig.binfmt" config PROC_DEVICETREE diff --git a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig index 0f1fa289744e..87344f324233 100644 --- a/arch/ppc64/Kconfig +++ b/arch/ppc64/Kconfig @@ -198,10 +198,12 @@ config HMT This option enables hardware multithreading on RS64 cpus. pSeries systems p620 and p660 have such a cpu type. -config DISCONTIGMEM +config ARCH_DISCONTIGMEM_ENABLE bool "Discontiguous Memory Support" depends on SMP && PPC_PSERIES +source "mm/Kconfig" + config NUMA bool "NUMA support" depends on DISCONTIGMEM diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index ab79af84699a..32696c1d9280 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -226,6 +226,8 @@ config WARN_STACK_SIZE This allows you to specify the maximum frame size a function may have without the compiler complaining about it. +source "mm/Kconfig" + comment "I/O subsystem configuration" config MACHCHK_WARNING diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 3468d5127223..ac2b865359b2 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -486,7 +486,7 @@ config CPU_SUBTYPE_ST40 depends on CPU_SUBTYPE_ST40STB1 || CPU_SUBTYPE_ST40GX1 default y -config DISCONTIGMEM +config ARCH_DISCONTIGMEM_ENABLE bool depends on SH_HP690 default y @@ -496,6 +496,12 @@ config DISCONTIGMEM or have huge holes in the physical address space for other reasons. See for more. +config ARCH_FLATMEM_DISABLE + def_bool y + depends on ARCH_DISCONTIGMEM_ENABLE + +source "mm/Kconfig" + config ZERO_PAGE_OFFSET hex "Zero page offset" default "0x00001000" if !(SH_MPC1211 || SH_SH03) diff --git a/arch/sh64/Kconfig b/arch/sh64/Kconfig index 76eb81fba45e..708e59736a4d 100644 --- a/arch/sh64/Kconfig +++ b/arch/sh64/Kconfig @@ -217,6 +217,8 @@ config PREEMPT bool "Preemptible Kernel (EXPERIMENTAL)" depends on EXPERIMENTAL +source "mm/Kconfig" + endmenu menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)" diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 237f922520fd..310d3f0c445e 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -291,6 +291,8 @@ config PRINTER If you have more than 8 printers, you need to increase the LP_NO macro in lp.c and the PARPORT_MAX macro in parport.h. +source "mm/Kconfig" + endmenu source "drivers/base/Kconfig" diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index a72fd15d5ea8..e2b050eb3b96 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -484,6 +484,8 @@ config CMDLINE NOTE: This option WILL override the PROM bootargs setting! 
+source "mm/Kconfig" + endmenu source "drivers/base/Kconfig" diff --git a/arch/um/Kconfig b/arch/um/Kconfig index b8e952c88fd1..9469e77303e6 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -74,6 +74,7 @@ config MODE_SKAS option will shrink the UML binary slightly. source "arch/um/Kconfig_arch" +source "mm/Kconfig" config LD_SCRIPT_STATIC bool diff --git a/arch/v850/Kconfig b/arch/v850/Kconfig index 90cd4baa75ee..27febd6ffa80 100644 --- a/arch/v850/Kconfig +++ b/arch/v850/Kconfig @@ -218,6 +218,8 @@ menu "Processor type and features" a lot of RAM, and you need to able to allocate very large contiguous chunks. If unsure, say N. +source "mm/Kconfig" + endmenu diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 289f448ac89c..9f5b99e7b1eb 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -265,7 +265,7 @@ config NUMA_EMU into virtual nodes when booted with "numa=fake=N", where N is the number of nodes. This is only useful for debugging. -config DISCONTIGMEM +config ARCH_DISCONTIGMEM_ENABLE bool depends on NUMA default y @@ -274,6 +274,8 @@ config NUMA bool default n +source "mm/Kconfig" + config HAVE_DEC_LOCK bool depends on SMP From 0e19243e9a19ef8e5994852671bd06bb51630811 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:45 -0700 Subject: [PATCH 043/199] [PATCH] update all defconfigs for ARCH_DISCONTIGMEM_ENABLE This will at least suppress one prompt that users would have received the first time they compile with the new DISCONTIG arch option. They'll still get the "Memory Model" prompt, but 99% of them will have the default work there. Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/defconfig | 2 +- arch/ia64/configs/sn2_defconfig | 2 +- arch/ia64/defconfig | 2 +- arch/mips/configs/ip27_defconfig | 2 +- arch/ppc64/configs/pSeries_defconfig | 2 +- arch/ppc64/defconfig | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/alpha/defconfig b/arch/alpha/defconfig index 5e39b7a7c8f4..6da9c3dbde44 100644 --- a/arch/alpha/defconfig +++ b/arch/alpha/defconfig @@ -96,7 +96,7 @@ CONFIG_ALPHA_CORE_AGP=y CONFIG_ALPHA_BROKEN_IRQ_MASK=y CONFIG_EISA=y # CONFIG_SMP is not set -# CONFIG_DISCONTIGMEM is not set +# CONFIG_ARCH_DISCONTIGMEM_ENABLE is not set CONFIG_VERBOSE_MCHECK=y CONFIG_VERBOSE_MCHECK_ON=1 CONFIG_PCI_LEGACY_PROC=y diff --git a/arch/ia64/configs/sn2_defconfig b/arch/ia64/configs/sn2_defconfig index a01bb02d074d..487d2e36b0a6 100644 --- a/arch/ia64/configs/sn2_defconfig +++ b/arch/ia64/configs/sn2_defconfig @@ -78,7 +78,7 @@ CONFIG_IA64_L1_CACHE_SHIFT=7 CONFIG_NUMA=y CONFIG_VIRTUAL_MEM_MAP=y CONFIG_HOLES_IN_ZONE=y -CONFIG_DISCONTIGMEM=y +CONFIG_ARCH_DISCONTIGMEM_ENABLE=y # CONFIG_IA64_CYCLONE is not set CONFIG_IOSAPIC=y CONFIG_IA64_SGI_SN_SIM=y diff --git a/arch/ia64/defconfig b/arch/ia64/defconfig index 7be8096e0561..8444add76380 100644 --- a/arch/ia64/defconfig +++ b/arch/ia64/defconfig @@ -84,7 +84,7 @@ CONFIG_IA64_L1_CACHE_SHIFT=7 CONFIG_NUMA=y CONFIG_VIRTUAL_MEM_MAP=y CONFIG_HOLES_IN_ZONE=y -CONFIG_DISCONTIGMEM=y +CONFIG_ARCH_DISCONTIGMEM_ENABLE=y CONFIG_IA64_CYCLONE=y CONFIG_IOSAPIC=y CONFIG_FORCE_MAX_ZONEORDER=18 diff --git a/arch/mips/configs/ip27_defconfig b/arch/mips/configs/ip27_defconfig index 13472292d0ec..b5bab3a42fc4 100644 --- a/arch/mips/configs/ip27_defconfig +++ b/arch/mips/configs/ip27_defconfig @@ -82,7 +82,7 @@ CONFIG_STOP_MACHINE=y # CONFIG_SGI_IP22 is not set CONFIG_SGI_IP27=y # CONFIG_SGI_SN0_N_MODE is not set -CONFIG_DISCONTIGMEM=y 
+CONFIG_ARCH_DISCONTIGMEM_ENABLE=y CONFIG_NUMA=y # CONFIG_MAPPED_KERNEL is not set # CONFIG_REPLICATE_KTEXT is not set diff --git a/arch/ppc64/configs/pSeries_defconfig b/arch/ppc64/configs/pSeries_defconfig index 3eb5ef25d3a3..d0db8b5966c0 100644 --- a/arch/ppc64/configs/pSeries_defconfig +++ b/arch/ppc64/configs/pSeries_defconfig @@ -88,7 +88,7 @@ CONFIG_IBMVIO=y CONFIG_IOMMU_VMERGE=y CONFIG_SMP=y CONFIG_NR_CPUS=128 -CONFIG_DISCONTIGMEM=y +CONFIG_ARCH_DISCONTIGMEM_ENABLE=y CONFIG_NUMA=y CONFIG_SCHED_SMT=y # CONFIG_PREEMPT is not set diff --git a/arch/ppc64/defconfig b/arch/ppc64/defconfig index 2f31bf3046f9..b8e2066dde77 100644 --- a/arch/ppc64/defconfig +++ b/arch/ppc64/defconfig @@ -89,7 +89,7 @@ CONFIG_BOOTX_TEXT=y CONFIG_IOMMU_VMERGE=y CONFIG_SMP=y CONFIG_NR_CPUS=32 -CONFIG_DISCONTIGMEM=y +CONFIG_ARCH_DISCONTIGMEM_ENABLE=y # CONFIG_NUMA is not set # CONFIG_SCHED_SMT is not set # CONFIG_PREEMPT is not set From 93b7504e3e6c1d98586854806e51bea329ea3aa9 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:47 -0700 Subject: [PATCH 044/199] [PATCH] Introduce new Kconfig option for NUMA or DISCONTIG There is some confusion that arose when working on SPARSEMEM patch between what is needed for DISCONTIG vs. NUMA. Multiple pg_data_t's are needed for DISCONTIGMEM or NUMA, independently. All of the current NUMA implementations require an implementation of DISCONTIG. Because of this, quite a lot of code which is really needed for NUMA is actually under DISCONTIG #ifdefs. For SPARSEMEM, we changed some of these #ifdefs to CONFIG_NUMA, but that broke the DISCONTIG=y and NUMA=n case. Introducing this new NEED_MULTIPLE_NODES config option allows code that is needed for both NUMA or DISCONTIG to be separated out from code that is specific to DISCONTIG. One great advantage of this approach is that it doesn't require every architecture to be converted over. All of the current implementations should "just work", only the ones implementing SPARSEMEM will have to be fixed up. The change to free_area_init() makes it work inside, or out of the new config option. Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 +++--- mm/Kconfig | 8 ++++++++ mm/page_alloc.c | 6 +++--- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 39e912708e2a..95f4a780ea66 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -402,7 +402,7 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, /* Returns the number of the current Node. */ #define numa_node_id() (cpu_to_node(raw_smp_processor_id())) -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_NEED_MULTIPLE_NODES extern struct pglist_data contig_page_data; #define NODE_DATA(nid) (&contig_page_data) @@ -410,11 +410,11 @@ extern struct pglist_data contig_page_data; #define MAX_NODES_SHIFT 1 #define pfn_to_nid(pfn) (0) -#else /* CONFIG_DISCONTIGMEM */ +#else /* CONFIG_NEED_MULTIPLE_NODES */ #include -#endif /* !CONFIG_DISCONTIGMEM */ +#endif /* !CONFIG_NEED_MULTIPLE_NODES */ #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) /* diff --git a/mm/Kconfig b/mm/Kconfig index 69caa9d8674e..15c131393639 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -23,3 +23,11 @@ config DISCONTIGMEM endchoice +# +# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's +# to represent different areas of memory. This variable allows +# those dependencies to exist individually. 
+# +config NEED_MULTIPLE_NODES + def_bool y + depends on DISCONTIGMEM || NUMA diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1958358e29b0..20e239599db0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1972,18 +1972,18 @@ void __init free_area_init_node(int nid, struct pglist_data *pgdat, free_area_init_core(pgdat, zones_size, zholes_size); } -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_NEED_MULTIPLE_NODES static bootmem_data_t contig_bootmem_data; struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; EXPORT_SYMBOL(contig_page_data); +#endif void __init free_area_init(unsigned long *zones_size) { - free_area_init_node(0, &contig_page_data, zones_size, + free_area_init_node(0, NODE_DATA(0), zones_size, __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); } -#endif #ifdef CONFIG_PROC_FS From 44d0f805c77902a22dda244fd092b4567066b2b9 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:47 -0700 Subject: [PATCH 045/199] [PATCH] sparsemem: fix minor "defaults" issue in mm/Kconfig The following patch applies on top of 2.6.12-rc2-mm1. It fixes a minor user interaction issue, and an early reference to SPARSEMEM. This "choice" menu would always default to FLATMEM, as it was listed first. Move it to the end so that the other defaults have a chance first. Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index 15c131393639..0320f066228c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1,8 +1,7 @@ choice prompt "Memory model" - default FLATMEM - default SPARSEMEM if ARCH_SPARSEMEM_DEFAULT default DISCONTIGMEM if ARCH_DISCONTIGMEM_DEFAULT + default FLATMEM config FLATMEM bool "Flat Memory" From 074ccf8016b61f4b40066f8d737ab31e17a6afd1 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:48 -0700 Subject: [PATCH 046/199] [PATCH] mm/Kconfig: kill unused ARCH_FLATMEM_DISABLE This used to be used to disable FLATMEM selection, but I decided to change it to be done generically when DISCONTIG is enabled. The option is unused, so this kills it. Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/Kconfig | 4 ---- arch/parisc/Kconfig | 4 ---- arch/sh/Kconfig | 4 ---- 3 files changed, 12 deletions(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 2710018ac809..94f5a8eb2c22 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -501,10 +501,6 @@ config ARCH_DISCONTIGMEM_ENABLE or have huge holes in the physical address space for other reasons. See for more. -config ARCH_FLATMEM_DISABLE - def_bool y - depends on ARCH_DISCONTIGMEM_ENABLE - config NUMA bool "NUMA Support" depends on SGI_IP27 diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index ab7cc4eb7fc1..ce327c799b44 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -157,10 +157,6 @@ config ARCH_DISCONTIGMEM_ENABLE or have huge holes in the physical address space for other reasons. See for more. -config ARCH_FLATMEM_DISABLE - def_bool y - depends on ARCH_DISCONTIGMEM_ENABLE - source "mm/Kconfig" config PREEMPT diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index ac2b865359b2..a7c8bfc11604 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -496,10 +496,6 @@ config ARCH_DISCONTIGMEM_ENABLE or have huge holes in the physical address space for other reasons. See for more. 
-config ARCH_FLATMEM_DISABLE - def_bool y - depends on ARCH_DISCONTIGMEM_ENABLE - source "mm/Kconfig" config ZERO_PAGE_OFFSET From e1785e85b9c81c67b581b511ee4efac6c81e9edb Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:49 -0700 Subject: [PATCH 047/199] [PATCH] mm/Kconfig: hide "Memory Model" selection menu I got some feedback from users who think that the new "Memory Model" menu is a little invasive. This patch will hide that menu, except when CONFIG_EXPERIMENTAL is enabled *or* when an individual architecture wants it. An individual arch may want to enable it because they've removed their arch-specific DISCONTIG prompt in favor of the mm/Kconfig one. Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index 0320f066228c..7d3a0054cba1 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1,9 +1,14 @@ +config SELECT_MEMORY_MODEL + def_bool y + depends on EXPERIMENTAL || ARCH_SELECT_MEMORY_MODEL + choice prompt "Memory model" - default DISCONTIGMEM if ARCH_DISCONTIGMEM_DEFAULT - default FLATMEM + depends on SELECT_MEMORY_MODEL + default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT + default FLATMEM_MANUAL -config FLATMEM +config FLATMEM_MANUAL bool "Flat Memory" depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE help @@ -14,7 +19,7 @@ config FLATMEM If unsure, choose this option over any other. -config DISCONTIGMEM +config DISCONTIGMEM_MANUAL bool "Discontigious Memory" depends on ARCH_DISCONTIGMEM_ENABLE help @@ -22,6 +27,14 @@ config DISCONTIGMEM endchoice +config DISCONTIGMEM + def_bool y + depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL + +config FLATMEM + def_bool y + depends on !DISCONTIGMEM || FLATMEM_MANUAL + # # Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's # to represent different areas of memory. This variable allows From 785dcd44b60ec8ede76fed0af54333ab5f3e848c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:50 -0700 Subject: [PATCH 048/199] [PATCH] mm/Kconfig: give DISCONTIG more help text This gives DISCONTIGMEM a bit more help text to explain what it does, not just when to choose it. Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mm/Kconfig b/mm/Kconfig index 7d3a0054cba1..8283685f3bb3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -23,6 +23,16 @@ config DISCONTIGMEM_MANUAL bool "Discontigious Memory" depends on ARCH_DISCONTIGMEM_ENABLE help + This option provides enhanced support for discontiguous + memory systems, over FLATMEM. These systems have holes + in their physical address spaces, and this option provides + more efficient handling of these holes. However, the vast + majority of hardware has quite flat address spaces, and + can have degraded performance from extra overhead that + this option imposes. + + Many NUMA configurations will have this as the only option. + If unsure, choose "Flat Memory" over this option. endchoice From 368a0a3afad08069b2679ecaff80fc18c10a6e2a Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 23 Jun 2005 00:07:51 -0700 Subject: [PATCH 049/199] [PATCH] ppc64: Kconfig memory models This patch changes some of the default behavior in the ppc64 Kconfig file that was recently changed/added to 2.6.12-rc2-mm1 by Dave Hansen in preparation for SPARSEMEM. 
Patch allows the display of both FLAT and DISCONTIG models on pseries. As before, default is DISCONTIG for SMP and PSERIES and FLAT for others. Signed-off-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc64/Kconfig | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig index 87344f324233..011b5c0bf1d0 100644 --- a/arch/ppc64/Kconfig +++ b/arch/ppc64/Kconfig @@ -199,9 +199,16 @@ config HMT pSeries systems p620 and p660 have such a cpu type. config ARCH_DISCONTIGMEM_ENABLE - bool "Discontiguous Memory Support" + def_bool y depends on SMP && PPC_PSERIES +config ARCH_DISCONTIGMEM_DEFAULT + def_bool y + depends on ARCH_DISCONTIGMEM_ENABLE + +config ARCH_FLATMEM_ENABLE + def_bool y + source "mm/Kconfig" config NUMA From b159d43fbf7eaaac6ecc647f51cf4257332db47b Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:07:52 -0700 Subject: [PATCH 050/199] [PATCH] generify early_pfn_to_nid Provide a default implementation for early_pfn_to_nid returning node 0. Allow architectures to override this with their own implementation out of asm/mmzone.h. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Martin Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig | 4 ++++ include/asm-i386/mmzone.h | 3 +++ include/linux/mmzone.h | 4 ++++ 3 files changed, 11 insertions(+) diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 8e5242c8e09d..a8128f997339 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -810,6 +810,10 @@ config HAVE_ARCH_ALLOC_REMAP source "mm/Kconfig" +config HAVE_ARCH_EARLY_PFN_TO_NID + bool + default y + config HIGHPTE bool "Allocate 3rd-level pagetables from highmem" depends on HIGHMEM4G || HIGHMEM64G diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h index 9cec191f462c..48e46d403aa6 100644 --- a/include/asm-i386/mmzone.h +++ b/include/asm-i386/mmzone.h @@ -143,4 +143,7 @@ static inline void get_memcfg_numa(void) } #endif /* CONFIG_DISCONTIGMEM */ + +extern int early_pfn_to_nid(unsigned long pfn); + #endif /* _ASM_MMZONE_H_ */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 95f4a780ea66..6ef07de98d69 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -435,6 +435,10 @@ extern struct pglist_data contig_page_data; #endif +#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +#define early_pfn_to_nid(nid) (0UL) +#endif + #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _LINUX_MMZONE_H */ From af705362ab6018071310c5fcd436a6b457517d5f Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:07:53 -0700 Subject: [PATCH 051/199] [PATCH] generify memory present Allow architectures to indicate that they will be providing hooks to indice installed memory areas, memory_present(). Provide prototypes for the i386 implementation. 
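As an illustration of how an architecture consumes the hook (a sketch, not code from this patch; the function name is made up), a NUMA port that has already discovered its per-node pfn ranges — the i386 discontig code keeps them in node_start_pfn[] and node_end_pfn[] — would register them once during early boot:

	/*
	 * Illustrative only: register each node's installed memory
	 * with the generic memory_present(nid, start_pfn, end_pfn) hook.
	 */
	static void __init example_register_memory(void)
	{
		int nid;

		for_each_online_node(nid)
			memory_present(nid, node_start_pfn[nid],
				       node_end_pfn[nid]);
	}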
Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Martin Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig | 2 +- mm/Kconfig | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index a8128f997339..3b7248126d29 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -793,7 +793,7 @@ config HAVE_ARCH_BOOTMEM_NODE depends on NUMA default y -config HAVE_MEMORY_PRESENT +config ARCH_HAVE_MEMORY_PRESENT bool depends on DISCONTIGMEM default y diff --git a/mm/Kconfig b/mm/Kconfig index 8283685f3bb3..5127441561b4 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -53,3 +53,7 @@ config FLATMEM config NEED_MULTIPLE_NODES def_bool y depends on DISCONTIGMEM || NUMA + +config HAVE_MEMORY_PRESENT + def_bool y + depends on ARCH_HAVE_MEMORY_PRESENT From d41dee369bff3b9dcb6328d4d822926c28cc2594 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:07:54 -0700 Subject: [PATCH 052/199] [PATCH] sparsemem memory model Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of mem_map[] is needed by discontiguous memory machines (like in the old CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually become a complete replacement. A significant advantage over DISCONTIGMEM is that it's completely separated from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA and DISCONTIG are often confused. Another advantage is that sparse doesn't require each NUMA node's ranges to be contiguous. It can handle overlapping ranges between nodes with no problems, where DISCONTIGMEM currently throws away that memory. Sparsemem uses an array to provide different pfn_to_page() translations for each SECTION_SIZE area of physical memory. This is what allows the mem_map[] to be chopped up. In order to do quick pfn_to_page() operations, the section number of the page is encoded in page->flags. Part of the sparsemem infrastructure enables sharing of these bits more dynamically (at compile-time) between the page_zone() and sparsemem operations. However, on 32-bit architectures, the number of bits is quite limited, and may require growing the size of the page->flags type in certain conditions. Several things might force this to occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of memory), an increase in the physical address space, or an increase in the number of used page->flags. One thing to note is that, once sparsemem is present, the NUMA node information no longer needs to be stored in the page->flags. It might provide speed increases on certain platforms and will be stored there if there is room. But, if out of room, an alternate (theoretically slower) mechanism is used. This patch introduces CONFIG_FLATMEM. It is used in almost all cases where there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM often have to compile out the same areas of code. 
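The core lookup trick is easy to miss in the macros, so here it is restated as a sketch (the function name is invented; the patch itself implements this as the pfn_to_page() macro in the mmzone.h hunk below). sparse_init() stores each section's mem_map pre-biased by the section's starting pfn, so a single add resolves any pfn in that section:

	static inline struct page *sparse_pfn_to_page(unsigned long pfn)
	{
		/* pfn >> PFN_SECTION_SHIFT == pfn_to_section_nr(pfn) */
		struct mem_section *ms = &mem_section[pfn >> PFN_SECTION_SHIFT];

		/*
		 * section_mem_map was stored as (map - first pfn of the
		 * section), so indexing with the full pfn is correct.
		 */
		return ms->section_mem_map + pfn;
	}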
Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Martin Bligh Signed-off-by: Adrian Bunk Signed-off-by: Yasunori Goto Signed-off-by: Bob Picco Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig | 1 + include/linux/mm.h | 92 ++++++++++++++++++++++++++++++++-------- include/linux/mmzone.h | 96 ++++++++++++++++++++++++++++++++++++++++++ include/linux/numa.h | 2 +- mm/Kconfig | 38 +++++++++++++++-- mm/Makefile | 1 + mm/bootmem.c | 9 +++- mm/memory.c | 2 +- mm/page_alloc.c | 39 +++++++++++++---- mm/sparse.c | 85 +++++++++++++++++++++++++++++++++++++ 10 files changed, 332 insertions(+), 33 deletions(-) create mode 100644 mm/sparse.c diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 3b7248126d29..f0064b5e3702 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -813,6 +813,7 @@ source "mm/Kconfig" config HAVE_ARCH_EARLY_PFN_TO_NID bool default y + depends on NUMA config HIGHPTE bool "Allocate 3rd-level pagetables from highmem" diff --git a/include/linux/mm.h b/include/linux/mm.h index 57b2ead51dba..6eb7f48317f8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -397,40 +397,80 @@ static inline void put_page(struct page *page) * sets it, so none of the operations on it need to be atomic. */ -/* Page flags: | NODE | ZONE | ... | FLAGS | */ -#define NODES_PGOFF ((sizeof(page_flags_t)*8) - NODES_SHIFT) -#define ZONES_PGOFF (NODES_PGOFF - ZONES_SHIFT) + +/* + * page->flags layout: + * + * There are three possibilities for how page->flags get + * laid out. The first is for the normal case, without + * sparsemem. The second is for sparsemem when there is + * plenty of space for node and section. The last is when + * we have run out of space and have to fall back to an + * alternate (slower) way of determining the node. + * + * No sparsemem: | NODE | ZONE | ... | FLAGS | + * with space for node: | SECTION | NODE | ZONE | ... | FLAGS | + * no space for node: | SECTION | ZONE | ... | FLAGS | + */ +#ifdef CONFIG_SPARSEMEM +#define SECTIONS_WIDTH SECTIONS_SHIFT +#else +#define SECTIONS_WIDTH 0 +#endif + +#define ZONES_WIDTH ZONES_SHIFT + +#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= FLAGS_RESERVED +#define NODES_WIDTH NODES_SHIFT +#else +#define NODES_WIDTH 0 +#endif + +/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */ +#define SECTIONS_PGOFF ((sizeof(page_flags_t)*8) - SECTIONS_WIDTH) +#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) +#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) + +/* + * We are going to use the flags for the page to node mapping if its in + * there. This includes the case where there is no node, so it is implicit. + */ +#define FLAGS_HAS_NODE (NODES_WIDTH > 0 || NODES_SHIFT == 0) + +#ifndef PFN_SECTION_SHIFT +#define PFN_SECTION_SHIFT 0 +#endif /* * Define the bit shifts to access each section. For non-existant * sections we define the shift as 0; that plus a 0 mask ensures * the compiler will optimise away reference to them. */ -#define NODES_PGSHIFT (NODES_PGOFF * (NODES_SHIFT != 0)) -#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_SHIFT != 0)) +#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) +#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) +#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -/* NODE:ZONE is used to lookup the zone from a page. */ +/* NODE:ZONE or SECTION:ZONE is used to lookup the zone from a page. 
*/ +#if FLAGS_HAS_NODE #define ZONETABLE_SHIFT (NODES_SHIFT + ZONES_SHIFT) +#else +#define ZONETABLE_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) +#endif #define ZONETABLE_PGSHIFT ZONES_PGSHIFT -#if NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED -#error NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED +#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED +#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED #endif -#define NODEZONE(node, zone) ((node << ZONES_SHIFT) | zone) - -#define ZONES_MASK ((1UL << ZONES_SHIFT) - 1) -#define NODES_MASK ((1UL << NODES_SHIFT) - 1) +#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) +#define NODES_MASK ((1UL << NODES_WIDTH) - 1) +#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) #define ZONETABLE_MASK ((1UL << ZONETABLE_SHIFT) - 1) static inline unsigned long page_zonenum(struct page *page) { return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } -static inline unsigned long page_to_nid(struct page *page) -{ - return (page->flags >> NODES_PGSHIFT) & NODES_MASK; -} struct zone; extern struct zone *zone_table[]; @@ -441,6 +481,18 @@ static inline struct zone *page_zone(struct page *page) ZONETABLE_MASK]; } +static inline unsigned long page_to_nid(struct page *page) +{ + if (FLAGS_HAS_NODE) + return (page->flags >> NODES_PGSHIFT) & NODES_MASK; + else + return page_zone(page)->zone_pgdat->node_id; +} +static inline unsigned long page_to_section(struct page *page) +{ + return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; +} + static inline void set_page_zone(struct page *page, unsigned long zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); @@ -451,12 +503,18 @@ static inline void set_page_node(struct page *page, unsigned long node) page->flags &= ~(NODES_MASK << NODES_PGSHIFT); page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; } +static inline void set_page_section(struct page *page, unsigned long section) +{ + page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); + page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; +} static inline void set_page_links(struct page *page, unsigned long zone, - unsigned long node) + unsigned long node, unsigned long pfn) { set_page_zone(page, zone); set_page_node(page, node); + set_page_section(page, pfn_to_section_nr(pfn)); } #ifndef CONFIG_DISCONTIGMEM diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6ef07de98d69..19860d317ec2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -269,7 +269,9 @@ typedef struct pglist_data { struct zone node_zones[MAX_NR_ZONES]; struct zonelist node_zonelists[GFP_ZONETYPES]; int nr_zones; +#ifdef CONFIG_FLAT_NODE_MEM_MAP struct page *node_mem_map; +#endif struct bootmem_data *bdata; unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ @@ -284,7 +286,11 @@ typedef struct pglist_data { #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) +#ifdef CONFIG_FLAT_NODE_MEM_MAP #define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) +#else +#define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr)) +#endif #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) extern struct pglist_data *pgdat_list; @@ -416,6 +422,10 @@ extern struct pglist_data contig_page_data; #endif /* !CONFIG_NEED_MULTIPLE_NODES */ +#ifdef CONFIG_SPARSEMEM +#include +#endif + #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) /* * with 32 bit page->flags field, we reserve 8 bits for 
node/zone info. @@ -439,6 +449,92 @@ extern struct pglist_data contig_page_data; #define early_pfn_to_nid(nid) (0UL) #endif +#define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT) +#define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT) + +#ifdef CONFIG_SPARSEMEM + +/* + * SECTION_SHIFT #bits space required to store a section # + * + * PA_SECTION_SHIFT physical address to/from section number + * PFN_SECTION_SHIFT pfn to/from section number + */ +#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) + +#define PA_SECTION_SHIFT (SECTION_SIZE_BITS) +#define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) + +#define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) + +#define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) +#define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) + +#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS +#error Allocator MAX_ORDER exceeds SECTION_SIZE +#endif + +struct page; +struct mem_section { + struct page *section_mem_map; +}; + +extern struct mem_section mem_section[NR_MEM_SECTIONS]; + +/* + * Given a kernel address, find the home node of the underlying memory. + */ +#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) + +static inline struct mem_section *__pfn_to_section(unsigned long pfn) +{ + return &mem_section[pfn_to_section_nr(pfn)]; +} + +#define pfn_to_page(pfn) \ +({ \ + unsigned long __pfn = (pfn); \ + __pfn_to_section(__pfn)->section_mem_map + __pfn; \ +}) +#define page_to_pfn(page) \ +({ \ + page - mem_section[page_to_section(page)].section_mem_map; \ +}) + +static inline int pfn_valid(unsigned long pfn) +{ + if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) + return 0; + return mem_section[pfn_to_section_nr(pfn)].section_mem_map != 0; +} + +/* + * These are _only_ used during initialisation, therefore they + * can use __initdata ... They could have names to indicate + * this restriction. + */ +#ifdef CONFIG_NUMA +#define pfn_to_nid early_pfn_to_nid +#endif + +#define pfn_to_pgdat(pfn) \ +({ \ + NODE_DATA(pfn_to_nid(pfn)); \ +}) + +#define early_pfn_valid(pfn) pfn_valid(pfn) +void sparse_init(void); +#else +#define sparse_init() do {} while (0) +#endif /* CONFIG_SPARSEMEM */ + +#ifndef early_pfn_valid +#define early_pfn_valid(pfn) (1) +#endif + +void memory_present(int nid, unsigned long start, unsigned long end); +unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); + #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _LINUX_MMZONE_H */ diff --git a/include/linux/numa.h b/include/linux/numa.h index bd0c8c4e9a95..f0c539bd3cfc 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -3,7 +3,7 @@ #include -#ifdef CONFIG_DISCONTIGMEM +#ifndef CONFIG_FLATMEM #include #endif diff --git a/mm/Kconfig b/mm/Kconfig index 5127441561b4..cd379936cac6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -6,6 +6,7 @@ choice prompt "Memory model" depends on SELECT_MEMORY_MODEL default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT + default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT default FLATMEM_MANUAL config FLATMEM_MANUAL @@ -17,7 +18,15 @@ config FLATMEM_MANUAL only have one option here: FLATMEM. This is normal and a correct option. - If unsure, choose this option over any other. + Some users of more advanced features like NUMA and + memory hotplug may have different options here. + DISCONTIGMEM is an more mature, better tested system, + but is incompatible with memory hotplug and may suffer + decreased performance over SPARSEMEM. 
If unsure between + "Sparse Memory" and "Discontiguous Memory", choose + "Discontiguous Memory". + + If unsure, choose this option (Flat Memory) over any other. config DISCONTIGMEM_MANUAL bool "Discontigious Memory" @@ -35,15 +44,38 @@ config DISCONTIGMEM_MANUAL If unsure, choose "Flat Memory" over this option. +config SPARSEMEM_MANUAL + bool "Sparse Memory" + depends on ARCH_SPARSEMEM_ENABLE + help + This will be the only option for some systems, including + memory hotplug systems. This is normal. + + For many other systems, this will be an alternative to + "Discontigious Memory". This option provides some potential + performance benefits, along with decreased code complexity, + but it is newer, and more experimental. + + If unsure, choose "Discontiguous Memory" or "Flat Memory" + over this option. + endchoice config DISCONTIGMEM def_bool y depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL +config SPARSEMEM + def_bool y + depends on SPARSEMEM_MANUAL + config FLATMEM def_bool y - depends on !DISCONTIGMEM || FLATMEM_MANUAL + depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL + +config FLAT_NODE_MEM_MAP + def_bool y + depends on !SPARSEMEM # # Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's @@ -56,4 +88,4 @@ config NEED_MULTIPLE_NODES config HAVE_MEMORY_PRESENT def_bool y - depends on ARCH_HAVE_MEMORY_PRESENT + depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM diff --git a/mm/Makefile b/mm/Makefile index 097408064f6a..8f70ffd763c8 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -15,6 +15,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o +obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SHMEM) += shmem.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o diff --git a/mm/bootmem.c b/mm/bootmem.c index 260e703850d8..f82f7aebbee3 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -256,6 +256,7 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) { struct page *page; + unsigned long pfn; bootmem_data_t *bdata = pgdat->bdata; unsigned long i, count, total = 0; unsigned long idx; @@ -266,7 +267,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) count = 0; /* first extant page of the node */ - page = virt_to_page(phys_to_virt(bdata->node_boot_start)); + pfn = bdata->node_boot_start >> PAGE_SHIFT; idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); map = bdata->node_bootmem_map; /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */ @@ -275,9 +276,11 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) gofast = 1; for (i = 0; i < idx; ) { unsigned long v = ~map[i / BITS_PER_LONG]; + if (gofast && v == ~0UL) { int j, order; + page = pfn_to_page(pfn); count += BITS_PER_LONG; __ClearPageReserved(page); order = ffs(BITS_PER_LONG) - 1; @@ -292,6 +295,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) page += BITS_PER_LONG; } else if (v) { unsigned long m; + + page = pfn_to_page(pfn); for (m = 1; m && i < idx; m<<=1, page++, i++) { if (v & m) { count++; @@ -302,8 +307,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) } } else { i+=BITS_PER_LONG; - page += BITS_PER_LONG; } + pfn += BITS_PER_LONG; } total += count; diff --git a/mm/memory.c b/mm/memory.c index da91b7bf9986..30975ef48722 100644 --- a/mm/memory.c +++ b/mm/memory.c 
@@ -58,7 +58,7 @@ #include #include -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; struct page *mem_map; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 20e239599db0..5c1b8982a6da 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -68,7 +68,7 @@ EXPORT_SYMBOL(nr_swap_pages); * Used by page_zone() to look up the address of the struct zone whose * id is encoded in the upper bits of page->flags */ -struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)]; +struct zone *zone_table[1 << ZONETABLE_SHIFT]; EXPORT_SYMBOL(zone_table); static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; @@ -1649,11 +1649,15 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat, void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { - struct page *start = pfn_to_page(start_pfn); struct page *page; + int end_pfn = start_pfn + size; + int pfn; - for (page = start; page < (start + size); page++) { - set_page_links(page, zone, nid); + for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { + if (!early_pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + set_page_links(page, zone, nid, pfn); set_page_count(page, 0); reset_page_mapcount(page); SetPageReserved(page); @@ -1677,6 +1681,20 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, } } +#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) +void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, + unsigned long size) +{ + unsigned long snum = pfn_to_section_nr(pfn); + unsigned long end = pfn_to_section_nr(pfn + size); + + if (FLAGS_HAS_NODE) + zone_table[ZONETABLE_INDEX(nid, zid)] = zone; + else + for (; snum <= end; snum++) + zone_table[ZONETABLE_INDEX(snum, zid)] = zone; +} + #ifndef __HAVE_ARCH_MEMMAP_INIT #define memmap_init(size, nid, zone, start_pfn) \ memmap_init_zone((size), (nid), (zone), (start_pfn)) @@ -1861,7 +1879,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat, unsigned long size, realsize; unsigned long batch; - zone_table[NODEZONE(nid, j)] = zone; realsize = size = zones_size[j]; if (zholes_size) realsize -= zholes_size[j]; @@ -1927,6 +1944,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat, memmap_init(size, nid, j, zone_start_pfn); + zonetable_add(zone, nid, j, zone_start_pfn, size); + zone_start_pfn += size; zone_init_free_lists(pgdat, zone, zone->spanned_pages); @@ -1935,28 +1954,30 @@ static void __init free_area_init_core(struct pglist_data *pgdat, static void __init alloc_node_mem_map(struct pglist_data *pgdat) { - unsigned long size; - struct page *map; - /* Skip empty nodes */ if (!pgdat->node_spanned_pages) return; +#ifdef CONFIG_FLAT_NODE_MEM_MAP /* ia64 gets its own node_mem_map, before this, without bootmem */ if (!pgdat->node_mem_map) { + unsigned long size; + struct page *map; + size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); map = alloc_remap(pgdat->node_id, size); if (!map) map = alloc_bootmem_node(pgdat, size); pgdat->node_mem_map = map; } -#ifndef CONFIG_DISCONTIGMEM +#ifdef CONFIG_FLATMEM /* * With no DISCONTIG, the global mem_map is just set as node 0's */ if (pgdat == NODE_DATA(0)) mem_map = NODE_DATA(0)->node_mem_map; #endif +#endif /* CONFIG_FLAT_NODE_MEM_MAP */ } void __init free_area_init_node(int nid, struct pglist_data *pgdat, diff --git a/mm/sparse.c b/mm/sparse.c new file mode 100644 index 
000000000000..f888385b9e14 --- /dev/null +++ b/mm/sparse.c @@ -0,0 +1,85 @@ +/* + * sparse memory mappings. + */ +#include +#include +#include +#include +#include +#include + +/* + * Permanent SPARSEMEM data: + * + * 1) mem_section - memory sections, mem_map's for valid memory + */ +struct mem_section mem_section[NR_MEM_SECTIONS]; +EXPORT_SYMBOL(mem_section); + +/* Record a memory area against a node. */ +void memory_present(int nid, unsigned long start, unsigned long end) +{ + unsigned long pfn; + + start &= PAGE_SECTION_MASK; + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { + unsigned long section = pfn_to_section_nr(pfn); + if (!mem_section[section].section_mem_map) + mem_section[section].section_mem_map = (void *) -1; + } +} + +/* + * Only used by the i386 NUMA architecures, but relatively + * generic code. + */ +unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long pfn; + unsigned long nr_pages = 0; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + if (nid != early_pfn_to_nid(pfn)) + continue; + + if (pfn_valid(pfn)) + nr_pages += PAGES_PER_SECTION; + } + + return nr_pages * sizeof(struct page); +} + +/* + * Allocate the accumulated non-linear sections, allocate a mem_map + * for each and record the physical to section mapping. + */ +void sparse_init(void) +{ + unsigned long pnum; + struct page *map; + int nid; + + for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + if (!mem_section[pnum].section_mem_map) + continue; + + nid = early_pfn_to_nid(section_nr_to_pfn(pnum)); + map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); + if (!map) + map = alloc_bootmem_node(NODE_DATA(nid), + sizeof(struct page) * PAGES_PER_SECTION); + if (!map) { + mem_section[pnum].section_mem_map = 0; + continue; + } + + /* + * Subtle, we encode the real pfn into the mem_map such that + * the identity pfn - section_mem_map will return the actual + * physical page frame number. + */ + mem_section[pnum].section_mem_map = map - + section_nr_to_pfn(pnum); + } +} From 05b79bdcb48c18cd9b580c39e3efb9a1ab078151 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:07:57 -0700 Subject: [PATCH 053/199] [PATCH] sparsemem memory model for i386 Provide the architecture specific implementation for SPARSEMEM for i386 SMP and NUMA systems. 
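The space cost is small; the numbers below are worked examples consistent with a 32-bit physical address space, not a claim about the exact constants the new include/asm-i386/sparsemem.h picks:

	/* Example sizing, illustration only: 64MB sections,
	 * 32 physical address bits. */
	#define EX_SECTION_SIZE_BITS	26	/* 64MB per section */
	#define EX_MAX_PHYSMEM_BITS	32	/* 4GB physical */
	#define EX_SECTIONS_SHIFT	(EX_MAX_PHYSMEM_BITS - EX_SECTION_SIZE_BITS)
	#define EX_NR_MEM_SECTIONS	(1UL << EX_SECTIONS_SHIFT)	/* 64 */

With PAE-style 36-bit physical addressing the same section size would give 1024 sections, still just one pointer apiece in the static mem_section[] array.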
Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Martin Bligh Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig | 24 +++++++--- arch/i386/kernel/setup.c | 8 ++-- arch/i386/mm/Makefile | 2 +- arch/i386/mm/discontig.c | 36 ++++++++------- arch/i386/mm/init.c | 18 ++++---- include/asm-i386/mmzone.h | 89 +++++++++++++++++++----------------- include/asm-i386/page.h | 4 +- include/asm-i386/pgtable.h | 4 +- include/asm-i386/sparsemem.h | 31 +++++++++++++ 9 files changed, 135 insertions(+), 81 deletions(-) create mode 100644 include/asm-i386/sparsemem.h diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index f0064b5e3702..bfdcedef06e1 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -68,7 +68,6 @@ config X86_VOYAGER config X86_NUMAQ bool "NUMAQ (IBM/Sequent)" - select DISCONTIGMEM select NUMA help This option is used for getting Linux to run on a (IBM/Sequent) NUMA @@ -783,11 +782,6 @@ comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support" comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI) -config ARCH_DISCONTIGMEM_ENABLE - bool - depends on NUMA - default y - config HAVE_ARCH_BOOTMEM_NODE bool depends on NUMA @@ -800,7 +794,7 @@ config ARCH_HAVE_MEMORY_PRESENT config NEED_NODE_MEMMAP_SIZE bool - depends on DISCONTIGMEM + depends on DISCONTIGMEM || SPARSEMEM default y config HAVE_ARCH_ALLOC_REMAP @@ -808,6 +802,22 @@ config HAVE_ARCH_ALLOC_REMAP depends on NUMA default y +config ARCH_DISCONTIGMEM_ENABLE + def_bool y + depends on NUMA + +config ARCH_DISCONTIGMEM_DEFAULT + def_bool y + depends on NUMA + +config ARCH_SPARSEMEM_ENABLE + def_bool y + depends on NUMA + +config ARCH_SELECT_MEMORY_MODEL + def_bool y + depends on ARCH_SPARSEMEM_ENABLE + source "mm/Kconfig" config HAVE_ARCH_EARLY_PFN_TO_NID diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 2bfbddebdbf8..0d689335c8d6 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -1022,7 +1023,7 @@ static void __init reserve_ebda_region(void) reserve_bootmem(addr, PAGE_SIZE); } -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_NEED_MULTIPLE_NODES void __init setup_bootmem_allocator(void); static unsigned long __init setup_memory(void) { @@ -1072,9 +1073,9 @@ void __init zone_sizes_init(void) free_area_init(zones_size); } #else -extern unsigned long setup_memory(void); +extern unsigned long __init setup_memory(void); extern void zone_sizes_init(void); -#endif /* !CONFIG_DISCONTIGMEM */ +#endif /* !CONFIG_NEED_MULTIPLE_NODES */ void __init setup_bootmem_allocator(void) { @@ -1475,6 +1476,7 @@ void __init setup_arch(char **cmdline_p) #endif paging_init(); remapped_pgdat_init(); + sparse_init(); zone_sizes_init(); /* diff --git a/arch/i386/mm/Makefile b/arch/i386/mm/Makefile index fc3272506846..80908b5aa60f 100644 --- a/arch/i386/mm/Makefile +++ b/arch/i386/mm/Makefile @@ -4,7 +4,7 @@ obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o mmap.o -obj-$(CONFIG_DISCONTIGMEM) += discontig.o +obj-$(CONFIG_NUMA) += discontig.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c index 088ca4722183..0efeb96ba5d4 100644 --- a/arch/i386/mm/discontig.c +++ b/arch/i386/mm/discontig.c @@ -42,12 +42,16 @@ bootmem_data_t node0_bdata; * populated the following 
initialisation. * * 1) node_online_map - the map of all nodes configured (online) in the system - * 2) physnode_map - the mapping between a pfn and owning node - * 3) node_start_pfn - the starting page frame number for a node + * 2) node_start_pfn - the starting page frame number for a node * 3) node_end_pfn - the ending page fram number for a node */ +unsigned long node_start_pfn[MAX_NUMNODES]; +unsigned long node_end_pfn[MAX_NUMNODES]; + +#ifdef CONFIG_DISCONTIGMEM /* + * 4) physnode_map - the mapping between a pfn and owning node * physnode_map keeps track of the physical memory layout of a generic * numa node on a 256Mb break (each element of the array will * represent 256Mb of memory and will be marked by the node id. so, @@ -85,9 +89,7 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, return (nr_pages + 1) * sizeof(struct page); } - -unsigned long node_start_pfn[MAX_NUMNODES]; -unsigned long node_end_pfn[MAX_NUMNODES]; +#endif extern unsigned long find_max_low_pfn(void); extern void find_max_pfn(void); @@ -390,24 +392,26 @@ void __init set_highmem_pages_init(int bad_ppro) { #ifdef CONFIG_HIGHMEM struct zone *zone; + struct page *page; for_each_zone(zone) { - unsigned long node_pfn, node_high_size, zone_start_pfn; - struct page * zone_mem_map; - + unsigned long node_pfn, zone_start_pfn, zone_end_pfn; + if (!is_highmem(zone)) continue; - printk("Initializing %s for node %d\n", zone->name, - zone->zone_pgdat->node_id); - - node_high_size = zone->spanned_pages; - zone_mem_map = zone->zone_mem_map; zone_start_pfn = zone->zone_start_pfn; + zone_end_pfn = zone_start_pfn + zone->spanned_pages; + + printk("Initializing %s for node %d (%08lx:%08lx)\n", + zone->name, zone->zone_pgdat->node_id, + zone_start_pfn, zone_end_pfn); - for (node_pfn = 0; node_pfn < node_high_size; node_pfn++) { - one_highpage_init((struct page *)(zone_mem_map + node_pfn), - zone_start_pfn + node_pfn, bad_ppro); + for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { + if (!pfn_valid(node_pfn)) + continue; + page = pfn_to_page(node_pfn); + one_highpage_init(page, node_pfn, bad_ppro); } } totalram_pages += totalhigh_pages; diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 666ca79fb50a..48ebfab77a3c 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -276,7 +276,9 @@ void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) SetPageReserved(page); } -#ifndef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA +extern void set_highmem_pages_init(int); +#else static void __init set_highmem_pages_init(int bad_ppro) { int pfn; @@ -284,9 +286,7 @@ static void __init set_highmem_pages_init(int bad_ppro) one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); totalram_pages += totalhigh_pages; } -#else -extern void set_highmem_pages_init(int); -#endif /* !CONFIG_DISCONTIGMEM */ +#endif /* CONFIG_FLATMEM */ #else #define kmap_init() do { } while (0) @@ -297,10 +297,10 @@ extern void set_highmem_pages_init(int); unsigned long long __PAGE_KERNEL = _PAGE_KERNEL; unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; -#ifndef CONFIG_DISCONTIGMEM -#define remap_numa_kva() do {} while (0) -#else +#ifdef CONFIG_NUMA extern void __init remap_numa_kva(void); +#else +#define remap_numa_kva() do {} while (0) #endif static void __init pagetable_init (void) @@ -525,7 +525,7 @@ static void __init set_max_mapnr_init(void) #else num_physpages = max_low_pfn; #endif -#ifndef CONFIG_DISCONTIGMEM +#ifdef CONFIG_FLATMEM max_mapnr = num_physpages; #endif } @@ -539,7 +539,7 @@ void __init 
mem_init(void) int tmp; int bad_ppro; -#ifndef CONFIG_DISCONTIGMEM +#ifdef CONFIG_FLATMEM if (!mem_map) BUG(); #endif diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h index 48e46d403aa6..33ce5d37e894 100644 --- a/include/asm-i386/mmzone.h +++ b/include/asm-i386/mmzone.h @@ -8,7 +8,9 @@ #include -#ifdef CONFIG_DISCONTIGMEM +#if CONFIG_NUMA +extern struct pglist_data *node_data[]; +#define NODE_DATA(nid) (node_data[nid]) #ifdef CONFIG_NUMA #ifdef CONFIG_X86_NUMAQ @@ -21,8 +23,28 @@ #define get_zholes_size(n) (0) #endif /* CONFIG_NUMA */ -extern struct pglist_data *node_data[]; -#define NODE_DATA(nid) (node_data[nid]) +extern int get_memcfg_numa_flat(void ); +/* + * This allows any one NUMA architecture to be compiled + * for, and still fall back to the flat function if it + * fails. + */ +static inline void get_memcfg_numa(void) +{ +#ifdef CONFIG_X86_NUMAQ + if (get_memcfg_numaq()) + return; +#elif CONFIG_ACPI_SRAT + if (get_memcfg_from_srat()) + return; +#endif + + get_memcfg_numa_flat(); +} + +#endif /* CONFIG_NUMA */ + +#ifdef CONFIG_DISCONTIGMEM /* * generic node memory support, the following assumptions apply: @@ -48,26 +70,6 @@ static inline int pfn_to_nid(unsigned long pfn) #endif } -/* - * Following are macros that are specific to this numa platform. - */ -#define reserve_bootmem(addr, size) \ - reserve_bootmem_node(NODE_DATA(0), (addr), (size)) -#define alloc_bootmem(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0) -#define alloc_bootmem_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) -#define alloc_bootmem_node(ignore, x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_pages_node(ignore, x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low_pages_node(ignore, x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) - #define node_localnr(pfn, nid) ((pfn) - node_data[nid]->node_start_pfn) /* @@ -121,28 +123,33 @@ static inline int pfn_valid(int pfn) return (pfn < node_end_pfn(nid)); return 0; } -#endif +#endif /* CONFIG_X86_NUMAQ */ + +#endif /* CONFIG_DISCONTIGMEM */ + +#ifdef CONFIG_NEED_MULTIPLE_NODES -extern int get_memcfg_numa_flat(void ); /* - * This allows any one NUMA architecture to be compiled - * for, and still fall back to the flat function if it - * fails. + * Following are macros that are specific to this numa platform. 
*/ -static inline void get_memcfg_numa(void) -{ -#ifdef CONFIG_X86_NUMAQ - if (get_memcfg_numaq()) - return; -#elif CONFIG_ACPI_SRAT - if (get_memcfg_from_srat()) - return; -#endif - - get_memcfg_numa_flat(); -} +#define reserve_bootmem(addr, size) \ + reserve_bootmem_node(NODE_DATA(0), (addr), (size)) +#define alloc_bootmem(x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_low(x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0) +#define alloc_bootmem_pages(x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_low_pages(x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) +#define alloc_bootmem_node(ignore, x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_pages_node(ignore, x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_low_pages_node(ignore, x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) -#endif /* CONFIG_DISCONTIGMEM */ +#endif /* CONFIG_NEED_MULTIPLE_NODES */ extern int early_pfn_to_nid(unsigned long pfn); diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index 8f3dded01bff..dea8f8e6d86e 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -137,11 +137,11 @@ extern int page_is_ram(unsigned long pagenr); #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#ifndef CONFIG_DISCONTIGMEM +#ifdef CONFIG_FLATMEM #define pfn_to_page(pfn) (mem_map + (pfn)) #define page_to_pfn(page) ((unsigned long)((page) - mem_map)) #define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif /* !CONFIG_DISCONTIGMEM */ +#endif /* CONFIG_FLATMEM */ #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index e9efe148fdf7..77c6497f416e 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -398,9 +398,9 @@ extern void noexec_setup(const char *str); #endif /* !__ASSEMBLY__ */ -#ifndef CONFIG_DISCONTIGMEM +#ifdef CONFIG_FLATMEM #define kern_addr_valid(addr) (1) -#endif /* !CONFIG_DISCONTIGMEM */ +#endif /* CONFIG_FLATMEM */ #define io_remap_page_range(vma, vaddr, paddr, size, prot) \ remap_pfn_range(vma, vaddr, (paddr) >> PAGE_SHIFT, size, prot) diff --git a/include/asm-i386/sparsemem.h b/include/asm-i386/sparsemem.h new file mode 100644 index 000000000000..cfeed990585f --- /dev/null +++ b/include/asm-i386/sparsemem.h @@ -0,0 +1,31 @@ +#ifndef _I386_SPARSEMEM_H +#define _I386_SPARSEMEM_H +#ifdef CONFIG_SPARSEMEM + +/* + * generic non-linear memory support: + * + * 1) we will not split memory into more chunks than will fit into the + * flags field of the struct page + */ + +/* + * SECTION_SIZE_BITS 2^N: how big each section will be + * MAX_PHYSADDR_BITS 2^N: how much physical address space we have + * MAX_PHYSMEM_BITS 2^N: how much memory we can have in that space + */ +#ifdef CONFIG_X86_PAE +#define SECTION_SIZE_BITS 30 +#define MAX_PHYSADDR_BITS 36 +#define MAX_PHYSMEM_BITS 36 +#else +#define SECTION_SIZE_BITS 26 +#define MAX_PHYSADDR_BITS 32 +#define MAX_PHYSMEM_BITS 32 +#endif + +/* XXX: FIXME -- wli */ +#define kern_addr_valid(kaddr) (0) + +#endif /* CONFIG_SPARSEMEM */ +#endif /* _I386_SPARSEMEM_H */ From 641c767389b19859a45e6de46d8e18cd935bdb60 Mon Sep 17 
00:00:00 2001
From: Andy Whitcroft
Date: Thu, 23 Jun 2005 00:07:59 -0700
Subject: [PATCH 054/199] [PATCH] sparsemem swiss cheese numa layouts

The part of the sparsemem patch which modifies memmap_init_zone() has
recently become a problem.  It changes behavior so that there is a call
to pfn_to_page() for each individual page inside of a node's range:
node_start_pfn through node_end_pfn.  It used to simply do this once, at
the beginning of the node, but having sparsemem's non-contiguous
mem_map[]s inside of a node made it necessary to change.

Mike Kravetz recently wrote a patch which made the NUMA code accept some
new kinds of layouts.  The system's memory was laid out like this, with
node 0's memory in two pieces: one before and one after node 1's memory:

	Node 0: +++++     +++++
	Node 1:      +++++

Previous behavior before Mike's patch was to assign nodes like this:

	Node 0: 00000     XXXXX
	Node 1:      11111

Where the 'X' areas were simply thrown away.  The new behavior was to
make the pg_data_t span node 0 across all of its areas, including areas
that are really node 1's:

	Node 0: 000000000000000
	Node 1:      11111

This wastes a little bit of mem_map space, but ends up being OK, and
more fully utilizes the system's memory.  memmap_init_zone() initializes
all of the "struct page"s for node 0, even for the "hole", but those
never get used, because there is no pfn_to_page() that resolves to those
pages.  However, only calling pfn_to_page() once, memmap_init_zone()
always uses the pages that were allocated for node0->node_mem_map
because:

	struct page *start = pfn_to_page(start_pfn);
	// effectively start = &node->node_mem_map[0]
	for (page = start; page < (start + size); page++) {
		init_page_here();...
		page++;
	}

Slow, and wasteful, but generally harmless.

But, modify that to call pfn_to_page() for each loop iteration (like
sparsemem does):

	for (pfn = start_pfn; pfn < (start_pfn + size); pfn++) {
		page = pfn_to_page(pfn);
	}

And you end up trying to initialize node 1's pages too early, along with
bogus data from node 0.  This patch checks for those weird layouts and
declines to touch the pages, making the more frequent pfn_to_page()
calls OK to do.

Signed-off-by: Dave Hansen
Signed-off-by: Andy Whitcroft
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/ppc64/Kconfig     | 12 ++++++++++++
 include/linux/mmzone.h |  6 ++++++
 mm/page_alloc.c        |  2 ++
 3 files changed, 20 insertions(+)

diff --git a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig
index 011b5c0bf1d0..85f8fcf44b6c 100644
--- a/arch/ppc64/Kconfig
+++ b/arch/ppc64/Kconfig
@@ -211,6 +211,18 @@ config ARCH_FLATMEM_ENABLE
 
 source "mm/Kconfig"
 
+# Some NUMA nodes have memory ranges that span
+# other nodes.  Even though a pfn is valid and
+# between a node's start and end pfns, it may not
+# reside on that node.
+# +# This is a relatively temporary hack that should +# be able to go away when sparsemem is fully in +# place +config NODES_SPAN_OTHER_NODES + def_bool y + depends on NEED_MULTIPLE_NODES + config NUMA bool "NUMA support" depends on DISCONTIGMEM diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 19860d317ec2..746b57e3d370 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -528,6 +528,12 @@ void sparse_init(void); #define sparse_init() do {} while (0) #endif /* CONFIG_SPARSEMEM */ +#ifdef CONFIG_NODES_SPAN_OTHER_NODES +#define early_pfn_in_nid(pfn, nid) (early_pfn_to_nid(pfn) == (nid)) +#else +#define early_pfn_in_nid(pfn, nid) (1) +#endif + #ifndef early_pfn_valid #define early_pfn_valid(pfn) (1) #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5c1b8982a6da..1eb683f9b3af 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1656,6 +1656,8 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { if (!early_pfn_valid(pfn)) continue; + if (!early_pfn_in_nid(pfn, nid)) + continue; page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); set_page_count(page, 0); From 29751f6991e845f7d002a6ae520bf996b38c8dcd Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:08:00 -0700 Subject: [PATCH 055/199] [PATCH] sparsemem hotplug base Make sparse's initalization be accessible at runtime. This allows sparse mappings to be created after boot in a hotplug situation. This patch is separated from the previous one just to give an indication how much of the sparse infrastructure is *just* for hotplug memory. The section_mem_map doesn't really store a pointer. It stores something that is convenient to do some math against to get a pointer. It isn't valid to just do *section_mem_map, so I don't think it should be stored as a pointer. There are a couple of things I'd like to store about a section. First of all, the fact that it is !NULL does not mean that it is present. There could be such a combination where section_mem_map *is* NULL, but the math gets you properly to a real mem_map. So, I don't think that check is safe. Since we're storing 32-bit-aligned structures, we have a few bits in the bottom of the pointer to play with. Use one bit to encode whether there's really a mem_map there, and the other one to tell whether there's a valid section there. We need to distinguish between the two because sometimes there's a gap between when a section is discovered to be present and when we can get the mem_map for it. Signed-off-by: Dave Hansen Signed-off-by: Andy Whitcroft Signed-off-by: Jack Steiner Signed-off-by: Bob Picco Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 56 ++++++++++++++++++++++--- mm/page_alloc.c | 4 +- mm/sparse.c | 92 +++++++++++++++++++++++++++++++++--------- 3 files changed, 125 insertions(+), 27 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 746b57e3d370..6c90461ed99f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -476,11 +476,56 @@ extern struct pglist_data contig_page_data; struct page; struct mem_section { - struct page *section_mem_map; + /* + * This is, logically, a pointer to an array of struct + * pages. However, it is stored with some other magic. + * (see sparse.c::sparse_init_one_section()) + * + * Making it a UL at least makes someone do a cast + * before using it wrong. 
+ */ + unsigned long section_mem_map; }; extern struct mem_section mem_section[NR_MEM_SECTIONS]; +static inline struct mem_section *__nr_to_section(unsigned long nr) +{ + return &mem_section[nr]; +} + +/* + * We use the lower bits of the mem_map pointer to store + * a little bit of information. There should be at least + * 3 bits here due to 32-bit alignment. + */ +#define SECTION_MARKED_PRESENT (1UL<<0) +#define SECTION_HAS_MEM_MAP (1UL<<1) +#define SECTION_MAP_LAST_BIT (1UL<<2) +#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) + +static inline struct page *__section_mem_map_addr(struct mem_section *section) +{ + unsigned long map = section->section_mem_map; + map &= SECTION_MAP_MASK; + return (struct page *)map; +} + +static inline int valid_section(struct mem_section *section) +{ + return (section->section_mem_map & SECTION_MARKED_PRESENT); +} + +static inline int section_has_mem_map(struct mem_section *section) +{ + return (section->section_mem_map & SECTION_HAS_MEM_MAP); +} + +static inline int valid_section_nr(unsigned long nr) +{ + return valid_section(__nr_to_section(nr)); +} + /* * Given a kernel address, find the home node of the underlying memory. */ @@ -488,24 +533,25 @@ extern struct mem_section mem_section[NR_MEM_SECTIONS]; static inline struct mem_section *__pfn_to_section(unsigned long pfn) { - return &mem_section[pfn_to_section_nr(pfn)]; + return __nr_to_section(pfn_to_section_nr(pfn)); } #define pfn_to_page(pfn) \ ({ \ unsigned long __pfn = (pfn); \ - __pfn_to_section(__pfn)->section_mem_map + __pfn; \ + __section_mem_map_addr(__pfn_to_section(__pfn)) + __pfn; \ }) #define page_to_pfn(page) \ ({ \ - page - mem_section[page_to_section(page)].section_mem_map; \ + page - __section_mem_map_addr(__nr_to_section( \ + page_to_section(page))); \ }) static inline int pfn_valid(unsigned long pfn) { if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; - return mem_section[pfn_to_section_nr(pfn)].section_mem_map != 0; + return valid_section(__nr_to_section(pfn_to_section_nr(pfn))); } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1eb683f9b3af..7ee675ad101e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1650,8 +1650,8 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { struct page *page; - int end_pfn = start_pfn + size; - int pfn; + unsigned long end_pfn = start_pfn + size; + unsigned long pfn; for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { if (!early_pfn_valid(pfn)) diff --git a/mm/sparse.c b/mm/sparse.c index f888385b9e14..b54e304df4a7 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -25,7 +25,7 @@ void memory_present(int nid, unsigned long start, unsigned long end) for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { unsigned long section = pfn_to_section_nr(pfn); if (!mem_section[section].section_mem_map) - mem_section[section].section_mem_map = (void *) -1; + mem_section[section].section_mem_map = SECTION_MARKED_PRESENT; } } @@ -50,6 +50,56 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn, return nr_pages * sizeof(struct page); } +/* + * Subtle, we encode the real pfn into the mem_map such that + * the identity pfn - section_mem_map will return the actual + * physical page frame number. + */ +static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum) +{ + return (unsigned long)(mem_map - (section_nr_to_pfn(pnum))); +} + +/* + * We need this if we ever free the mem_maps. 
While not implemented yet, + * this function is included for parity with its sibling. + */ +static __attribute((unused)) +struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum) +{ + return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); +} + +static int sparse_init_one_section(struct mem_section *ms, + unsigned long pnum, struct page *mem_map) +{ + if (!valid_section(ms)) + return -EINVAL; + + ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum); + + return 1; +} + +static struct page *sparse_early_mem_map_alloc(unsigned long pnum) +{ + struct page *map; + int nid = early_pfn_to_nid(section_nr_to_pfn(pnum)); + + map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); + if (map) + return map; + + map = alloc_bootmem_node(NODE_DATA(nid), + sizeof(struct page) * PAGES_PER_SECTION); + if (map) + return map; + + printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__); + mem_section[pnum].section_mem_map = 0; + return NULL; +} + /* * Allocate the accumulated non-linear sections, allocate a mem_map * for each and record the physical to section mapping. @@ -58,28 +108,30 @@ void sparse_init(void) { unsigned long pnum; struct page *map; - int nid; for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { - if (!mem_section[pnum].section_mem_map) + if (!valid_section_nr(pnum)) continue; - nid = early_pfn_to_nid(section_nr_to_pfn(pnum)); - map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); - if (!map) - map = alloc_bootmem_node(NODE_DATA(nid), - sizeof(struct page) * PAGES_PER_SECTION); - if (!map) { - mem_section[pnum].section_mem_map = 0; - continue; - } - - /* - * Subtle, we encode the real pfn into the mem_map such that - * the identity pfn - section_mem_map will return the actual - * physical page frame number. - */ - mem_section[pnum].section_mem_map = map - - section_nr_to_pfn(pnum); + map = sparse_early_mem_map_alloc(pnum); + if (map) + sparse_init_one_section(&mem_section[pnum], pnum, map); } } + +/* + * returns the number of sections whose mem_maps were properly + * set. If this is <=0, then that means that the passed-in + * map was not consumed and must be freed. + */ +int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map) +{ + struct mem_section *ms = __pfn_to_section(start_pfn); + + if (ms->section_mem_map & SECTION_MARKED_PRESENT) + return -EEXIST; + + ms->section_mem_map |= SECTION_MARKED_PRESENT; + + return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map); +} From 510f8fa7ba18320d408dd3093663e58f5664f2f0 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:08:01 -0700 Subject: [PATCH 056/199] [PATCH] ppc64: add early_pfn_to_nid Provide an implementation of early_pfn_to_nid for PPC64. This is used by memory models to determine the node from which to take allocations before the memory allocators are fully initialised. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Martin Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc64/Kconfig | 4 ++++ include/asm-ppc64/mmzone.h | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig index 85f8fcf44b6c..3c8bfd0345cc 100644 --- a/arch/ppc64/Kconfig +++ b/arch/ppc64/Kconfig @@ -211,6 +211,10 @@ config ARCH_FLATMEM_ENABLE source "mm/Kconfig" +config HAVE_ARCH_EARLY_PFN_TO_NID + bool + default y + # Some NUMA nodes have memory ranges that span # other nodes. 
Even though a pfn is valid and # between a node's start and end pfns, it may not diff --git a/include/asm-ppc64/mmzone.h b/include/asm-ppc64/mmzone.h index cbfc5ecfe875..ecc4b07507d6 100644 --- a/include/asm-ppc64/mmzone.h +++ b/include/asm-ppc64/mmzone.h @@ -90,4 +90,9 @@ static inline int pa_to_nid(unsigned long pa) #define discontigmem_pfn_valid(pfn) ((pfn) < num_physpages) #endif /* CONFIG_DISCONTIGMEM */ + +#ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +#define early_pfn_to_nid(pfn) pa_to_nid(((unsigned long)pfn) << PAGE_SHIFT) +#endif + #endif /* _ASM_MMZONE_H_ */ From 74b30be2e183bd9a12d0350698996e3d1969f290 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:08:02 -0700 Subject: [PATCH 057/199] [PATCH] ppc64: add memory present Provide hooks for PPC64 to allow memory models to be informed of installed memory areas. This allows SPARSEMEM to instantiate mem_map for the populated areas. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Martin Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc64/Kconfig | 4 ++-- arch/ppc64/mm/numa.c | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig index 3c8bfd0345cc..c92d48fe06e5 100644 --- a/arch/ppc64/Kconfig +++ b/arch/ppc64/Kconfig @@ -212,8 +212,8 @@ config ARCH_FLATMEM_ENABLE source "mm/Kconfig" config HAVE_ARCH_EARLY_PFN_TO_NID - bool - default y + def_bool y + depends on NEED_MULTIPLE_NODES # Some NUMA nodes have memory ranges that span # other nodes. Even though a pfn is valid and diff --git a/arch/ppc64/mm/numa.c b/arch/ppc64/mm/numa.c index ea862ec643d3..cafd91aef289 100644 --- a/arch/ppc64/mm/numa.c +++ b/arch/ppc64/mm/numa.c @@ -440,6 +440,8 @@ static int __init parse_numa_properties(void) for (i = start ; i < (start+size); i += MEMORY_INCREMENT) numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = numa_domain; + memory_present(numa_domain, start >> PAGE_SHIFT, + (start + size) >> PAGE_SHIFT); if (--ranges) goto new_range; @@ -481,6 +483,7 @@ static void __init setup_nonnuma(void) for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT) numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0; + memory_present(0, 0, init_node_data[0].node_end_pfn); } static void __init dump_numa_topology(void) From 145e664231648121026d470094c200851a446a73 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:08:03 -0700 Subject: [PATCH 058/199] [PATCH] ppc64: sparsemem memory model Provide the architecture specific implementation for SPARSEMEM for PPC64 systems. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Mike Kravetz (in part) Signed-off-by: Martin Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc64/Kconfig | 13 ++++++++++++- arch/ppc64/kernel/setup.c | 1 + arch/ppc64/mm/Makefile | 2 +- arch/ppc64/mm/init.c | 18 +++++++++++++----- include/asm-ppc64/mmzone.h | 36 ++++++++++++++++++++++------------- include/asm-ppc64/page.h | 3 ++- include/asm-ppc64/sparsemem.h | 16 ++++++++++++++++ 7 files changed, 68 insertions(+), 21 deletions(-) create mode 100644 include/asm-ppc64/sparsemem.h diff --git a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig index c92d48fe06e5..6448231cb106 100644 --- a/arch/ppc64/Kconfig +++ b/arch/ppc64/Kconfig @@ -198,6 +198,13 @@ config HMT This option enables hardware multithreading on RS64 cpus. pSeries systems p620 and p660 have such a cpu type. 
+config ARCH_SELECT_MEMORY_MODEL + def_bool y + +config ARCH_FLATMEM_ENABLE + def_bool y + depends on !NUMA + config ARCH_DISCONTIGMEM_ENABLE def_bool y depends on SMP && PPC_PSERIES @@ -209,6 +216,10 @@ config ARCH_DISCONTIGMEM_DEFAULT config ARCH_FLATMEM_ENABLE def_bool y +config ARCH_SPARSEMEM_ENABLE + def_bool y + depends on ARCH_DISCONTIGMEM_ENABLE + source "mm/Kconfig" config HAVE_ARCH_EARLY_PFN_TO_NID @@ -229,7 +240,7 @@ config NODES_SPAN_OTHER_NODES config NUMA bool "NUMA support" - depends on DISCONTIGMEM + default y if DISCONTIGMEM || SPARSEMEM config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" diff --git a/arch/ppc64/kernel/setup.c b/arch/ppc64/kernel/setup.c index 8e439a817642..9e70ac90dec1 100644 --- a/arch/ppc64/kernel/setup.c +++ b/arch/ppc64/kernel/setup.c @@ -1055,6 +1055,7 @@ void __init setup_arch(char **cmdline_p) /* set up the bootmem stuff with available memory */ do_init_bootmem(); + sparse_init(); /* initialize the syscall map in systemcfg */ setup_syscall_map(); diff --git a/arch/ppc64/mm/Makefile b/arch/ppc64/mm/Makefile index ac522d57b2a7..3695d00d347f 100644 --- a/arch/ppc64/mm/Makefile +++ b/arch/ppc64/mm/Makefile @@ -6,6 +6,6 @@ EXTRA_CFLAGS += -mno-minimal-toc obj-y := fault.o init.o imalloc.o hash_utils.o hash_low.o tlb.o \ slb_low.o slb.o stab.o mmap.o -obj-$(CONFIG_DISCONTIGMEM) += numa.o +obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PPC_MULTIPLATFORM) += hash_native.o diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c index 29dbe084c21f..b50b3a446dbe 100644 --- a/arch/ppc64/mm/init.c +++ b/arch/ppc64/mm/init.c @@ -531,7 +531,7 @@ EXPORT_SYMBOL(page_is_ram); * Initialize the bootmem system and give it all the memory we * have available. */ -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_NEED_MULTIPLE_NODES void __init do_init_bootmem(void) { unsigned long i; @@ -553,12 +553,20 @@ void __init do_init_bootmem(void) max_pfn = max_low_pfn; - /* add all physical memory to the bootmem map. Also find the first */ + /* Add all physical memory to the bootmem map, mark each area + * present. + */ for (i=0; i < lmb.memory.cnt; i++) { unsigned long physbase, size; + unsigned long start_pfn, end_pfn; physbase = lmb.memory.region[i].physbase; size = lmb.memory.region[i].size; + + start_pfn = physbase >> PAGE_SHIFT; + end_pfn = start_pfn + (size >> PAGE_SHIFT); + memory_present(0, start_pfn, end_pfn); + free_bootmem(physbase, size); } @@ -597,7 +605,7 @@ void __init paging_init(void) free_area_init_node(0, NODE_DATA(0), zones_size, __pa(PAGE_OFFSET) >> PAGE_SHIFT, zholes_size); } -#endif /* CONFIG_DISCONTIGMEM */ +#endif /* ! 
CONFIG_NEED_MULTIPLE_NODES */ static struct kcore_list kcore_vmem; @@ -628,7 +636,7 @@ module_init(setup_kcore); void __init mem_init(void) { -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NEED_MULTIPLE_NODES int nid; #endif pg_data_t *pgdat; @@ -639,7 +647,7 @@ void __init mem_init(void) num_physpages = max_low_pfn; /* RAM is assumed contiguous */ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NEED_MULTIPLE_NODES for_each_online_node(nid) { if (NODE_DATA(nid)->node_spanned_pages != 0) { printk("freeing bootmem node %x\n", nid); diff --git a/include/asm-ppc64/mmzone.h b/include/asm-ppc64/mmzone.h index ecc4b07507d6..ed473f4b0152 100644 --- a/include/asm-ppc64/mmzone.h +++ b/include/asm-ppc64/mmzone.h @@ -10,9 +10,20 @@ #include #include -#ifdef CONFIG_DISCONTIGMEM +/* generic non-linear memory support: + * + * 1) we will not split memory into more chunks than will fit into the + * flags field of the struct page + */ + + +#ifdef CONFIG_NEED_MULTIPLE_NODES extern struct pglist_data *node_data[]; +/* + * Return a pointer to the node data for node n. + */ +#define NODE_DATA(nid) (node_data[nid]) /* * Following are specific to this numa platform. @@ -47,30 +58,27 @@ static inline int pa_to_nid(unsigned long pa) return nid; } -#define pfn_to_nid(pfn) pa_to_nid((pfn) << PAGE_SHIFT) - -/* - * Return a pointer to the node data for node n. - */ -#define NODE_DATA(nid) (node_data[nid]) - #define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) /* * Following are macros that each numa implmentation must define. */ -/* - * Given a kernel address, find the home node of the underlying memory. - */ -#define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) - #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_end_pfn) #define local_mapnr(kvaddr) \ ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) +#ifdef CONFIG_DISCONTIGMEM + +/* + * Given a kernel address, find the home node of the underlying memory. 
+ */ +#define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) + +#define pfn_to_nid(pfn) pa_to_nid((unsigned long)(pfn) << PAGE_SHIFT) + /* Written this way to avoid evaluating arguments twice */ #define discontigmem_pfn_to_page(pfn) \ ({ \ @@ -91,6 +99,8 @@ static inline int pa_to_nid(unsigned long pa) #endif /* CONFIG_DISCONTIGMEM */ +#endif /* CONFIG_NEED_MULTIPLE_NODES */ + #ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID #define early_pfn_to_nid(pfn) pa_to_nid(((unsigned long)pfn) << PAGE_SHIFT) #endif diff --git a/include/asm-ppc64/page.h b/include/asm-ppc64/page.h index 257d87eb7c34..a5893a305a09 100644 --- a/include/asm-ppc64/page.h +++ b/include/asm-ppc64/page.h @@ -217,7 +217,8 @@ extern u64 ppc64_pft_size; /* Log 2 of page table size */ #define page_to_pfn(page) discontigmem_page_to_pfn(page) #define pfn_to_page(pfn) discontigmem_pfn_to_page(pfn) #define pfn_valid(pfn) discontigmem_pfn_valid(pfn) -#else +#endif +#ifdef CONFIG_FLATMEM #define pfn_to_page(pfn) (mem_map + (pfn)) #define page_to_pfn(page) ((unsigned long)((page) - mem_map)) #define pfn_valid(pfn) ((pfn) < max_mapnr) diff --git a/include/asm-ppc64/sparsemem.h b/include/asm-ppc64/sparsemem.h new file mode 100644 index 000000000000..c5bd47e57f17 --- /dev/null +++ b/include/asm-ppc64/sparsemem.h @@ -0,0 +1,16 @@ +#ifndef _ASM_PPC64_SPARSEMEM_H +#define _ASM_PPC64_SPARSEMEM_H 1 + +#ifdef CONFIG_SPARSEMEM +/* + * SECTION_SIZE_BITS 2^N: how big each section will be + * MAX_PHYSADDR_BITS 2^N: how much physical address space we have + * MAX_PHYSMEM_BITS 2^N: how much memory we can have in that space + */ +#define SECTION_SIZE_BITS 24 +#define MAX_PHYSADDR_BITS 38 +#define MAX_PHYSMEM_BITS 36 + +#endif /* CONFIG_SPARSEMEM */ + +#endif /* _ASM_PPC64_SPARSEMEM_H */ From 073326634bec2f36e165583d21fb0a9fad47ac0a Mon Sep 17 00:00:00 2001 From: Matt Tolentino Date: Thu, 23 Jun 2005 00:08:03 -0700 Subject: [PATCH 059/199] [PATCH] remove direct ref to contig_page_data for x86-64 This patch pulls out all remaining direct references to contig_page_data from arch/x86-64, thus saving an ifdef in one case. 
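This works because the generic and arch headers already hide the
distinction (the sketch below is for illustration only, not part of the
diff): with multiple nodes NODE_DATA() indexes the arch-provided
per-node array, and in the flat case it simply expands to the single
contiguous pgdat, so NODE_DATA(0) is correct either way:

	/* illustrative -- the definitions this change relies on */
	#ifdef CONFIG_DISCONTIGMEM
	extern struct pglist_data *node_data[];	/* arch-provided */
	#define NODE_DATA(nid)	(node_data[nid])
	#else
	extern struct pglist_data contig_page_data;
	#define NODE_DATA(nid)	(&contig_page_data)
	#endif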
Signed-off-by: Matt Tolentino Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/aperture.c | 4 ---- arch/x86_64/kernel/setup.c | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index 504e63474993..c9a6b812e926 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -40,11 +40,7 @@ int fix_aperture __initdata = 1; static u32 __init allocate_aperture(void) { -#ifdef CONFIG_DISCONTIGMEM pg_data_t *nd0 = NODE_DATA(0); -#else - pg_data_t *nd0 = &contig_page_data; -#endif u32 aper_size; void *p; diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 99f038ede23c..125a0bb1f5d6 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -386,7 +386,7 @@ static void __init contig_initmem_init(void) if (bootmap == -1L) panic("Cannot find bootmem map of size %ld\n",bootmap_size); bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); - e820_bootmem_free(&contig_page_data, 0, end_pfn << PAGE_SHIFT); + e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT); reserve_bootmem(bootmap, bootmap_size); } #endif From 1035faf1b19efb83d5626985240f52cd149dd39b Mon Sep 17 00:00:00 2001 From: Matt Tolentino Date: Thu, 23 Jun 2005 00:08:05 -0700 Subject: [PATCH 060/199] [PATCH] add x86-64 Kconfig options for sparsemem Add the requisite arch specific Kconfig options to enable the use of the sparsemem implementation for NUMA kernels on x86-64. Signed-off-by: Matt Tolentino Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/Kconfig | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 9f5b99e7b1eb..61ed16652347 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -274,8 +274,27 @@ config NUMA bool default n +config ARCH_DISCONTIGMEM_ENABLE + def_bool y + depends on NUMA + +config ARCH_DISCONTIGMEM_DEFAULT + def_bool y + depends on NUMA + +config ARCH_SPARSEMEM_ENABLE + def_bool y + depends on NUMA + +config ARCH_FLATMEM_ENABLE + def_bool y + depends on !NUMA + source "mm/Kconfig" +config HAVE_ARCH_EARLY_PFN_TO_NID + def_bool y + config HAVE_DEC_LOCK bool depends on SMP From 2b97690f4cd960779fb351b7cd9974390afabb36 Mon Sep 17 00:00:00 2001 From: Matt Tolentino Date: Thu, 23 Jun 2005 00:08:06 -0700 Subject: [PATCH 061/199] [PATCH] reorganize x86-64 NUMA and DISCONTIGMEM config options In order to use the alternative sparsemem implmentation for NUMA kernels, we need to reorganize the config options. This patch effectively abstracts out the CONFIG_DISCONTIGMEM options to CONFIG_NUMA in most cases. Thus, the discontigmem implementation may be employed as always, but the sparsemem implementation may be used alternatively. 
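The recurring pattern (a rough before/after sketch, not literal hunks
from this patch) is that code moves from a guard naming one memory model
to a guard naming the property it actually depends on:

	/* before: NUMA-only paths keyed to one memory model */
	#ifdef CONFIG_DISCONTIGMEM
		numa_initmem_init(0, end_pfn);
	#else
		contig_initmem_init();
	#endif

	/* after: keyed to NUMA itself, so sparsemem NUMA kernels take
	 * the same path; single-mem_map assumptions move to FLATMEM */
	#ifdef CONFIG_NUMA
		numa_initmem_init(0, end_pfn);
	#else
		contig_initmem_init();
	#endif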
Signed-off-by: Matt Tolentino Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/head64.c | 2 +- arch/x86_64/kernel/setup.c | 6 +++--- arch/x86_64/mm/Makefile | 2 +- arch/x86_64/mm/init.c | 9 ++++++--- arch/x86_64/mm/ioremap.c | 2 +- include/asm-x86_64/io.h | 5 ----- include/asm-x86_64/mmzone.h | 15 +++++++++------ include/asm-x86_64/page.h | 4 +++- include/asm-x86_64/topology.h | 4 +--- 9 files changed, 25 insertions(+), 24 deletions(-) diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index 0f8c78dcd38c..cf6ab147a2a5 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -94,7 +94,7 @@ void __init x86_64_start_kernel(char * real_mode_data) s = strstr(saved_command_line, "earlyprintk="); if (s != NULL) setup_early_printk(s); -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA s = strstr(saved_command_line, "numa="); if (s != NULL) numa_setup(s+5); diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 125a0bb1f5d6..dd8419b37a7e 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -348,7 +348,7 @@ static __init void parse_cmdline_early (char ** cmdline_p) if (!memcmp(from, "mem=", 4)) parse_memopt(from+4, &from); -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA if (!memcmp(from, "numa=", 5)) numa_setup(from+5); #endif @@ -377,7 +377,7 @@ static __init void parse_cmdline_early (char ** cmdline_p) *cmdline_p = command_line; } -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_NUMA static void __init contig_initmem_init(void) { unsigned long bootmap_size, bootmap; @@ -554,7 +554,7 @@ void __init setup_arch(char **cmdline_p) acpi_numa_init(); #endif -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA numa_initmem_init(0, end_pfn); #else contig_initmem_init(); diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile index 66c354ad80ca..1d232a87f113 100644 --- a/arch/x86_64/mm/Makefile +++ b/arch/x86_64/mm/Makefile @@ -4,7 +4,7 @@ obj-y := init.o fault.o ioremap.o extable.o pageattr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_DISCONTIGMEM) += numa.o +obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_K8_NUMA) += k8topology.o obj-$(CONFIG_ACPI_NUMA) += srat.o diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index dbe53b4c7e66..72e4b364ed73 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -318,7 +318,7 @@ void zap_low_mappings(void) flush_tlb_all(); } -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_NUMA void __init paging_init(void) { { @@ -427,13 +427,16 @@ void __init mem_init(void) reservedpages = 0; /* this will put all low memory onto the freelists */ -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA totalram_pages += numa_free_all_bootmem(); tmp = 0; /* should count reserved pages here for all nodes */ #else + +#ifdef CONFIG_FLATMEM max_mapnr = end_pfn; if (!mem_map) BUG(); +#endif totalram_pages += free_all_bootmem(); @@ -515,7 +518,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) void __init reserve_bootmem_generic(unsigned long phys, unsigned len) { /* Should check here against the e820 map to avoid double free */ -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA int nid = phys_to_nid(phys); reserve_bootmem_node(NODE_DATA(nid), phys, len); #else diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c index 58aac23760ef..6972df480d2b 100644 --- a/arch/x86_64/mm/ioremap.c +++ b/arch/x86_64/mm/ioremap.c @@ -178,7 +178,7 @@ void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned l if 
(phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) return (__force void __iomem *)phys_to_virt(phys_addr); -#ifndef CONFIG_DISCONTIGMEM +#ifdef CONFIG_FLATMEM /* * Don't allow anybody to remap normal RAM that we're using.. */ diff --git a/include/asm-x86_64/io.h b/include/asm-x86_64/io.h index 94202703fae2..37fc3f149a5a 100644 --- a/include/asm-x86_64/io.h +++ b/include/asm-x86_64/io.h @@ -124,12 +124,7 @@ extern inline void * phys_to_virt(unsigned long address) /* * Change "struct page" to physical address. */ -#ifdef CONFIG_DISCONTIGMEM -#include #define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) -#else -#define page_to_phys(page) ((page - mem_map) << PAGE_SHIFT) -#endif #include diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h index ca4fc3fe0dee..768413751b34 100644 --- a/include/asm-x86_64/mmzone.h +++ b/include/asm-x86_64/mmzone.h @@ -6,7 +6,7 @@ #include -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA #define VIRTUAL_BUG_ON(x) @@ -30,17 +30,16 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) return nid; } -#define pfn_to_nid(pfn) phys_to_nid((unsigned long)(pfn) << PAGE_SHIFT) - -#define kvaddr_to_nid(kaddr) phys_to_nid(__pa(kaddr)) #define NODE_DATA(nid) (node_data[nid]) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ NODE_DATA(nid)->node_spanned_pages) -#define local_mapnr(kvaddr) \ - ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) ) +#ifdef CONFIG_DISCONTIGMEM + +#define pfn_to_nid(pfn) phys_to_nid((unsigned long)(pfn) << PAGE_SHIFT) +#define kvaddr_to_nid(kaddr) phys_to_nid(__pa(kaddr)) /* AK: this currently doesn't deal with invalid addresses. We'll see if the 2.5 kernel doesn't pass them @@ -57,4 +56,8 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) ({ u8 nid__ = pfn_to_nid(pfn); \ nid__ != 0xff && (pfn) >= node_start_pfn(nid__) && (pfn) <= node_end_pfn(nid__); })) #endif + +#define local_mapnr(kvaddr) \ + ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) ) +#endif #endif diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h index 9ce338c3a71e..60130f4ca986 100644 --- a/include/asm-x86_64/page.h +++ b/include/asm-x86_64/page.h @@ -119,7 +119,9 @@ extern __inline__ int get_order(unsigned long size) __pa(v); }) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -#ifndef CONFIG_DISCONTIGMEM +#define __boot_va(x) __va(x) +#define __boot_pa(x) __pa(x) +#ifdef CONFIG_FLATMEM #define pfn_to_page(pfn) (mem_map + (pfn)) #define page_to_pfn(page) ((unsigned long)((page) - mem_map)) #define pfn_valid(pfn) ((pfn) < max_mapnr) diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h index 67f24e0ea819..da21573ec731 100644 --- a/include/asm-x86_64/topology.h +++ b/include/asm-x86_64/topology.h @@ -3,7 +3,7 @@ #include -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA #include #include @@ -37,7 +37,6 @@ static inline cpumask_t __pcibus_to_cpumask(int bus) } #define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus->number) -#ifdef CONFIG_NUMA /* sched_domains SD_NODE_INIT for x86_64 machines */ #define SD_NODE_INIT (struct sched_domain) { \ .span = CPU_MASK_NONE, \ @@ -59,7 +58,6 @@ static inline cpumask_t __pcibus_to_cpumask(int bus) .balance_interval = 1, \ .nr_balance_failed = 0, \ } -#endif #endif From bbfceef47fb9467424113a004070bf37a806a97c Mon Sep 17 00:00:00 2001 From: Matt Tolentino Date: Thu, 23 Jun 2005 00:08:07 -0700 
Subject: [PATCH 062/199] [PATCH] add x86-64 specific support for sparsemem

This patch adds in the necessary support for sparsemem such that x86-64
kernels may use sparsemem as an alternative to discontigmem for NUMA
kernels.  Note that this does not preclude one from continuing to build
NUMA kernels using discontigmem, but merely allows the option to build
NUMA kernels with sparsemem.

Interestingly, the use of sparsemem in lieu of discontigmem in NUMA
kernels results in reduced text size for otherwise equivalent kernels as
shown in the example builds below:

   text    data     bss     dec     hex filename
2371036  765884 1237108 4374028  42be0c vmlinux.discontig
2366549  776484 1302772 4445805  43d66d vmlinux.sparse

Signed-off-by: Matt Tolentino
Signed-off-by: Dave Hansen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/setup.c     | 27 +++++++++++++++++----------
 arch/x86_64/mm/numa.c          |  8 ++++++++
 include/asm-x86_64/bitops.h    |  2 --
 include/asm-x86_64/sparsemem.h | 26 ++++++++++++++++++++++++++
 4 files changed, 51 insertions(+), 12 deletions(-)
 create mode 100644 include/asm-x86_64/sparsemem.h

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index dd8419b37a7e..000015dd5a8b 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -40,6 +40,8 @@
 #include
 #include
 #include
+#include
+
 #include
 #include
 #include
@@ -378,16 +380,19 @@ static __init void parse_cmdline_early (char ** cmdline_p)
 }
 
 #ifndef CONFIG_NUMA
-static void __init contig_initmem_init(void)
+static void __init
+contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 {
-	unsigned long bootmap_size, bootmap;
-	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
-	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
-	if (bootmap == -1L)
-		panic("Cannot find bootmem map of size %ld\n",bootmap_size);
-	bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
-	e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
-	reserve_bootmem(bootmap, bootmap_size);
+	unsigned long bootmap_size, bootmap;
+
+	memory_present(0, start_pfn, end_pfn);
+	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
+	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
+	if (bootmap == -1L)
+		panic("Cannot find bootmem map of size %ld\n",bootmap_size);
+	bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
+	e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
+	reserve_bootmem(bootmap, bootmap_size);
 }
 #endif
 
@@ -557,7 +562,7 @@ void __init setup_arch(char **cmdline_p)
 #ifdef CONFIG_NUMA
 	numa_initmem_init(0, end_pfn);
 #else
-	contig_initmem_init();
+	contig_initmem_init(0, end_pfn);
 #endif
 
 	/* Reserve direct mapping */
@@ -618,6 +623,8 @@ void __init setup_arch(char **cmdline_p)
 		}
 	}
 #endif
+
+	sparse_init();
 	paging_init();
 	check_ioapic();
 
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index fd9f25d7a6c4..84cde796ecb1 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -66,6 +66,13 @@ int __init compute_hash_shift(struct node *nodes, int numnodes)
 	return -1;
 }
 
+#ifdef CONFIG_SPARSEMEM
+int early_pfn_to_nid(unsigned long pfn)
+{
+	return phys_to_nid(pfn << PAGE_SHIFT);
+}
+#endif
+
 /* Initialize bootmem allocator for a node */
 void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 {
@@ -80,6 +87,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
 	start_pfn = start >> PAGE_SHIFT;
 	end_pfn = end >> PAGE_SHIFT;
 
+	memory_present(nodeid, start_pfn, end_pfn);
 	nodedata_phys = find_e820_area(start, end, pgdat_size);
 	if (nodedata_phys == -1L)
 		panic("Cannot find memory pgdat in node %d\n", nodeid);
diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h
index 5dd7727c756b..a31bb99be53f 100644
--- a/include/asm-x86_64/bitops.h
+++ b/include/asm-x86_64/bitops.h
@@ -411,8 +411,6 @@ static __inline__ int ffs(int x)
 
 /* find last set bit */
 #define fls(x)
generic_fls(x) -#define ARCH_HAS_ATOMIC_UNSIGNED 1 - #endif /* __KERNEL__ */ #endif /* _X86_64_BITOPS_H */ diff --git a/include/asm-x86_64/sparsemem.h b/include/asm-x86_64/sparsemem.h new file mode 100644 index 000000000000..dabb16714a71 --- /dev/null +++ b/include/asm-x86_64/sparsemem.h @@ -0,0 +1,26 @@ +#ifndef _ASM_X86_64_SPARSEMEM_H +#define _ASM_X86_64_SPARSEMEM_H 1 + +#ifdef CONFIG_SPARSEMEM + +/* + * generic non-linear memory support: + * + * 1) we will not split memory into more chunks than will fit into the flags + * field of the struct page + * + * SECTION_SIZE_BITS 2^n: size of each section + * MAX_PHYSADDR_BITS 2^n: max size of physical address space + * MAX_PHYSMEM_BITS 2^n: how much memory we can have in that space + * + */ + +#define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ +#define MAX_PHYSADDR_BITS 40 +#define MAX_PHYSMEM_BITS 40 + +extern int early_pfn_to_nid(unsigned long pfn); + +#endif /* CONFIG_SPARSEMEM */ + +#endif /* _ASM_X86_64_SPARSEMEM_H */ From 6f4e1e5061c44a93337338af4bf9bed10ee9f32e Mon Sep 17 00:00:00 2001 From: "Martin J. Bligh" Date: Thu, 23 Jun 2005 00:08:08 -0700 Subject: [PATCH 063/199] [PATCH] add page_state info to show_mem This helps a lot when debugging out of memory stuff - useful especially to see if all the memory is sucked into slab, etc. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/pgtable.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 80c84cdf22ef..270c59f099a4 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -30,6 +30,7 @@ void show_mem(void) struct page *page; pg_data_t *pgdat; unsigned long i; + struct page_state ps; printk("Mem-info:\n"); show_free_areas(); @@ -53,6 +54,13 @@ void show_mem(void) printk("%d reserved pages\n",reserved); printk("%d pages shared\n",shared); printk("%d pages swap cached\n",cached); + + get_page_state(&ps); + printk("%lu pages dirty\n", ps.nr_dirty); + printk("%lu pages writeback\n", ps.nr_writeback); + printk("%lu pages mapped\n", ps.nr_mapped); + printk("%lu pages slab\n", ps.nr_slab); + printk("%lu pages pagetables\n", ps.nr_page_table_pages); } /* From d0e7feb03d5ac48069c2fd57bbba61522e0ca493 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 23 Jun 2005 00:08:09 -0700 Subject: [PATCH 064/199] [PATCH] biarch compiler support for i386 This allows the i386 architecture to be built on a system with a biarch compiler that defaults to x86-64, merely by specifying ARCH=i386. As previously discussed, this uses the equivalent logic to the ppc port. Signed-Off-By: H. 
Peter Anvin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Makefile | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/i386/Makefile b/arch/i386/Makefile index 1c36ca332a96..bf7c9ba709f3 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -17,6 +17,13 @@ # 20050320 Kianusch Sayah Karadji # Added support for GEODE CPU +HAS_BIARCH := $(call cc-option-yn, -m32) +ifeq ($(HAS_BIARCH),y) +AS := $(AS) --32 +LD := $(LD) -m elf_i386 +CC := $(CC) -m32 +endif + LDFLAGS := -m elf_i386 OBJCOPYFLAGS := -O binary -R .note -R .comment -S LDFLAGS_vmlinux := From 0f8e2d62fa04441cd12c08ce521e84e5bd3f8a46 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 23 Jun 2005 00:08:10 -0700 Subject: [PATCH 065/199] [PATCH] use ${CROSS_COMPILE}installkernel in arch/*/boot/install.sh The attached patch causes the various arch specific install.sh scripts to look for ${CROSS_COMPILE}installkernel rather than just installkernel (in both /sbin/ and ~/bin/ where the script already did this). This allows you to have e.g. arm-linux-installkernel as a handy way to install on your cross target. It also prevents the script picking up on the host /sbin/installkernel which causes the script to fall through and do the install itself (which is what I actually use myself, with $INSTALL_PATH set). I don't believe it causes back-compatibility problems since calling the host installkernel was never likely to work or be what you wanted when cross compiling anyway. If $CROSS_COMPILE isn't set then nothing changes. I only use ARM and i386 myself but I figured it couldn't hurt to do the whole lot. I've cc'd those who I hope are the arch maintainers for files that I've touched. Signed-off-by: Ian Campbell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/boot/install.sh | 4 ++-- arch/arm26/boot/install.sh | 4 ++-- arch/i386/boot/install.sh | 4 ++-- arch/ppc64/boot/install.sh | 4 ++-- arch/s390/boot/install.sh | 4 ++-- arch/x86_64/boot/install.sh | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/arm/boot/install.sh b/arch/arm/boot/install.sh index 935bb27369e9..9f9bed207345 100644 --- a/arch/arm/boot/install.sh +++ b/arch/arm/boot/install.sh @@ -21,8 +21,8 @@ # # User may have a custom install script -if [ -x ~/bin/installkernel ]; then exec ~/bin/installkernel "$@"; fi -if [ -x /sbin/installkernel ]; then exec /sbin/installkernel "$@"; fi +if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi +if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi if [ "$(basename $2)" = "zImage" ]; then # Compressed install diff --git a/arch/arm26/boot/install.sh b/arch/arm26/boot/install.sh index c628328dd9ec..8a8399b26cf7 100644 --- a/arch/arm26/boot/install.sh +++ b/arch/arm26/boot/install.sh @@ -23,8 +23,8 @@ # User may have a custom install script -if [ -x /sbin/installkernel ]; then - exec /sbin/installkernel "$@" +if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then + exec /sbin/${CROSS_COMPILE}installkernel "$@" fi if [ "$2" = "zImage" ]; then diff --git a/arch/i386/boot/install.sh b/arch/i386/boot/install.sh index 90f2452b3b9e..f17b40dfc0f4 100644 --- a/arch/i386/boot/install.sh +++ b/arch/i386/boot/install.sh @@ -21,8 +21,8 @@ # User may have a custom install script -if [ -x ~/bin/installkernel ]; then exec ~/bin/installkernel "$@"; fi -if [ -x /sbin/installkernel ]; then exec /sbin/installkernel "$@"; fi +if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then 
exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi +if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi # Default install - same as make zlilo diff --git a/arch/ppc64/boot/install.sh b/arch/ppc64/boot/install.sh index 955c5681db6c..cb2d6626b555 100644 --- a/arch/ppc64/boot/install.sh +++ b/arch/ppc64/boot/install.sh @@ -22,8 +22,8 @@ # User may have a custom install script -if [ -x ~/bin/installkernel ]; then exec ~/bin/installkernel "$@"; fi -if [ -x /sbin/installkernel ]; then exec /sbin/installkernel "$@"; fi +if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi +if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi # Default install diff --git a/arch/s390/boot/install.sh b/arch/s390/boot/install.sh index 278a8139cb18..d4026f62cb06 100644 --- a/arch/s390/boot/install.sh +++ b/arch/s390/boot/install.sh @@ -21,8 +21,8 @@ # User may have a custom install script -if [ -x ~/bin/installkernel ]; then exec ~/bin/installkernel "$@"; fi -if [ -x /sbin/installkernel ]; then exec /sbin/installkernel "$@"; fi +if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi +if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi # Default install - same as make zlilo diff --git a/arch/x86_64/boot/install.sh b/arch/x86_64/boot/install.sh index 90f2452b3b9e..f17b40dfc0f4 100644 --- a/arch/x86_64/boot/install.sh +++ b/arch/x86_64/boot/install.sh @@ -21,8 +21,8 @@ # User may have a custom install script -if [ -x ~/bin/installkernel ]; then exec ~/bin/installkernel "$@"; fi -if [ -x /sbin/installkernel ]; then exec /sbin/installkernel "$@"; fi +if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi +if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi # Default install - same as make zlilo From 8a9e1b0f564615bd92ba50162623e25c2904e564 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Thu, 23 Jun 2005 00:08:13 -0700 Subject: [PATCH 066/199] [PATCH] Platform SMIs and their interferance with tsc based delay calibration Issue: Current tsc based delay_calibration can result in significant errors in loops_per_jiffy count when the platform events like SMIs (System Management Interrupts that are non-maskable) are present. This could lead to potential kernel panic(). This issue is becoming more visible with 2.6 kernel (as default HZ is 1000) and on platforms with higher SMI handling latencies. During the boot time, SMIs are mostly used by BIOS (for things like legacy keyboard emulation). Description: The psuedocode for current delay calibration with tsc based delay looks like (0) Estimate a value for loops_per_jiffy (1) While (loops_per_jiffy estimate is accurate enough) (2) wait for jiffy transition (jiffy1) (3) Note down current tsc (tsc1) (4) loop until tsc becomes tsc1 + loops_per_jiffy (5) check whether jiffy changed since jiffy1 or not and refine loops_per_jiffy estimate Consider the following cases Case 1: If SMIs happen between (2) and (3) above, we can end up with a loops_per_jiffy value that is too low. This results in shorted delays and kernel can panic () during boot (Mostly at IOAPIC timer initialization timer_irq_works() as we don't have enough timer interrupts in a specified interval). 
Case 2: If SMIs happen between (3) and (4) above, then we can end up with a loops_per_jiffy value that is too high. And with current i386 code, too high lpj value (greater than 17M) can result in a overflow in delay.c:__const_udelay() again resulting in shorter delay and panic(). Solution: The patch below makes the calibration routine aware of asynchronous events like SMIs. We increase the delay calibration time and also identify any significant errors (greater than 12.5%) in the calibration and notify it to user. Patch below changes both i386 and x86-64 architectures to use this new and improved calibrate_delay_direct() routine. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/timers/common.c | 9 +++ arch/i386/kernel/timers/timer.c | 9 +++ arch/i386/kernel/timers/timer_hpet.c | 1 + arch/i386/kernel/timers/timer_pm.c | 1 + arch/i386/kernel/timers/timer_tsc.c | 1 + arch/x86_64/lib/delay.c | 7 +++ include/asm-i386/timer.h | 2 + include/asm-i386/timex.h | 3 + include/asm-x86_64/timex.h | 3 + init/calibrate.c | 94 ++++++++++++++++++++++++++++ 10 files changed, 130 insertions(+) diff --git a/arch/i386/kernel/timers/common.c b/arch/i386/kernel/timers/common.c index 8e201219f525..b38cc0d0c71a 100644 --- a/arch/i386/kernel/timers/common.c +++ b/arch/i386/kernel/timers/common.c @@ -139,6 +139,15 @@ unsigned long __init calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr) } #endif + +unsigned long read_timer_tsc(void) +{ + unsigned long retval; + rdtscl(retval); + return retval; +} + + /* calculate cpu_khz */ void init_cpu_khz(void) { diff --git a/arch/i386/kernel/timers/timer.c b/arch/i386/kernel/timers/timer.c index a3d6a288088b..7e39ed8e33f8 100644 --- a/arch/i386/kernel/timers/timer.c +++ b/arch/i386/kernel/timers/timer.c @@ -64,3 +64,12 @@ struct timer_opts* __init select_timer(void) panic("select_timer: Cannot find a suitable timer\n"); return NULL; } + +int read_current_timer(unsigned long *timer_val) +{ + if (cur_timer->read_timer) { + *timer_val = cur_timer->read_timer(); + return 0; + } + return -1; +} diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c index f778f471a09a..15a7d727bd6f 100644 --- a/arch/i386/kernel/timers/timer_hpet.c +++ b/arch/i386/kernel/timers/timer_hpet.c @@ -186,6 +186,7 @@ static struct timer_opts timer_hpet = { .get_offset = get_offset_hpet, .monotonic_clock = monotonic_clock_hpet, .delay = delay_hpet, + .read_timer = read_timer_tsc, }; struct init_timer_opts __initdata timer_hpet_init = { diff --git a/arch/i386/kernel/timers/timer_pm.c b/arch/i386/kernel/timers/timer_pm.c index d77f22030fe6..4ef20e663498 100644 --- a/arch/i386/kernel/timers/timer_pm.c +++ b/arch/i386/kernel/timers/timer_pm.c @@ -246,6 +246,7 @@ static struct timer_opts timer_pmtmr = { .get_offset = get_offset_pmtmr, .monotonic_clock = monotonic_clock_pmtmr, .delay = delay_pmtmr, + .read_timer = read_timer_tsc, }; struct init_timer_opts __initdata timer_pmtmr_init = { diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c index 180444d87824..27f08ae9120b 100644 --- a/arch/i386/kernel/timers/timer_tsc.c +++ b/arch/i386/kernel/timers/timer_tsc.c @@ -572,6 +572,7 @@ static struct timer_opts timer_tsc = { .get_offset = get_offset_tsc, .monotonic_clock = monotonic_clock_tsc, .delay = delay_tsc, + .read_timer = read_timer_tsc, }; struct init_timer_opts __initdata timer_tsc_init = { diff --git a/arch/x86_64/lib/delay.c 
b/arch/x86_64/lib/delay.c index aed61a668a1b..33a873a3c223 100644 --- a/arch/x86_64/lib/delay.c +++ b/arch/x86_64/lib/delay.c @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_SMP #include @@ -19,6 +20,12 @@ int x86_udelay_tsc = 0; /* Delay via TSC */ +int read_current_timer(unsigned long *timer_value) +{ + rdtscll(*timer_value); + return 0; +} + void __delay(unsigned long loops) { unsigned bclock, now; diff --git a/include/asm-i386/timer.h b/include/asm-i386/timer.h index c34709849839..dcf1e07db08a 100644 --- a/include/asm-i386/timer.h +++ b/include/asm-i386/timer.h @@ -22,6 +22,7 @@ struct timer_opts { unsigned long (*get_offset)(void); unsigned long long (*monotonic_clock)(void); void (*delay)(unsigned long); + unsigned long (*read_timer)(void); }; struct init_timer_opts { @@ -52,6 +53,7 @@ extern struct init_timer_opts timer_cyclone_init; #endif extern unsigned long calibrate_tsc(void); +extern unsigned long read_timer_tsc(void); extern void init_cpu_khz(void); extern int recalibrate_cpu_khz(void); #ifdef CONFIG_HPET_TIMER diff --git a/include/asm-i386/timex.h b/include/asm-i386/timex.h index b41e484c3445..e370f907bd39 100644 --- a/include/asm-i386/timex.h +++ b/include/asm-i386/timex.h @@ -49,4 +49,7 @@ static inline cycles_t get_cycles (void) extern unsigned long cpu_khz; +extern int read_current_timer(unsigned long *timer_value); +#define ARCH_HAS_READ_CURRENT_TIMER 1 + #endif diff --git a/include/asm-x86_64/timex.h b/include/asm-x86_64/timex.h index 34f31a18f90b..24ecf6a637cb 100644 --- a/include/asm-x86_64/timex.h +++ b/include/asm-x86_64/timex.h @@ -26,6 +26,9 @@ static inline cycles_t get_cycles (void) extern unsigned int cpu_khz; +extern int read_current_timer(unsigned long *timer_value); +#define ARCH_HAS_READ_CURRENT_TIMER 1 + extern struct vxtime_data vxtime; #endif diff --git a/init/calibrate.c b/init/calibrate.c index c698e04a3dbe..d206c7548fe6 100644 --- a/init/calibrate.c +++ b/init/calibrate.c @@ -8,6 +8,8 @@ #include #include +#include + static unsigned long preset_lpj; static int __init lpj_setup(char *str) { @@ -17,6 +19,92 @@ static int __init lpj_setup(char *str) __setup("lpj=", lpj_setup); +#ifdef ARCH_HAS_READ_CURRENT_TIMER + +/* This routine uses the read_current_timer() routine and gets the + * loops per jiffy directly, instead of guessing it using delay(). + * Also, this code tries to handle non-maskable asynchronous events + * (like SMIs) + */ +#define DELAY_CALIBRATION_TICKS ((HZ < 100) ? 1 : (HZ/100)) +#define MAX_DIRECT_CALIBRATION_RETRIES 5 + +static unsigned long __devinit calibrate_delay_direct(void) +{ + unsigned long pre_start, start, post_start; + unsigned long pre_end, end, post_end; + unsigned long start_jiffies; + unsigned long tsc_rate_min, tsc_rate_max; + unsigned long good_tsc_sum = 0; + unsigned long good_tsc_count = 0; + int i; + + if (read_current_timer(&pre_start) < 0 ) + return 0; + + /* + * A simple loop like + * while ( jiffies < start_jiffies+1) + * start = read_current_timer(); + * will not do. As we don't really know whether jiffy switch + * happened first or timer_value was read first. And some asynchronous + * event can happen between these two events introducing errors in lpj. + * + * So, we do + * 1. pre_start <- When we are sure that jiffy switch hasn't happened + * 2. check jiffy switch + * 3. start <- timer value before or after jiffy switch + * 4. post_start <- When we are sure that jiffy switch has happened + * + * Note, we don't know anything about order of 2 and 3. 
+ * Now, by looking at post_start and pre_start difference, we can + * check whether any asynchronous event happened or not + */ + + for (i = 0; i < MAX_DIRECT_CALIBRATION_RETRIES; i++) { + pre_start = 0; + read_current_timer(&start); + start_jiffies = jiffies; + while (jiffies <= (start_jiffies + 1)) { + pre_start = start; + read_current_timer(&start); + } + read_current_timer(&post_start); + + pre_end = 0; + end = post_start; + while (jiffies <= + (start_jiffies + 1 + DELAY_CALIBRATION_TICKS)) { + pre_end = end; + read_current_timer(&end); + } + read_current_timer(&post_end); + + tsc_rate_max = (post_end - pre_start) / DELAY_CALIBRATION_TICKS; + tsc_rate_min = (pre_end - post_start) / DELAY_CALIBRATION_TICKS; + + /* + * If the upper limit and lower limit of the tsc_rate is + * >= 12.5% apart, redo calibration. + */ + if (pre_start != 0 && pre_end != 0 && + (tsc_rate_max - tsc_rate_min) < (tsc_rate_max >> 3)) { + good_tsc_count++; + good_tsc_sum += tsc_rate_max; + } + } + + if (good_tsc_count) + return (good_tsc_sum/good_tsc_count); + + printk(KERN_WARNING "calibrate_delay_direct() failed to get a good " + "estimate for loops_per_jiffy.\nProbably due to long platform interrupts. Consider using \"lpj=\" boot option.\n"); + return 0; +} +#else +static unsigned long __devinit calibrate_delay_direct(void) {return 0;} +#endif + /* * This is the number of bits of precision for the loops_per_jiffy. Each * bit takes on average 1.5/HZ seconds. This (like the original) is a little @@ -35,6 +123,12 @@ void __devinit calibrate_delay(void) "%lu.%02lu BogoMIPS preset\n", loops_per_jiffy/(500000/HZ), (loops_per_jiffy/(5000/HZ)) % 100); + } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) { + printk("Calibrating delay using timer specific routine.. "); + printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", + loops_per_jiffy/(500000/HZ), + (loops_per_jiffy/(5000/HZ)) % 100, + loops_per_jiffy); } else { loops_per_jiffy = (1<<12); From 4a35293667f8691b45fdb72770f087c086fb9702 Mon Sep 17 00:00:00 2001 From: Hirokazu Takata Date: Thu, 23 Jun 2005 00:08:14 -0700 Subject: [PATCH 067/199] [PATCH] m32r: build fix for asm-m32r/topology.h Use asm-generic/topology.h to fix yet another pcibus_to_node() build error. Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-m32r/topology.h | 44 +------------------------------------ 1 file changed, 1 insertion(+), 43 deletions(-) diff --git a/include/asm-m32r/topology.h b/include/asm-m32r/topology.h index 299a89d91bde..d607eb32bd7e 100644 --- a/include/asm-m32r/topology.h +++ b/include/asm-m32r/topology.h @@ -1,48 +1,6 @@ -/* - * linux/include/asm-generic/topology.h - * - * Written by: Matthew Dobson, IBM Corporation - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - * Send feedback to - */ #ifndef _ASM_M32R_TOPOLOGY_H #define _ASM_M32R_TOPOLOGY_H -/* Other architectures wishing to use this simple topology API should fill - in the below functions as appropriate in their own file. */ - -#define cpu_to_node(cpu) (0) - -#ifndef parent_node -#define parent_node(node) (0) -#endif -#ifndef node_to_cpumask -#define node_to_cpumask(node) (cpu_online_map) -#endif -#ifndef node_to_first_cpu -#define node_to_first_cpu(node) (0) -#endif -#ifndef pcibus_to_cpumask -#define pcibus_to_cpumask(bus) (cpu_online_map) -#endif +#include #endif /* _ASM_M32R_TOPOLOGY_H */ From e164f5573bef0e6caf53519719cf0228c9c15ce3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 23 Jun 2005 00:08:17 -0700 Subject: [PATCH 068/199] [PATCH] ppc64: pcibus_to_node fix asm-generic/topology.h must also be included if CONFIG_NUMA is set in order to provide the fallback pcibus_to_node function. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ppc64/topology.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/asm-ppc64/topology.h b/include/asm-ppc64/topology.h index d58d9dd79998..fcdcfd26a26b 100644 --- a/include/asm-ppc64/topology.h +++ b/include/asm-ppc64/topology.h @@ -59,10 +59,8 @@ static inline int node_to_first_cpu(int node) .nr_balance_failed = 0, \ } -#else /* !CONFIG_NUMA */ +#endif /* CONFIG_NUMA */ #include -#endif /* CONFIG_NUMA */ - #endif /* _ASM_PPC64_TOPOLOGY_H */ From 8c5a09082f4e61a176382e96a831a0636b918602 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 23 Jun 2005 00:08:18 -0700 Subject: [PATCH 069/199] [PATCH] x86/x86_64: pcibus_to_node Define pcibus_to_node to be able to figure out which NUMA node contains a given PCI device. This defines pcibus_to_node(bus) in include/linux/topology.h and adjusts the macros for i386 and x86_64 that already provided a way to determine the cpumask of a pci device. x86_64 was changed to not build an array of cpumasks anymore. Instead, an array of nodes is built, which can be used to generate the cpumask via node_to_cpumask. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mpparse.c | 4 +++- arch/x86_64/pci/k8-bus.c | 16 +--------------- include/asm-generic/topology.h | 9 ++++++++- include/asm-i386/topology.h | 8 ++------ include/asm-x86_64/topology.h | 14 +++----------- 5 files changed, 17 insertions(+), 34 deletions(-) diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 61a63be6b294..ed6a5588146d 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -45,7 +46,8 @@ int acpi_found_madt; int apic_version [MAX_APICS]; unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; -cpumask_t pci_bus_to_cpumask [256] = { [0 ...
255] = CPU_MASK_ALL }; +unsigned char pci_bus_to_node [256]; +EXPORT_SYMBOL(pci_bus_to_node); static int mp_current_pci_id = 0; /* I/O APIC entries */ diff --git a/arch/x86_64/pci/k8-bus.c b/arch/x86_64/pci/k8-bus.c index 62349c78db57..7e7d0c2a0025 100644 --- a/arch/x86_64/pci/k8-bus.c +++ b/arch/x86_64/pci/k8-bus.c @@ -53,25 +53,11 @@ fill_mp_bus_to_cpumask(void) for (j = SECONDARY_LDT_BUS_NUMBER(ldtbus); j <= SUBORDINATE_LDT_BUS_NUMBER(ldtbus); j++) - pci_bus_to_cpumask[j] = - node_to_cpumask(NODE_ID(nid)); + pci_bus_to_node[j] = NODE_ID(nid); } } } - /* quick sanity check */ - printed = 0; - for (i = 0; i < 256; i++) { - if (cpus_empty(pci_bus_to_cpumask[i])) { - pci_bus_to_cpumask[i] = CPU_MASK_ALL; - if (printed) - continue; - printk(KERN_ERR - "k8-bus.c: some busses have empty cpu mask\n"); - printed = 1; - } - } - return 0; } diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h index ec96e8b0f190..5d9d70cd17fc 100644 --- a/include/asm-generic/topology.h +++ b/include/asm-generic/topology.h @@ -41,8 +41,15 @@ #ifndef node_to_first_cpu #define node_to_first_cpu(node) (0) #endif +#ifndef pcibus_to_node +#define pcibus_to_node(node) (-1) +#endif + #ifndef pcibus_to_cpumask -#define pcibus_to_cpumask(bus) (cpu_online_map) +#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \ + CPU_MASK_ALL : \ + node_to_cpumask(pcibus_to_node(bus)) \ + ) #endif #endif /* _ASM_GENERIC_TOPOLOGY_H */ diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h index 98f9e6850cba..6d0f67507b21 100644 --- a/include/asm-i386/topology.h +++ b/include/asm-i386/topology.h @@ -60,12 +60,8 @@ static inline int node_to_first_cpu(int node) return first_cpu(mask); } -/* Returns the number of the node containing PCI bus number 'busnr' */ -static inline cpumask_t __pcibus_to_cpumask(int busnr) -{ - return node_to_cpumask(mp_bus_id_to_node[busnr]); -} -#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus->number) +#define pcibus_to_node(bus) mp_bus_id_to_node[(bus)->number] +#define pcibus_to_cpumask(bus) node_to_cpumask(pcibus_to_node(bus)) /* sched_domains SD_NODE_INIT for NUMAQ machines */ #define SD_NODE_INIT (struct sched_domain) { \ diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h index da21573ec731..8f77e9f6bc23 100644 --- a/include/asm-x86_64/topology.h +++ b/include/asm-x86_64/topology.h @@ -13,8 +13,8 @@ extern cpumask_t cpu_online_map; extern unsigned char cpu_to_node[]; +extern unsigned char pci_bus_to_node[]; extern cpumask_t node_to_cpumask[]; -extern cpumask_t pci_bus_to_cpumask[]; #ifdef CONFIG_ACPI_NUMA extern int __node_distance(int, int); @@ -26,16 +26,8 @@ extern int __node_distance(int, int); #define parent_node(node) (node) #define node_to_first_cpu(node) (__ffs(node_to_cpumask[node])) #define node_to_cpumask(node) (node_to_cpumask[node]) - -static inline cpumask_t __pcibus_to_cpumask(int bus) -{ - cpumask_t busmask = pci_bus_to_cpumask[bus]; - cpumask_t online = cpu_online_map; - cpumask_t res; - cpus_and(res, busmask, online); - return res; -} -#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus->number) +#define pcibus_to_node(bus) pci_bus_to_node[(bus)->number] +#define pcibus_to_cpumask(bus) node_to_cpumask(pcibus_to_node(bus)); /* sched_domains SD_NODE_INIT for x86_64 machines */ #define SD_NODE_INIT (struct sched_domain) { \ From 1946089a109251655c5438d92c539bd2930e71ea Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 23 Jun 2005 00:08:19 -0700 Subject: [PATCH 070/199] [PATCH] NUMA aware block device control 
structure allocation Patch to allocate the control structures for ide devices on the node of the device itself (for NUMA systems). The patch depends on the Slab API change patch by Manfred and me (in mm) and the pcidev_to_node patch that I posted today. It also does some realignment. Signed-off-by: Justin M. Forbes Signed-off-by: Christoph Lameter Signed-off-by: Pravin Shelar Signed-off-by: Shobhit Dayal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/as-iosched.c | 8 +++++--- drivers/block/deadline-iosched.c | 8 +++++--- drivers/block/genhd.c | 13 ++++++++++--- drivers/block/ll_rw_blk.c | 30 +++++++++++++++++++++++------- drivers/ide/ide-disk.c | 3 ++- drivers/ide/ide-probe.c | 8 +++++--- include/linux/blkdev.h | 6 +++++- include/linux/genhd.h | 1 + include/linux/ide.h | 2 +- include/linux/mempool.h | 11 ++++++++--- mm/mempool.c | 17 ++++++++++++----- 11 files changed, 77 insertions(+), 30 deletions(-) diff --git a/drivers/block/as-iosched.c b/drivers/block/as-iosched.c index 638db06de2be..3410b4d294b9 100644 --- a/drivers/block/as-iosched.c +++ b/drivers/block/as-iosched.c @@ -1871,20 +1871,22 @@ static int as_init_queue(request_queue_t *q, elevator_t *e) if (!arq_pool) return -ENOMEM; - ad = kmalloc(sizeof(*ad), GFP_KERNEL); + ad = kmalloc_node(sizeof(*ad), GFP_KERNEL, q->node); if (!ad) return -ENOMEM; memset(ad, 0, sizeof(*ad)); ad->q = q; /* Identify what queue the data belongs to */ - ad->hash = kmalloc(sizeof(struct list_head)*AS_HASH_ENTRIES,GFP_KERNEL); + ad->hash = kmalloc_node(sizeof(struct list_head)*AS_HASH_ENTRIES, + GFP_KERNEL, q->node); if (!ad->hash) { kfree(ad); return -ENOMEM; } - ad->arq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, arq_pool); + ad->arq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, + mempool_free_slab, arq_pool, q->node); if (!ad->arq_pool) { kfree(ad->hash); kfree(ad); diff --git a/drivers/block/deadline-iosched.c b/drivers/block/deadline-iosched.c index 7f79f3dd0165..4bc2fea73273 100644 --- a/drivers/block/deadline-iosched.c +++ b/drivers/block/deadline-iosched.c @@ -711,18 +711,20 @@ static int deadline_init_queue(request_queue_t *q, elevator_t *e) if (!drq_pool) return -ENOMEM; - dd = kmalloc(sizeof(*dd), GFP_KERNEL); + dd = kmalloc_node(sizeof(*dd), GFP_KERNEL, q->node); if (!dd) return -ENOMEM; memset(dd, 0, sizeof(*dd)); - dd->hash = kmalloc(sizeof(struct list_head)*DL_HASH_ENTRIES,GFP_KERNEL); + dd->hash = kmalloc_node(sizeof(struct list_head)*DL_HASH_ENTRIES, + GFP_KERNEL, q->node); if (!dd->hash) { kfree(dd); return -ENOMEM; } - dd->drq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, drq_pool); + dd->drq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, + mempool_free_slab, drq_pool, q->node); if (!dd->drq_pool) { kfree(dd->hash); kfree(dd); diff --git a/drivers/block/genhd.c b/drivers/block/genhd.c index 53f7d846b747..43805e4d31e9 100644 --- a/drivers/block/genhd.c +++ b/drivers/block/genhd.c @@ -582,10 +582,16 @@ struct seq_operations diskstats_op = { .show = diskstats_show }; - struct gendisk *alloc_disk(int minors) { - struct gendisk *disk = kmalloc(sizeof(struct gendisk), GFP_KERNEL); + return alloc_disk_node(minors, -1); +} + +struct gendisk *alloc_disk_node(int minors, int node_id) +{ + struct gendisk *disk; + + disk = kmalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (disk) { memset(disk, 0, sizeof(struct gendisk)); if (!init_disk_stats(disk)) { @@ -594,7 +600,7 @@ struct gendisk *alloc_disk(int minors) } if
(minors > 1) { int size = (minors - 1) * sizeof(struct hd_struct *); - disk->part = kmalloc(size, GFP_KERNEL); + disk->part = kmalloc_node(size, GFP_KERNEL, node_id); if (!disk->part) { kfree(disk); return NULL; @@ -610,6 +616,7 @@ struct gendisk *alloc_disk(int minors) } EXPORT_SYMBOL(alloc_disk); +EXPORT_SYMBOL(alloc_disk_node); struct kobject *get_disk(struct gendisk *disk) { diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 81fe3a0c1fe7..cd8cf302068c 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -28,6 +28,7 @@ #include #include #include +#include /* * for max sense size @@ -1645,7 +1646,8 @@ static int blk_init_free_list(request_queue_t *q) init_waitqueue_head(&rl->wait[WRITE]); init_waitqueue_head(&rl->drain); - rl->rq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, request_cachep); + rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, + mempool_free_slab, request_cachep, q->node); if (!rl->rq_pool) return -ENOMEM; @@ -1657,8 +1659,15 @@ static int __make_request(request_queue_t *, struct bio *); request_queue_t *blk_alloc_queue(int gfp_mask) { - request_queue_t *q = kmem_cache_alloc(requestq_cachep, gfp_mask); + return blk_alloc_queue_node(gfp_mask, -1); +} +EXPORT_SYMBOL(blk_alloc_queue); + +request_queue_t *blk_alloc_queue_node(int gfp_mask, int node_id) +{ + request_queue_t *q; + q = kmem_cache_alloc_node(requestq_cachep, gfp_mask, node_id); if (!q) return NULL; @@ -1671,8 +1680,7 @@ request_queue_t *blk_alloc_queue(int gfp_mask) return q; } - -EXPORT_SYMBOL(blk_alloc_queue); +EXPORT_SYMBOL(blk_alloc_queue_node); /** * blk_init_queue - prepare a request queue for use with a block device @@ -1705,13 +1713,22 @@ EXPORT_SYMBOL(blk_alloc_queue); * blk_init_queue() must be paired with a blk_cleanup_queue() call * when the block device is deactivated (such as at module unload). **/ + request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) { - request_queue_t *q = blk_alloc_queue(GFP_KERNEL); + return blk_init_queue_node(rfn, lock, -1); +} +EXPORT_SYMBOL(blk_init_queue); + +request_queue_t * +blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) +{ + request_queue_t *q = blk_alloc_queue_node(GFP_KERNEL, node_id); if (!q) return NULL; + q->node = node_id; if (blk_init_free_list(q)) goto out_init; @@ -1754,8 +1771,7 @@ request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) kmem_cache_free(requestq_cachep, q); return NULL; } - -EXPORT_SYMBOL(blk_init_queue); +EXPORT_SYMBOL(blk_init_queue_node); int blk_get_queue(request_queue_t *q) { diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 3302cd8eab4c..d6f934886b04 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -1215,7 +1215,8 @@ static int ide_disk_probe(struct device *dev) if (!idkp) goto failed; - g = alloc_disk(1 << PARTN_BITS); + g = alloc_disk_node(1 << PARTN_BITS, + pcibus_to_node(drive->hwif->pci_dev->bus)); if (!g) goto out_free_idkp; diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 5d876f53c697..7df85af75371 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -977,8 +977,9 @@ static int ide_init_queue(ide_drive_t *drive) * limits and LBA48 we could raise it but as yet * do not. 
*/ - - q = blk_init_queue(do_ide_request, &ide_lock); + + q = blk_init_queue_node(do_ide_request, &ide_lock, + pcibus_to_node(drive->hwif->pci_dev->bus)); if (!q) return 1; @@ -1095,7 +1096,8 @@ static int init_irq (ide_hwif_t *hwif) hwgroup->hwif->next = hwif; spin_unlock_irq(&ide_lock); } else { - hwgroup = kmalloc(sizeof(ide_hwgroup_t),GFP_KERNEL); + hwgroup = kmalloc_node(sizeof(ide_hwgroup_t), GFP_KERNEL, + pcibus_to_node(hwif->drives[0].hwif->pci_dev->bus)); if (!hwgroup) goto out_up; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4a99b76c5a33..235c3414d268 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -396,6 +396,7 @@ struct request_queue */ unsigned int sg_timeout; unsigned int sg_reserved_size; + int node; struct list_head drain_list; @@ -615,6 +616,8 @@ static inline void blkdev_dequeue_request(struct request *req) /* * Access functions for manipulating queue properties */ +extern request_queue_t *blk_init_queue_node(request_fn_proc *rfn, + spinlock_t *lock, int node_id); extern request_queue_t *blk_init_queue(request_fn_proc *, spinlock_t *); extern void blk_cleanup_queue(request_queue_t *); extern void blk_queue_make_request(request_queue_t *, make_request_fn *); @@ -646,7 +649,8 @@ extern void blk_wait_queue_drained(request_queue_t *, int); extern void blk_finish_queue_drain(request_queue_t *); int blk_get_queue(request_queue_t *); -request_queue_t *blk_alloc_queue(int); +request_queue_t *blk_alloc_queue(int gfp_mask); +request_queue_t *blk_alloc_queue_node(int,int); #define blk_put_queue(q) blk_cleanup_queue((q)) /* diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 47dedaf971d6..af26dc718ef6 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -403,6 +403,7 @@ extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev); extern void add_partition(struct gendisk *, int, sector_t, sector_t); extern void delete_partition(struct gendisk *, int); +extern struct gendisk *alloc_disk_node(int minors, int node_id); extern struct gendisk *alloc_disk(int minors); extern struct kobject *get_disk(struct gendisk *disk); extern void put_disk(struct gendisk *disk); diff --git a/include/linux/ide.h b/include/linux/ide.h index 336d6e509f59..92129078d4f3 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -917,7 +917,7 @@ typedef struct hwif_s { unsigned dma; void (*led_act)(void *data, int rw); -} ide_hwif_t; +} ____cacheline_maxaligned_in_smp ide_hwif_t; /* * internal ide interrupt handler type diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 4a36edf1c974..796220ce47cc 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -20,9 +20,14 @@ typedef struct mempool_s { mempool_free_t *free; wait_queue_head_t wait; } mempool_t; -extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data); -extern int mempool_resize(mempool_t *pool, int new_min_nr, unsigned int __nocast gfp_mask); + +extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data); +extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data, int nid); + +extern int mempool_resize(mempool_t *pool, int new_min_nr, + unsigned int __nocast gfp_mask); extern void mempool_destroy(mempool_t *pool); extern void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask); extern void mempool_free(void *element, mempool_t *pool); diff --git 
a/mm/mempool.c b/mm/mempool.c index c9f3d4620428..920c8c3ab1b8 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -51,16 +51,23 @@ static void free_pool(mempool_t *pool) * functions might sleep - as long as the mempool_alloc function is not called * from IRQ contexts. */ -mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, +mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data) { - mempool_t *pool; + return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1); +} +EXPORT_SYMBOL(mempool_create); - pool = kmalloc(sizeof(*pool), GFP_KERNEL); +mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data, int node_id) +{ + mempool_t *pool; + pool = kmalloc_node(sizeof(*pool), GFP_KERNEL, node_id); if (!pool) return NULL; memset(pool, 0, sizeof(*pool)); - pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL); + pool->elements = kmalloc_node(min_nr * sizeof(void *), + GFP_KERNEL, node_id); if (!pool->elements) { kfree(pool); return NULL; @@ -87,7 +94,7 @@ mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, } return pool; } -EXPORT_SYMBOL(mempool_create); +EXPORT_SYMBOL(mempool_create_node); /** * mempool_resize - resize an existing memory pool From 7c1def1652c6c1a95eafca2991baace34afaed0f Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 23 Jun 2005 00:08:21 -0700 Subject: [PATCH 071/199] [PATCH] i386: never block forced SIGSEGV This problem was first noticed on PPC and has already been fixed there. But the exact same issue applies to other platforms in the same way. The signal blocking for sa_mask and the handled signal takes place after the handler setup. When the stack is bogus, the handler setup forces a SIGSEGV. But then this will be blocked, and returning to user mode will fault again and iterate. This patch fixes the problem by checking whether signal handler setup failed, and not doing the signal-blocking if so. This copies what was done in the ppc code. I think all architectures' signal handler setup code follows this pattern and needs the change. 
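The fix is easiest to see as a control-flow pattern. Below is a minimal stand-alone C sketch of that pattern; all names are illustrative stubs, not the kernel API (the real code deals with struct k_sigaction, copy_to_user() failures and force_sigsegv(), none of which are modeled here):

#include <stdio.h>

static int frame_setup_ok;	/* stands in for "the user stack was usable" */

/* Like the patched setup_frame()/setup_rt_frame(): report success
 * instead of returning void. */
static int setup_frame(int sig)
{
	if (!frame_setup_ok) {
		/* kernel: force_sigsegv(sig, current) -- the forced
		 * SIGSEGV must remain deliverable */
		printf("frame setup failed, forcing SIGSEGV\n");
		return 0;
	}
	printf("frame set up for signal %d\n", sig);
	return 1;
}

static void handle_signal(int sig)
{
	int ret = setup_frame(sig);

	/* The bug was blocking unconditionally here: with a bogus stack
	 * the forced SIGSEGV got blocked too, so returning to user mode
	 * faulted and iterated forever. Block only on success. */
	if (ret)
		printf("blocking sa_mask and signal %d\n", sig);
}

int main(void)
{
	frame_setup_ok = 0;	/* simulate a bogus user stack */
	handle_signal(11);
	return 0;
}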
Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/signal.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index ea46d028af08..344400787c39 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -346,8 +346,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) extern void __user __kernel_sigreturn; extern void __user __kernel_rt_sigreturn; -static void setup_frame(int sig, struct k_sigaction *ka, - sigset_t *set, struct pt_regs * regs) +static int setup_frame(int sig, struct k_sigaction *ka, + sigset_t *set, struct pt_regs * regs) { void __user *restorer; struct sigframe __user *frame; @@ -429,13 +429,14 @@ static void setup_frame(int sig, struct k_sigaction *ka, current->comm, current->pid, frame, regs->eip, frame->pretcode); #endif - return; + return 1; give_sigsegv: force_sigsegv(sig, current); + return 0; } -static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs * regs) { void __user *restorer; @@ -522,20 +523,23 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, current->comm, current->pid, frame, regs->eip, frame->pretcode); #endif - return; + return 1; give_sigsegv: force_sigsegv(sig, current); + return 0; } /* * OK, we're invoking a handler */ -static void +static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, sigset_t *oldset, struct pt_regs * regs) { + int ret; + /* Are we from a system call? */ if (regs->orig_eax >= 0) { /* If so, check system call restarting.. */ @@ -569,17 +573,19 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, /* Set up the stack frame */ if (ka->sa.sa_flags & SA_SIGINFO) - setup_rt_frame(sig, ka, info, oldset, regs); + ret = setup_rt_frame(sig, ka, info, oldset, regs); else - setup_frame(sig, ka, oldset, regs); + ret = setup_frame(sig, ka, oldset, regs); - if (!(ka->sa.sa_flags & SA_NODEFER)) { + if (ret && !(ka->sa.sa_flags & SA_NODEFER)) { spin_lock_irq(¤t->sighand->siglock); sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); sigaddset(¤t->blocked,sig); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); } + + return ret; } /* @@ -622,8 +628,7 @@ int fastcall do_signal(struct pt_regs *regs, sigset_t *oldset) } /* Whee! Actually deliver the signal. */ - handle_signal(signr, &info, &ka, oldset, regs); - return 1; + return handle_signal(signr, &info, &ka, oldset, regs); } no_signal: From ca05fea6db5259c6d62e517c41d448a4249175f4 Mon Sep 17 00:00:00 2001 From: Natalie Protasevich Date: Thu, 23 Jun 2005 00:08:22 -0700 Subject: [PATCH 072/199] [PATCH] Do not enforce unique IO_APIC_ID check for xAPIC systems (i386) This patch is per Andi's request to remove NO_IOAPIC_CHECK from genapic and use heuristics to prevent unique I/O APIC ID check for systems that don't need it. The patch disables unique I/O APIC ID check for Xeon-based and other platforms that don't use serial APIC bus for interrupt delivery. Andi stated that AMD systems don't need unique IO_APIC_IDs either. 
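The heuristic reduces to a single predicate. Here is a stand-alone sketch (the vendor constant is a local stand-in; the kernel tests boot_cpu_data.x86_vendor and boot_cpu_data.x86 inline):

#include <stdbool.h>
#include <stdio.h>

#define VENDOR_INTEL 0	/* illustrative value only */

/* Unique I/O APIC IDs only matter where the serial APIC bus delivers
 * interrupts: Intel CPUs before family 15 (pre-Pentium 4). xAPIC
 * systems and AMD systems can skip the uniqueness check. */
static bool needs_unique_ioapic_ids(int vendor, int family)
{
	return vendor == VENDOR_INTEL && family < 15;
}

int main(void)
{
	printf("Intel family 6 (P6):  %d\n",
	       needs_unique_ioapic_ids(VENDOR_INTEL, 6));
	printf("Intel family 15 (P4): %d\n",
	       needs_unique_ioapic_ids(VENDOR_INTEL, 15));
	return 0;
}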
Signed-off-by: Natalie Protasevich Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/io_apic.c | 10 ++++++---- arch/i386/kernel/mpparse.c | 5 ++++- include/asm-i386/genapic.h | 1 - include/asm-i386/mach-bigsmp/mach_apic.h | 2 -- include/asm-i386/mach-default/mach_apic.h | 2 -- include/asm-i386/mach-es7000/mach_apic.h | 2 -- include/asm-i386/mach-generic/mach_apic.h | 1 - include/asm-i386/mach-numaq/mach_apic.h | 2 -- include/asm-i386/mach-summit/mach_apic.h | 2 -- include/asm-i386/mach-visws/mach_apic.h | 2 -- 10 files changed, 10 insertions(+), 19 deletions(-) diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 7a324e8b86f9..51f7a5d8721f 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -1658,6 +1658,12 @@ static void __init setup_ioapic_ids_from_mpc(void) unsigned char old_id; unsigned long flags; + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86 < 15)) + return; /* * This is broken; anything with a real cpu count has to * circumvent this idiocy regardless. @@ -1684,10 +1690,6 @@ static void __init setup_ioapic_ids_from_mpc(void) mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; } - /* Don't check I/O APIC IDs for some xAPIC systems. They have - * no meaning without the serial APIC bus. */ - if (NO_IOAPIC_CHECK) - continue; /* * Sanity check, is the ID really free? Every APIC in a * system must have a unique ID or we get lots of nice diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index 1347ab4939e7..0a061056b828 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -914,7 +914,10 @@ void __init mp_register_ioapic ( mp_ioapics[idx].mpc_apicaddr = address; set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 < 15)) + mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); + else + mp_ioapics[idx].mpc_apicid = id; mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); /* diff --git a/include/asm-i386/genapic.h b/include/asm-i386/genapic.h index fc813b2e8274..b3783a32abee 100644 --- a/include/asm-i386/genapic.h +++ b/include/asm-i386/genapic.h @@ -78,7 +78,6 @@ struct genapic { .int_delivery_mode = INT_DELIVERY_MODE, \ .int_dest_mode = INT_DEST_MODE, \ .no_balance_irq = NO_BALANCE_IRQ, \ - .no_ioapic_check = NO_IOAPIC_CHECK, \ .ESR_DISABLE = esr_disable, \ .apic_destination_logical = APIC_DEST_LOGICAL, \ APICFUNC(apic_id_registered), \ diff --git a/include/asm-i386/mach-bigsmp/mach_apic.h b/include/asm-i386/mach-bigsmp/mach_apic.h index 2339868270ef..ba936d4daedb 100644 --- a/include/asm-i386/mach-bigsmp/mach_apic.h +++ b/include/asm-i386/mach-bigsmp/mach_apic.h @@ -14,8 +14,6 @@ #define NO_BALANCE_IRQ (1) #define esr_disable (1) -#define NO_IOAPIC_CHECK (0) - static inline int apic_id_registered(void) { return (1); diff --git a/include/asm-i386/mach-default/mach_apic.h b/include/asm-i386/mach-default/mach_apic.h index 627f1cd084ba..3ef6292db780 100644 --- a/include/asm-i386/mach-default/mach_apic.h +++ b/include/asm-i386/mach-default/mach_apic.h @@ -19,8 +19,6 @@ static inline cpumask_t target_cpus(void) #define NO_BALANCE_IRQ (0) #define esr_disable (0) -#define NO_IOAPIC_CHECK (0) - #define INT_DELIVERY_MODE dest_LowestPrio #define INT_DEST_MODE 1 /* logical delivery broadcast to 
all procs */ diff --git a/include/asm-i386/mach-es7000/mach_apic.h b/include/asm-i386/mach-es7000/mach_apic.h index ceab2c464b13..b5f3f0d0b2bc 100644 --- a/include/asm-i386/mach-es7000/mach_apic.h +++ b/include/asm-i386/mach-es7000/mach_apic.h @@ -38,8 +38,6 @@ static inline cpumask_t target_cpus(void) #define WAKE_SECONDARY_VIA_INIT #endif -#define NO_IOAPIC_CHECK (1) - static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) { return 0; diff --git a/include/asm-i386/mach-generic/mach_apic.h b/include/asm-i386/mach-generic/mach_apic.h index ab36d02ebede..b13767a4e934 100644 --- a/include/asm-i386/mach-generic/mach_apic.h +++ b/include/asm-i386/mach-generic/mach_apic.h @@ -5,7 +5,6 @@ #define esr_disable (genapic->ESR_DISABLE) #define NO_BALANCE_IRQ (genapic->no_balance_irq) -#define NO_IOAPIC_CHECK (genapic->no_ioapic_check) #define INT_DELIVERY_MODE (genapic->int_delivery_mode) #define INT_DEST_MODE (genapic->int_dest_mode) #undef APIC_DEST_LOGICAL diff --git a/include/asm-i386/mach-numaq/mach_apic.h b/include/asm-i386/mach-numaq/mach_apic.h index e1a04494764a..9d158095da82 100644 --- a/include/asm-i386/mach-numaq/mach_apic.h +++ b/include/asm-i386/mach-numaq/mach_apic.h @@ -17,8 +17,6 @@ static inline cpumask_t target_cpus(void) #define NO_BALANCE_IRQ (1) #define esr_disable (1) -#define NO_IOAPIC_CHECK (0) - #define INT_DELIVERY_MODE dest_LowestPrio #define INT_DEST_MODE 0 /* physical delivery on LOCAL quad */ diff --git a/include/asm-i386/mach-summit/mach_apic.h b/include/asm-i386/mach-summit/mach_apic.h index 74e9cbc8c01b..3d6d12937e1f 100644 --- a/include/asm-i386/mach-summit/mach_apic.h +++ b/include/asm-i386/mach-summit/mach_apic.h @@ -7,8 +7,6 @@ #define esr_disable (1) #define NO_BALANCE_IRQ (0) -#define NO_IOAPIC_CHECK (1) /* Don't check I/O APIC ID for xAPIC */ - /* In clustered mode, the high nibble of APIC ID is a cluster number. * The low nibble is a 4-bit bitmap. */ #define XAPIC_DEST_CPUS_SHIFT 4 diff --git a/include/asm-i386/mach-visws/mach_apic.h b/include/asm-i386/mach-visws/mach_apic.h index 4e6cdfb8b091..de438c7147a8 100644 --- a/include/asm-i386/mach-visws/mach_apic.h +++ b/include/asm-i386/mach-visws/mach_apic.h @@ -9,8 +9,6 @@ #define no_balance_irq (0) #define esr_disable (0) -#define NO_IOAPIC_CHECK (0) - #define INT_DELIVERY_MODE dest_LowestPrio #define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ From 7fbb4f6e6873593a2defb8f66512f55d08d88106 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 23 Jun 2005 00:08:23 -0700 Subject: [PATCH 073/199] [PATCH] adjust i386 watchdog tick calculation Get the i386 watchdog tick calculation into a state where it can also be used on CPUs with frequencies beyond 4GHz, and consolidate the calculation into a single place (for potential future adjustments).
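The 4GHz limit falls out of the arithmetic: the reload value is cpu_khz*1000/nmi_hz cycles per tick, and the 32-bit product cpu_khz*1000 wraps once cpu_khz exceeds 4294967, i.e. at roughly 4.3GHz. A stand-alone sketch of the consolidated 64-bit calculation (plain 64-bit division stands in for the kernel's do_div(); the kernel then writes -count to the perfctr MSR so that it overflows, and raises the NMI, after count cycles):

#include <stdint.h>
#include <stdio.h>

/* Per-tick perfctr reload value, computed in 64 bits the way the new
 * write_watchdog_counter() helper does. */
static uint64_t watchdog_count(uint64_t cpu_khz, unsigned int nmi_hz)
{
	uint64_t count = cpu_khz * 1000;	/* cycles per second, 64-bit */

	count /= nmi_hz;			/* cycles per watchdog tick */
	return count;
}

int main(void)
{
	uint32_t khz = 4500000;			/* a 4.5 GHz CPU */

	/* 64-bit math: correct */
	printf("count = %llu\n",
	       (unsigned long long)watchdog_count(khz, 1000));
	/* the old 32-bit expression truncates cpu_khz*1000 first: */
	printf("32-bit cpu_khz*1000 = %u (should be 4500000000)\n",
	       (uint32_t)(khz * 1000));
	return 0;
}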
Signed-off-by: Jan Beulich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/nmi.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 2c0ee9c2d020..da6c46d667cb 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -28,8 +28,7 @@ #include #include -#include -#include +#include #include #include "mach_traps.h" @@ -324,6 +323,16 @@ static void clear_msr_range(unsigned int base, unsigned int n) wrmsr(base+i, 0, 0); } +static inline void write_watchdog_counter(const char *descr) +{ + u64 count = (u64)cpu_khz * 1000; + + do_div(count, nmi_hz); + if(descr) + Dprintk("setting %s to -0x%08Lx\n", descr, count); + wrmsrl(nmi_perfctr_msr, 0 - count); +} + static void setup_k7_watchdog(void) { unsigned int evntsel; @@ -339,8 +348,7 @@ static void setup_k7_watchdog(void) | K7_NMI_EVENT; wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); - Dprintk("setting K7_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000)); - wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1); + write_watchdog_counter("K7_PERFCTR0"); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= K7_EVNTSEL_ENABLE; wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); @@ -361,8 +369,7 @@ static void setup_p6_watchdog(void) | P6_NMI_EVENT; wrmsr(MSR_P6_EVNTSEL0, evntsel, 0); - Dprintk("setting P6_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000)); - wrmsr(MSR_P6_PERFCTR0, -(cpu_khz/nmi_hz*1000), 0); + write_watchdog_counter("P6_PERFCTR0"); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= P6_EVNTSEL0_ENABLE; wrmsr(MSR_P6_EVNTSEL0, evntsel, 0); @@ -402,8 +409,7 @@ static int setup_p4_watchdog(void) wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); - Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000)); - wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1); + write_watchdog_counter("P4_IQ_COUNTER0"); apic_write(APIC_LVTPC, APIC_DM_NMI); wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); return 1; @@ -518,7 +524,7 @@ void nmi_watchdog_tick (struct pt_regs * regs) * other P6 variant */ apic_write(APIC_LVTPC, APIC_DM_NMI); } - wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1); + write_watchdog_counter(NULL); } } From 799d19f6ec5ca2102c61122f5219a17f1c4e961a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 23 Jun 2005 00:08:24 -0700 Subject: [PATCH 074/199] [PATCH] allow early printk to use more than 25 lines Allow early printk code to take advantage of the full size of the screen, not just the first 25 lines. 
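A stand-alone sketch of the geometry probe (the struct mirrors the boot-time screen_info fields the patch reads via SCREEN_INFO, but is a local stub here):

#include <stdio.h>

struct screen_info_sketch {
	int orig_video_isVGA;
	int orig_video_cols;
	int orig_video_lines;
};

static int max_xpos = 80, max_ypos = 25;	/* old hard-coded limits */

static void early_vga_setup(const struct screen_info_sketch *si)
{
	/* Only trust the BIOS-reported geometry on a real VGA console */
	if (si->orig_video_isVGA == 1) {
		max_xpos = si->orig_video_cols;
		max_ypos = si->orig_video_lines;
	}
}

int main(void)
{
	struct screen_info_sketch si = { 1, 80, 50 };	/* e.g. an extended text mode */

	early_vga_setup(&si);
	printf("early console: %dx%d\n", max_xpos, max_ypos);
	return 0;
}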
Signed-off-by: Jan Beulich Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/early_printk.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index e3a19e8ebbf8..9631c747c5e3 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -2,20 +2,24 @@ #include #include #include +#include #include #include /* Simple VGA output */ #ifdef __i386__ +#include #define VGABASE (__ISA_IO_base + 0xb8000) #else +#include #define VGABASE ((void __iomem *)0xffffffff800b8000UL) #endif -#define MAX_YPOS 25 -#define MAX_XPOS 80 +#define MAX_YPOS max_ypos +#define MAX_XPOS max_xpos +static int max_ypos = 25, max_xpos = 80; static int current_ypos = 1, current_xpos = 0; static void early_vga_write(struct console *con, const char *str, unsigned n) @@ -196,7 +200,10 @@ int __init setup_early_printk(char *opt) } else if (!strncmp(buf, "ttyS", 4)) { early_serial_init(buf); early_console = &early_serial_console; - } else if (!strncmp(buf, "vga", 3)) { + } else if (!strncmp(buf, "vga", 3) + && SCREEN_INFO.orig_video_isVGA == 1) { + max_xpos = SCREEN_INFO.orig_video_cols; + max_ypos = SCREEN_INFO.orig_video_lines; early_console = &early_vga_console; } early_console_initialized = 1; From 59121003721a8fad11ee72e646fd9d3076b5679c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 23 Jun 2005 00:08:25 -0700 Subject: [PATCH 075/199] [PATCH] i386: Selectable Frequency of the Timer Interrupt Make the timer frequency selectable. The timer interrupt may cause bus and memory contention in large NUMA systems since the interrupt occurs on each processor HZ times per second. Signed-off-by: Christoph Lameter Signed-off-by: Shai Fultheim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig | 2 ++ arch/x86_64/Kconfig | 2 ++ include/asm-i386/param.h | 4 +++- include/asm-x86_64/param.h | 6 +++-- kernel/Kconfig.hz | 46 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 57 insertions(+), 3 deletions(-) create mode 100644 kernel/Kconfig.hz diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index bfdcedef06e1..d4ae5f9ceae6 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -961,6 +961,8 @@ config SECCOMP If unsure, say Y. Only embedded should say N here. +source kernel/Kconfig.hz + endmenu diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 61ed16652347..db259757dc8a 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -402,6 +402,8 @@ config SECCOMP If unsure, say Y. Only embedded should say N here. +source kernel/Kconfig.hz + endmenu # diff --git a/include/asm-i386/param.h b/include/asm-i386/param.h index b6440526e42a..fa02e67ea86b 100644 --- a/include/asm-i386/param.h +++ b/include/asm-i386/param.h @@ -1,8 +1,10 @@ +#include + #ifndef _ASMi386_PARAM_H #define _ASMi386_PARAM_H #ifdef __KERNEL__ -# define HZ 1000 /* Internal kernel timer frequency */ +# define HZ CONFIG_HZ /* Internal kernel timer frequency */ # define USER_HZ 100 /* .. some user interfaces are in "ticks" */ # define CLOCKS_PER_SEC (USER_HZ) /* like times() */ #endif diff --git a/include/asm-x86_64/param.h b/include/asm-x86_64/param.h index b707f0568c9e..40b11937180d 100644 --- a/include/asm-x86_64/param.h +++ b/include/asm-x86_64/param.h @@ -1,9 +1,11 @@ +#include + #ifndef _ASMx86_64_PARAM_H #define _ASMx86_64_PARAM_H #ifdef __KERNEL__ -# define HZ 1000 /* Internal kernel timer frequency */ -# define USER_HZ 100 /* .. 
some user interfaces are in "ticks */ +# define HZ CONFIG_HZ /* Internal kernel timer frequency */ +# define USER_HZ 100 /* .. some user interfaces are in "ticks */ #define CLOCKS_PER_SEC (USER_HZ) /* like times() */ #endif diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz new file mode 100644 index 000000000000..248e1c396f8b --- /dev/null +++ b/kernel/Kconfig.hz @@ -0,0 +1,46 @@ +# +# Timer Interrupt Frequency Configuration +# + +choice + prompt "Timer frequency" + default HZ_250 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 HZ but 100 HZ may be more + beneficial for servers and NUMA systems that do not need to have + a fast response for user interaction and that may experience bus + contention and cacheline bounces as a result of timer interrupts. + Note that the timer interrupt occurs on each processor in an SMP + environment leading to NR_CPUS * HZ number of timer interrupts + per second. + + + config HZ_100 + bool "100 HZ" + help + 100 HZ is a typical choice for servers, SMP and NUMA systems + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + + config HZ_250 + bool "250 HZ" + help + 250 HZ is a good compromise choice allowing server performance + while also showing good interactive responsiveness even + on SMP and NUMA systems. + + config HZ_1000 + bool "1000 HZ" + help + 1000 HZ is the preferred choice for desktop systems and other + systems requiring fast interactive responses to events. + +endchoice + +config HZ + int + default 100 if HZ_100 + default 250 if HZ_250 + default 1000 if HZ_1000 + From b5d23e5b8c7ecd97d32f6ad7680d9909977580a7 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 23 Jun 2005 00:08:27 -0700 Subject: [PATCH 076/199] [PATCH] ia64: Selectable Timer Interrupt Frequency It allows a selectable timer interrupt frequency of 100, 250 and 1000 HZ. Reducing the timer frequency may have important performance benefits on large systems. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/Kconfig | 2 ++ include/asm-ia64/param.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 9f6a46caba9b..01b78e7f992e 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -161,6 +161,8 @@ config IA64_PAGE_SIZE_64KB endchoice +source kernel/Kconfig.hz + config IA64_BRL_EMU bool depends on ITANIUM diff --git a/include/asm-ia64/param.h b/include/asm-ia64/param.h index 6c6b679b7a9e..5e1e0d2d7baf 100644 --- a/include/asm-ia64/param.h +++ b/include/asm-ia64/param.h @@ -27,7 +27,7 @@ */ # define HZ 32 # else -# define HZ 1024 +# define HZ CONFIG_HZ # endif # define USER_HZ HZ # define CLOCKS_PER_SEC HZ /* frequency at which times() counts */ From c434b7a6aedfe428ad17cd61b21b125a7b7a29ce Mon Sep 17 00:00:00 2001 From: Natalie Protasevich Date: Thu, 23 Jun 2005 00:08:29 -0700 Subject: [PATCH 077/199] [PATCH] x86: avoid wasting IRQs for PCI devices I have submitted the patch for x86_64; this is the submission for i386. The patch changes the way IRQs are handed out to PCI devices. Currently, each I/O APIC pin gets associated with an IRQ, no matter if the pin is used or not. This imposes a severe limitation on systems whose designs employ many I/O APICs while utilizing only a couple of lines on each, such as the P64H2 chipset. It is used in the ES7000, and currently there is no way to boot the system with more than 9 I/O APICs.
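To make the allocation scheme concrete before the description continues: a stand-alone sketch of dense IRQ hand-out (the table and counter mirror the gsi_to_irq[] and pci_irq variables the patch introduces; ISA handling and bounds checks are simplified):

#include <stdio.h>

#define MAX_GSI_NUM 4096

/* Compacting GSI -> IRQ map: IRQs 0..15 stay 1:1 for legacy ISA, PCI
 * interrupts are handed out densely from 16 up, so unused I/O APIC
 * pins no longer burn IRQ numbers. Simplified sketch, not the kernel
 * function. */
static int gsi_to_irq[MAX_GSI_NUM];
static int next_pci_irq = 16;

static int register_gsi(int gsi)
{
	if (gsi < 16 || gsi >= MAX_GSI_NUM)
		return gsi;		/* legacy slot, or out of table range */
	if (gsi_to_irq[gsi] == 0)	/* first sighting: allocate next IRQ */
		gsi_to_irq[gsi] = next_pci_irq++;
	return gsi_to_irq[gsi];
}

int main(void)
{
	/* Two sparsely used I/O APICs: pins at GSIs 24 and 56 */
	printf("GSI 24 -> IRQ %d\n", register_gsi(24));
	printf("GSI 56 -> IRQ %d\n", register_gsi(56));
	printf("GSI 24 -> IRQ %d (stable)\n", register_gsi(24));
	return 0;
}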
The simple change below allows booting a system with, say, 64 (or more) I/O APICs, each providing one slot, which is otherwise impossible because of the IRQ gaps created for unused lines on each I/O APIC. It does not resolve the problem of the number of devices exceeding the number of possible IRQs, but it eases the pressure on IRQs on any large system with a potentially large number of devices. Signed-off-by: Natalie Protasevich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/mpparse.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index 0a061056b828..383a11600d2c 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -1058,11 +1058,20 @@ void __init mp_config_acpi_legacy_irqs (void) } } +#define MAX_GSI_NUM 4096 + int mp_register_gsi (u32 gsi, int edge_level, int active_high_low) { int ioapic = -1; int ioapic_pin = 0; int idx, bit = 0; + static int pci_irq = 16; + /* + * Mapping between Global System Interrups, which + * represent all possible interrupts, and IRQs + * assigned to actual devices. + */ + static int gsi_to_irq[MAX_GSI_NUM]; #ifdef CONFIG_ACPI_BUS /* Don't set up the ACPI SCI because it's already set up */ @@ -1097,11 +1106,26 @@ int mp_register_gsi (u32 gsi, int edge_level, int active_high_low) if ((1< Date: Thu, 23 Jun 2005 00:08:29 -0700 Subject: [PATCH 078/199] [PATCH] VIA 82C586B IRQ routing fix According to the VIA 82C586B datasheet (still available from http://gkernel.sourceforge.net/specs/via/586b.pdf.bz2) this chip needs a special PIRQ mapping. Signed-off-by: Karsten Keil Signed-off-by: Aleksey Gorelov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/pci/irq.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/i386/pci/irq.c b/arch/i386/pci/irq.c index da21b1d07c15..83458f81e661 100644 --- a/arch/i386/pci/irq.c +++ b/arch/i386/pci/irq.c @@ -226,6 +226,24 @@ static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) return 1; } +/* + * The VIA pirq rules are nibble-based, like ALI, + * but without the ugly irq number munging. + * However, for 82C586, nibble map is different . + */ +static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + static unsigned int pirqmap[4] = { 3, 2, 5, 1 }; + return read_config_nybble(router, 0x55, pirqmap[pirq-1]); +} + +static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + static unsigned int pirqmap[4] = { 3, 2, 5, 1 }; + write_config_nybble(router, 0x55, pirqmap[pirq-1], irq); + return 1; +} + /* * ITE 8330G pirq rules are nibble-based * FIXME: pirqmap may be { 1, 0, 3, 2 }, @@ -512,6 +530,10 @@ static __init int via_router_probe(struct irq_router *r, struct pci_dev *router, switch(device) { case PCI_DEVICE_ID_VIA_82C586_0: + r->name = "VIA"; + r->get = pirq_via586_get; + r->set = pirq_via586_set; + return 1; case PCI_DEVICE_ID_VIA_82C596: case PCI_DEVICE_ID_VIA_82C686: case PCI_DEVICE_ID_VIA_8231: From a9ed8817966dd723754a990f1003264a21b5747e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 23 Jun 2005 00:08:32 -0700 Subject: [PATCH 079/199] [PATCH] x86: #include asm/uaccess.h in asm/checksum.h csum_and_copy_to_user is static inline and uses VERIFY_WRITE. The patch allows removing asm/uaccess.h from i386_ksyms.c without dependency surprises.
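The rule being applied is the general one for static inline helpers: a header must itself include whatever its inline bodies use, since those bodies are compiled into every includer. A stand-alone sketch with both "headers" inlined into one file (all names are stand-ins for asm/uaccess.h and asm/checksum.h):

#include <stdio.h>

/* --- stands in for asm/uaccess.h --------------------------------- */
#define VERIFY_WRITE 1
static inline int access_ok(int type, void *addr, unsigned long size)
{
	(void)type; (void)addr; (void)size;
	return 1;	/* always "ok" in this sketch */
}

/* --- stands in for asm/checksum.h -------------------------------- */
/* Because csum_and_copy_to_user is a static inline, its body, and its
 * use of VERIFY_WRITE, is compiled into every file that includes the
 * header. Hence checksum.h must pull in uaccess.h itself (the fix),
 * instead of relying on i386_ksyms.c or any other includer to do it. */
static inline unsigned int csum_and_copy_to_user(const void *src,
						 void *dst, int len)
{
	if (!access_ok(VERIFY_WRITE, dst, len))
		return 0;
	(void)src;
	return (unsigned int)len;	/* checksum elided in this sketch */
}

int main(void)
{
	char buf[8];

	printf("csum stub: %u\n", csum_and_copy_to_user("hi", buf, 2));
	return 0;
}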
Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/checksum.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/asm-i386/checksum.h b/include/asm-i386/checksum.h index 641342002bcd..f949e44c2a35 100644 --- a/include/asm-i386/checksum.h +++ b/include/asm-i386/checksum.h @@ -3,6 +3,8 @@ #include +#include + /* * computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit) From 129f69465b411592247c408f93d7106939223be1 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 23 Jun 2005 00:08:33 -0700 Subject: [PATCH 080/199] [PATCH] Remove i386_ksyms.c, almost. * EXPORT_SYMBOL's moved to other files * #include , where needed * #include's in i386_ksyms.c cleaned up * After copy-paste, redundant due to Makefiles rules preprocessor directives removed: #ifdef CONFIG_FOO EXPORT_SYMBOL(foo); #endif obj-$(CONFIG_FOO) += foo.o * Tiny reformat to fit in 80 columns Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/i386_ksyms.c | 160 +--------------------------------- arch/i386/kernel/i387.c | 3 + arch/i386/kernel/io_apic.c | 3 +- arch/i386/kernel/pci-dma.c | 3 + arch/i386/kernel/process.c | 7 ++ arch/i386/kernel/reboot.c | 5 ++ arch/i386/kernel/setup.c | 20 +++++ arch/i386/kernel/smp.c | 3 + arch/i386/kernel/smpboot.c | 12 +++ arch/i386/kernel/time.c | 4 + arch/i386/kernel/traps.c | 3 + arch/i386/lib/dec_and_lock.c | 2 + arch/i386/lib/delay.c | 6 ++ arch/i386/lib/mmx.c | 5 ++ arch/i386/lib/usercopy.c | 8 +- arch/i386/mm/discontig.c | 3 + arch/i386/mm/highmem.c | 6 ++ arch/i386/mm/init.c | 1 + arch/i386/mm/ioremap.c | 5 +- arch/i386/pci/pcbios.c | 4 +- 20 files changed, 101 insertions(+), 162 deletions(-) diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c index 903190a4b3ff..180f070d03cb 100644 --- a/arch/i386/kernel/i386_ksyms.c +++ b/arch/i386/kernel/i386_ksyms.c @@ -1,97 +1,17 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include #include -#include -#include -#include -#include #include -#include -#include -#include -#include -#include - -extern void dump_thread(struct pt_regs *, struct user *); -extern spinlock_t rtc_lock; /* This is definitely a GPL-only symbol */ EXPORT_SYMBOL_GPL(cpu_gdt_table); -#if defined(CONFIG_APM_MODULE) -extern void machine_real_restart(unsigned char *, int); -EXPORT_SYMBOL(machine_real_restart); -extern void default_idle(void); -EXPORT_SYMBOL(default_idle); -#endif - -#ifdef CONFIG_SMP -extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); -extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); -#endif - -#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE) -extern struct drive_info_struct drive_info; -EXPORT_SYMBOL(drive_info); -#endif - -extern unsigned long cpu_khz; -extern unsigned long get_cmos_time(void); - -/* platform dependent support */ -EXPORT_SYMBOL(boot_cpu_data); -#ifdef CONFIG_DISCONTIGMEM -EXPORT_SYMBOL(node_data); -EXPORT_SYMBOL(physnode_map); -#endif -#ifdef CONFIG_X86_NUMAQ -EXPORT_SYMBOL(xquad_portio); -#endif -EXPORT_SYMBOL(dump_thread); -EXPORT_SYMBOL(dump_fpu); -EXPORT_SYMBOL_GPL(kernel_fpu_begin); -EXPORT_SYMBOL(__ioremap); -EXPORT_SYMBOL(ioremap_nocache); -EXPORT_SYMBOL(iounmap); -EXPORT_SYMBOL(kernel_thread); 
-EXPORT_SYMBOL(pm_idle); -EXPORT_SYMBOL(pm_power_off); -EXPORT_SYMBOL(get_cmos_time); -EXPORT_SYMBOL(cpu_khz); -EXPORT_SYMBOL(apm_info); - EXPORT_SYMBOL(__down_failed); EXPORT_SYMBOL(__down_failed_interruptible); EXPORT_SYMBOL(__down_failed_trylock); EXPORT_SYMBOL(__up_wakeup); /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); -/* Delay loops */ -EXPORT_SYMBOL(__ndelay); -EXPORT_SYMBOL(__udelay); -EXPORT_SYMBOL(__delay); -EXPORT_SYMBOL(__const_udelay); EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); @@ -105,87 +25,11 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strstr); -EXPORT_SYMBOL(strncpy_from_user); -EXPORT_SYMBOL(__strncpy_from_user); -EXPORT_SYMBOL(clear_user); -EXPORT_SYMBOL(__clear_user); -EXPORT_SYMBOL(__copy_from_user_ll); -EXPORT_SYMBOL(__copy_to_user_ll); -EXPORT_SYMBOL(strnlen_user); - -EXPORT_SYMBOL(dma_alloc_coherent); -EXPORT_SYMBOL(dma_free_coherent); - -#ifdef CONFIG_PCI -EXPORT_SYMBOL(pci_mem_start); -#endif - -#ifdef CONFIG_PCI_BIOS -EXPORT_SYMBOL(pcibios_set_irq_routing); -EXPORT_SYMBOL(pcibios_get_irq_routing_table); -#endif - -#ifdef CONFIG_X86_USE_3DNOW -EXPORT_SYMBOL(_mmx_memcpy); -EXPORT_SYMBOL(mmx_clear_page); -EXPORT_SYMBOL(mmx_copy_page); -#endif - -#ifdef CONFIG_X86_HT -EXPORT_SYMBOL(smp_num_siblings); -EXPORT_SYMBOL(cpu_sibling_map); -#endif - #ifdef CONFIG_SMP -EXPORT_SYMBOL(cpu_data); -EXPORT_SYMBOL(cpu_online_map); -EXPORT_SYMBOL(cpu_callout_map); +extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); +extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); EXPORT_SYMBOL(__write_lock_failed); EXPORT_SYMBOL(__read_lock_failed); - -/* Global SMP stuff */ -EXPORT_SYMBOL(smp_call_function); - -/* TLB flushing */ -EXPORT_SYMBOL(flush_tlb_page); -#endif - -#ifdef CONFIG_X86_IO_APIC -EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); -#endif - -#ifdef CONFIG_MCA -EXPORT_SYMBOL(machine_id); -#endif - -#ifdef CONFIG_VT -EXPORT_SYMBOL(screen_info); -#endif - -EXPORT_SYMBOL(get_wchan); - -EXPORT_SYMBOL(rtc_lock); - -EXPORT_SYMBOL_GPL(set_nmi_callback); -EXPORT_SYMBOL_GPL(unset_nmi_callback); - -EXPORT_SYMBOL(register_die_notifier); -#ifdef CONFIG_HAVE_DEC_LOCK -EXPORT_SYMBOL(_atomic_dec_and_lock); -#endif - -EXPORT_SYMBOL(__PAGE_KERNEL); - -#ifdef CONFIG_HIGHMEM -EXPORT_SYMBOL(kmap); -EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_to_page); -#endif - -#if defined(CONFIG_X86_SPEEDSTEP_SMI) || defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) -EXPORT_SYMBOL(ist_info); #endif EXPORT_SYMBOL(csum_partial); diff --git a/arch/i386/kernel/i387.c b/arch/i386/kernel/i387.c index c55e037f08f7..b817168d9c62 100644 --- a/arch/i386/kernel/i387.c +++ b/arch/i386/kernel/i387.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -79,6 +80,7 @@ void kernel_fpu_begin(void) } clts(); } +EXPORT_SYMBOL_GPL(kernel_fpu_begin); void restore_fpu( struct task_struct *tsk ) { @@ -526,6 +528,7 @@ int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) return fpvalid; } +EXPORT_SYMBOL(dump_fpu); int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu) { diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 51f7a5d8721f..08540bc4ba3e 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -31,7 +31,7 @@ #include #include #include - +#include #include #include #include @@ -812,6 +812,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) } return best_guess; } 
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); /* * This function currently is only a helper for the i386 smp boot process where diff --git a/arch/i386/kernel/pci-dma.c b/arch/i386/kernel/pci-dma.c index 4de2e03c7b45..1e51427cc9eb 100644 --- a/arch/i386/kernel/pci-dma.c +++ b/arch/i386/kernel/pci-dma.c @@ -11,6 +11,7 @@ #include #include #include +#include #include struct dma_coherent_mem { @@ -54,6 +55,7 @@ void *dma_alloc_coherent(struct device *dev, size_t size, } return ret; } +EXPORT_SYMBOL(dma_alloc_coherent); void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle) @@ -68,6 +70,7 @@ void dma_free_coherent(struct device *dev, size_t size, } else free_pages((unsigned long)vaddr, order); } +EXPORT_SYMBOL(dma_free_coherent); int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, dma_addr_t device_addr, size_t size, int flags) diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 96e3ea6b17c7..3c3f245cca53 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -73,6 +73,7 @@ unsigned long thread_saved_pc(struct task_struct *tsk) * Powermanagement idle function, if any.. */ void (*pm_idle)(void); +EXPORT_SYMBOL(pm_idle); static DEFINE_PER_CPU(unsigned int, cpu_idle_state); void disable_hlt(void) @@ -105,6 +106,9 @@ void default_idle(void) cpu_relax(); } } +#ifdef CONFIG_APM_MODULE +EXPORT_SYMBOL(default_idle); +#endif /* * On SMP it's slightly faster (but much more power-consuming!) @@ -325,6 +329,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) /* Ok, create the new process.. */ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); } +EXPORT_SYMBOL(kernel_thread); /* * Free current thread data structures etc.. @@ -508,6 +513,7 @@ void dump_thread(struct pt_regs * regs, struct user * dump) dump->u_fpvalid = dump_fpu (regs, &dump->i387); } +EXPORT_SYMBOL(dump_thread); /* * Capture the user space registers if the task is not running (in user space) @@ -731,6 +737,7 @@ unsigned long get_wchan(struct task_struct *p) } while (count++ < 16); return 0; } +EXPORT_SYMBOL(get_wchan); /* * sys_alloc_thread_area: get a yet unused TLS descriptor index. 
diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index 6dc27eb70ee7..db912209a8d3 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -2,6 +2,7 @@ * linux/arch/i386/kernel/reboot.c */ +#include #include #include #include @@ -19,6 +20,7 @@ * Power off function, if any */ void (*pm_power_off)(void); +EXPORT_SYMBOL(pm_power_off); static int reboot_mode; static int reboot_thru_bios; @@ -295,6 +297,9 @@ void machine_real_restart(unsigned char *code, int length) : : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100))); } +#ifdef CONFIG_APM_MODULE +EXPORT_SYMBOL(machine_real_restart); +#endif void machine_restart(char * __unused) { diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 0d689335c8d6..30406fd0b64c 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -23,6 +23,7 @@ * This file handles the architecture-dependent parts of initialization */ +#include #include #include #include @@ -74,6 +75,7 @@ EXPORT_SYMBOL(efi_enabled); struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; /* common cpu data for all cpus */ struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; +EXPORT_SYMBOL(boot_cpu_data); unsigned long mmu_cr4_features; @@ -91,12 +93,18 @@ extern acpi_interrupt_flags acpi_sci_flags; /* for MCA, but anyone else can use it if they want */ unsigned int machine_id; +#ifdef CONFIG_MCA +EXPORT_SYMBOL(machine_id); +#endif unsigned int machine_submodel_id; unsigned int BIOS_revision; unsigned int mca_pentium_flag; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0x10000000; +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_mem_start); +#endif /* Boot loader ID as an integer, for the benefit of proc_dointvec */ int bootloader_type; @@ -108,14 +116,26 @@ static unsigned int highmem_pages = -1; * Setup options */ struct drive_info_struct { char dummy[32]; } drive_info; +#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || \ + defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE) +EXPORT_SYMBOL(drive_info); +#endif struct screen_info screen_info; +#ifdef CONFIG_VT +EXPORT_SYMBOL(screen_info); +#endif struct apm_info apm_info; +EXPORT_SYMBOL(apm_info); struct sys_desc_table_struct { unsigned short length; unsigned char table[0]; }; struct edid_info edid_info; struct ist_info ist_info; +#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \ + defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) +EXPORT_SYMBOL(ist_info); +#endif struct e820map e820; extern void early_cpu_init(void); diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 6223c33ac91c..68be7d0c7238 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -452,6 +453,7 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) preempt_enable(); } +EXPORT_SYMBOL(flush_tlb_page); static void do_flush_tlb_all(void* info) { @@ -547,6 +549,7 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, return 0; } +EXPORT_SYMBOL(smp_call_function); static void stop_this_cpu (void * dummy) { diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index bc1bb6919e6a..aaad95e7f879 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -60,6 +60,9 @@ static int __initdata smp_b_stepping; /* Number of siblings per CPU package */ int smp_num_siblings = 1; +#ifdef CONFIG_X86_HT +EXPORT_SYMBOL(smp_num_siblings); +#endif int phys_proc_id[NR_CPUS]; /* 
Package ID of each logical CPU */ EXPORT_SYMBOL(phys_proc_id); int cpu_core_id[NR_CPUS]; /* Core ID of each logical CPU */ @@ -67,13 +70,16 @@ EXPORT_SYMBOL(cpu_core_id); /* bitmap of online cpus */ cpumask_t cpu_online_map; +EXPORT_SYMBOL(cpu_online_map); cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; +EXPORT_SYMBOL(cpu_callout_map); static cpumask_t smp_commenced_mask; /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; +EXPORT_SYMBOL(cpu_data); u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = 0xff }; @@ -885,8 +891,14 @@ static void smp_tune_scheduling (void) static int boot_cpu_logical_apicid; /* Where the IO area was mapped on multiquad, always 0 otherwise */ void *xquad_portio; +#ifdef CONFIG_X86_NUMAQ +EXPORT_SYMBOL(xquad_portio); +#endif cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; +#ifdef CONFIG_X86_HT +EXPORT_SYMBOL(cpu_sibling_map); +#endif cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; EXPORT_SYMBOL(cpu_core_map); diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index a0dcb7c87c30..8bc8363fbb4c 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -78,10 +78,12 @@ u64 jiffies_64 = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); unsigned long cpu_khz; /* Detected as we calibrate the TSC */ +EXPORT_SYMBOL(cpu_khz); extern unsigned long wall_jiffies; DEFINE_SPINLOCK(rtc_lock); +EXPORT_SYMBOL(rtc_lock); DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); @@ -324,6 +326,8 @@ unsigned long get_cmos_time(void) return retval; } +EXPORT_SYMBOL(get_cmos_time); + static void sync_cmos_clock(unsigned long dummy); static struct timer_list sync_cmos_timer = diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 83c579e82a81..7f729665d292 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -104,6 +104,7 @@ int register_die_notifier(struct notifier_block *nb) spin_unlock_irqrestore(&die_notifier_lock, flags); return err; } +EXPORT_SYMBOL(register_die_notifier); static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) { @@ -636,11 +637,13 @@ void set_nmi_callback(nmi_callback_t callback) { nmi_callback = callback; } +EXPORT_SYMBOL_GPL(set_nmi_callback); void unset_nmi_callback(void) { nmi_callback = dummy_nmi_callback; } +EXPORT_SYMBOL_GPL(unset_nmi_callback); #ifdef CONFIG_KPROBES fastcall void do_int3(struct pt_regs *regs, long error_code) diff --git a/arch/i386/lib/dec_and_lock.c b/arch/i386/lib/dec_and_lock.c index ab43394dc775..8b81b2524fa6 100644 --- a/arch/i386/lib/dec_and_lock.c +++ b/arch/i386/lib/dec_and_lock.c @@ -8,6 +8,7 @@ */ #include +#include #include int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) @@ -38,3 +39,4 @@ int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) spin_unlock(lock); return 0; } +EXPORT_SYMBOL(_atomic_dec_and_lock); diff --git a/arch/i386/lib/delay.c b/arch/i386/lib/delay.c index eb0cdfe9280f..c49a6acbee56 100644 --- a/arch/i386/lib/delay.c +++ b/arch/i386/lib/delay.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -47,3 +48,8 @@ void __ndelay(unsigned long nsecs) { __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ } + +EXPORT_SYMBOL(__delay); +EXPORT_SYMBOL(__const_udelay); +EXPORT_SYMBOL(__udelay); +EXPORT_SYMBOL(__ndelay); diff --git a/arch/i386/lib/mmx.c b/arch/i386/lib/mmx.c index 01f8b1a2cc84..2afda94dffd3 100644 --- a/arch/i386/lib/mmx.c +++ b/arch/i386/lib/mmx.c @@ -3,6 +3,7 @@ #include #include #include +#include 
#include @@ -397,3 +398,7 @@ void mmx_copy_page(void *to, void *from) else fast_copy_page(to, from); } + +EXPORT_SYMBOL(_mmx_memcpy); +EXPORT_SYMBOL(mmx_clear_page); +EXPORT_SYMBOL(mmx_copy_page); diff --git a/arch/i386/lib/usercopy.c b/arch/i386/lib/usercopy.c index 51aa2bbb0269..4cf981d70f45 100644 --- a/arch/i386/lib/usercopy.c +++ b/arch/i386/lib/usercopy.c @@ -84,6 +84,7 @@ __strncpy_from_user(char *dst, const char __user *src, long count) __do_strncpy_from_user(dst, src, count, res); return res; } +EXPORT_SYMBOL(__strncpy_from_user); /** * strncpy_from_user: - Copy a NUL terminated string from userspace. @@ -111,7 +112,7 @@ strncpy_from_user(char *dst, const char __user *src, long count) __do_strncpy_from_user(dst, src, count, res); return res; } - +EXPORT_SYMBOL(strncpy_from_user); /* * Zero Userspace @@ -157,6 +158,7 @@ clear_user(void __user *to, unsigned long n) __do_clear_user(to, n); return n; } +EXPORT_SYMBOL(clear_user); /** * __clear_user: - Zero a block of memory in user space, with less checking. @@ -175,6 +177,7 @@ __clear_user(void __user *to, unsigned long n) __do_clear_user(to, n); return n; } +EXPORT_SYMBOL(__clear_user); /** * strlen_user: - Get the size of a string in user space. @@ -218,6 +221,7 @@ long strnlen_user(const char __user *s, long n) :"cc"); return res & mask; } +EXPORT_SYMBOL(strnlen_user); #ifdef CONFIG_X86_INTEL_USERCOPY static unsigned long @@ -570,6 +574,7 @@ unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned long n = __copy_user_intel(to, from, n); return n; } +EXPORT_SYMBOL(__copy_to_user_ll); unsigned long __copy_from_user_ll(void *to, const void __user *from, unsigned long n) @@ -581,6 +586,7 @@ __copy_from_user_ll(void *to, const void __user *from, unsigned long n) n = __copy_user_zeroing_intel(to, from, n); return n; } +EXPORT_SYMBOL(__copy_from_user_ll); /** * copy_to_user: - Copy a block of data into user space. diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c index 0efeb96ba5d4..f429c871e845 100644 --- a/arch/i386/mm/discontig.c +++ b/arch/i386/mm/discontig.c @@ -29,12 +29,14 @@ #include #include #include +#include #include #include #include #include struct pglist_data *node_data[MAX_NUMNODES]; +EXPORT_SYMBOL(node_data); bootmem_data_t node0_bdata; /* @@ -63,6 +65,7 @@ unsigned long node_end_pfn[MAX_NUMNODES]; * physnode_map[8- ] = -1; */ s8 physnode_map[MAX_ELEMENTS] = { [0 ... 
(MAX_ELEMENTS - 1)] = -1}; +EXPORT_SYMBOL(physnode_map); void memory_present(int nid, unsigned long start, unsigned long end) { diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c index fc4c4cad4e98..4b7aaf99d7ea 100644 --- a/arch/i386/mm/highmem.c +++ b/arch/i386/mm/highmem.c @@ -1,4 +1,5 @@ #include +#include void *kmap(struct page *page) { @@ -87,3 +88,8 @@ struct page *kmap_atomic_to_page(void *ptr) return pte_page(*pte); } +EXPORT_SYMBOL(kmap); +EXPORT_SYMBOL(kunmap); +EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kunmap_atomic); +EXPORT_SYMBOL(kmap_atomic_to_page); diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 48ebfab77a3c..3672e2ef51ae 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -295,6 +295,7 @@ static void __init set_highmem_pages_init(int bad_ppro) #endif /* CONFIG_HIGHMEM */ unsigned long long __PAGE_KERNEL = _PAGE_KERNEL; +EXPORT_SYMBOL(__PAGE_KERNEL); unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; #ifdef CONFIG_NUMA diff --git a/arch/i386/mm/ioremap.c b/arch/i386/mm/ioremap.c index ab542792b27b..d393eefc7052 100644 --- a/arch/i386/mm/ioremap.c +++ b/arch/i386/mm/ioremap.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -165,7 +166,7 @@ void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned l } return (void __iomem *) (offset + (char __iomem *)addr); } - +EXPORT_SYMBOL(__ioremap); /** * ioremap_nocache - map bus memory into CPU space @@ -222,6 +223,7 @@ void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) return p; } +EXPORT_SYMBOL(ioremap_nocache); void iounmap(volatile void __iomem *addr) { @@ -255,6 +257,7 @@ void iounmap(volatile void __iomem *addr) write_unlock(&vmlist_lock); kfree(p); } +EXPORT_SYMBOL(iounmap); void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) { diff --git a/arch/i386/pci/pcbios.c b/arch/i386/pci/pcbios.c index 141421b673b0..b9d65f0bc2d1 100644 --- a/arch/i386/pci/pcbios.c +++ b/arch/i386/pci/pcbios.c @@ -4,6 +4,7 @@ #include #include +#include #include "pci.h" #include "pci-functions.h" @@ -456,7 +457,7 @@ struct irq_routing_table * __devinit pcibios_get_irq_routing_table(void) free_page(page); return rt; } - +EXPORT_SYMBOL(pcibios_get_irq_routing_table); int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq) { @@ -473,6 +474,7 @@ int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq) "S" (&pci_indirect)); return !(ret & 0xff00); } +EXPORT_SYMBOL(pcibios_set_irq_routing); static int __init pci_pcbios_init(void) { From a3a255e744dfa672e741dc24306491139d0de2d8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 Jun 2005 00:08:34 -0700 Subject: [PATCH 081/199] [PATCH] x86: cpu_khz type fix x86_64's cpu_khz is unsigned int and there is no reason why x86 needs to use unsigned long. So make cpu_khz unsigned int on x86 as well. 
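The printk() changes in this patch follow directly from the type change: format specifiers must match the argument type, and %lu with an unsigned int argument is undefined behavior even on targets where the two sizes happen to agree. A minimal userspace sketch of the same rule (the calibration value is made up for illustration):

#include <stdio.h>

static unsigned int cpu_khz = 2394123;	/* hypothetical TSC calibration result */

int main(void)
{
	/* %u/%03u match unsigned int; the old %lu/%03lu told printf
	 * to consume an unsigned long, which no longer matches the
	 * argument once cpu_khz is narrowed */
	printf("Detected %u.%03u MHz processor.\n",
	       cpu_khz / 1000, cpu_khz % 1000);
	return 0;
}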
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/cpu/proc.c | 2 +- arch/i386/kernel/smpboot.c | 2 +- arch/i386/kernel/time.c | 2 +- arch/i386/kernel/timers/common.c | 3 ++- arch/i386/kernel/timers/timer_hpet.c | 2 +- arch/i386/kernel/timers/timer_tsc.c | 7 ++++--- include/asm-i386/timex.h | 2 +- 7 files changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c index 7323c19f354e..8bd77d948a84 100644 --- a/arch/i386/kernel/cpu/proc.c +++ b/arch/i386/kernel/cpu/proc.c @@ -86,7 +86,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "stepping\t: unknown\n"); if ( cpu_has(c, X86_FEATURE_TSC) ) { - seq_printf(m, "cpu MHz\t\t: %lu.%03lu\n", + seq_printf(m, "cpu MHz\t\t: %u.%03u\n", cpu_khz / 1000, (cpu_khz % 1000)); } diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index aaad95e7f879..c20d96d5c15c 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -205,7 +205,7 @@ static void __init synchronize_tsc_bp (void) unsigned long long t0; unsigned long long sum, avg; long long delta; - unsigned long one_usec; + unsigned int one_usec; int buggy = 0; printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus()); diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 8bc8363fbb4c..e68d9fdb0759 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -77,7 +77,7 @@ u64 jiffies_64 = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); -unsigned long cpu_khz; /* Detected as we calibrate the TSC */ +unsigned int cpu_khz; /* Detected as we calibrate the TSC */ EXPORT_SYMBOL(cpu_khz); extern unsigned long wall_jiffies; diff --git a/arch/i386/kernel/timers/common.c b/arch/i386/kernel/timers/common.c index b38cc0d0c71a..37353bd31803 100644 --- a/arch/i386/kernel/timers/common.c +++ b/arch/i386/kernel/timers/common.c @@ -163,7 +163,8 @@ void init_cpu_khz(void) :"=a" (cpu_khz), "=d" (edx) :"r" (tsc_quotient), "0" (eax), "1" (edx)); - printk("Detected %lu.%03lu MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); + printk("Detected %u.%03u MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); } } } diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c index 15a7d727bd6f..d766e0963ac1 100644 --- a/arch/i386/kernel/timers/timer_hpet.c +++ b/arch/i386/kernel/timers/timer_hpet.c @@ -158,7 +158,7 @@ static int __init init_hpet(char* override) { unsigned long eax=0, edx=1000; ASM_DIV64_REG(cpu_khz, edx, tsc_quotient, eax, edx); - printk("Detected %lu.%03lu MHz processor.\n", + printk("Detected %u.%03u MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); } set_cyc2ns_scale(cpu_khz/1000); diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c index 27f08ae9120b..54c36b182021 100644 --- a/arch/i386/kernel/timers/timer_tsc.c +++ b/arch/i386/kernel/timers/timer_tsc.c @@ -256,7 +256,7 @@ static unsigned long loops_per_jiffy_ref = 0; #ifndef CONFIG_SMP static unsigned long fast_gettimeoffset_ref = 0; -static unsigned long cpu_khz_ref = 0; +static unsigned int cpu_khz_ref = 0; #endif static int @@ -323,7 +323,7 @@ static inline void cpufreq_delayed_get(void) { return; } int recalibrate_cpu_khz(void) { #ifndef CONFIG_SMP - unsigned long cpu_khz_old = cpu_khz; + unsigned int cpu_khz_old = cpu_khz; if (cpu_has_tsc) { init_cpu_khz(); @@ -534,7 +534,8 @@ static int __init init_tsc(char* override) :"=a" (cpu_khz), "=d" (edx) :"r" (tsc_quotient), "0" (eax), "1" (edx)); - 
printk("Detected %lu.%03lu MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); + printk("Detected %u.%03u MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); } set_cyc2ns_scale(cpu_khz/1000); return 0; diff --git a/include/asm-i386/timex.h b/include/asm-i386/timex.h index e370f907bd39..292b5a68f627 100644 --- a/include/asm-i386/timex.h +++ b/include/asm-i386/timex.h @@ -47,7 +47,7 @@ static inline cycles_t get_cycles (void) return ret; } -extern unsigned long cpu_khz; +extern unsigned int cpu_khz; extern int read_current_timer(unsigned long *timer_value); #define ARCH_HAS_READ_CURRENT_TIMER 1 From c92c6ffdb16990872acf4ce8b24f82f98fcbbb68 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 Jun 2005 00:08:35 -0700 Subject: [PATCH 082/199] [PATCH] mtrr size-and-base debugging Consolidate the mtrr sanity checking, add a dump_stack(). Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/cpu/mtrr/main.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index e1c2042b9b7e..d66b09e0c820 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -375,6 +375,19 @@ int mtrr_add_page(unsigned long base, unsigned long size, return error; } +static int mtrr_check(unsigned long base, unsigned long size) +{ + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + printk(KERN_WARNING + "mtrr: size and base must be multiples of 4 kiB\n"); + printk(KERN_DEBUG + "mtrr: size: 0x%lx base: 0x%lx\n", size, base); + dump_stack(); + return -1; + } + return 0; +} + /** * mtrr_add - Add a memory type region * @base: Physical base address of region @@ -415,11 +428,8 @@ int mtrr_add(unsigned long base, unsigned long size, unsigned int type, char increment) { - if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { - printk(KERN_WARNING "mtrr: size and base must be multiples of 4 kiB\n"); - printk(KERN_DEBUG "mtrr: size: 0x%lx base: 0x%lx\n", size, base); + if (mtrr_check(base, size)) return -EINVAL; - } return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, increment); } @@ -511,11 +521,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) int mtrr_del(int reg, unsigned long base, unsigned long size) { - if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { - printk(KERN_INFO "mtrr: size and base must be multiples of 4 kiB\n"); - printk(KERN_DEBUG "mtrr: size: 0x%lx base: 0x%lx\n", size, base); + if (mtrr_check(base, size)) return -EINVAL; - } return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); } From c0a88c987878e533fc21fbf684198021a3b2c279 Mon Sep 17 00:00:00 2001 From: Alexander Nyberg Date: Thu, 23 Jun 2005 00:08:35 -0700 Subject: [PATCH 083/199] [PATCH] x86_64: i8259.c iso99 structure initialization Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/i8259.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index 7873d9ba8814..19eafa0aa95c 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -157,14 +157,13 @@ static unsigned int startup_8259A_irq(unsigned int irq) } static struct hw_interrupt_type i8259A_irq_type = { - "XT-PIC", - startup_8259A_irq, - shutdown_8259A_irq, - enable_8259A_irq, - disable_8259A_irq, - mask_and_ack_8259A, - end_8259A_irq, - NULL + .typename = "XT-PIC", + .startup = startup_8259A_irq, + .shutdown = 
shutdown_8259A_irq, + .enable = enable_8259A_irq, + .disable = disable_8259A_irq, + .ack = mask_and_ack_8259A, + .end = end_8259A_irq, }; /* From a3a00751ad8970c13d0563c2e92ee68c655a8e6b Mon Sep 17 00:00:00 2001 From: john stultz Date: Thu, 23 Jun 2005 00:08:36 -0700 Subject: [PATCH 084/199] [PATCH] x86_64: fix hpet for systems that don't support legacy replacement Currently the x86-64 HPET code assumes the entire HPET implementation from the spec is present. This breaks on boxes that do not implement the optional legacy timer replacement functionality portion of the spec. This patch fixes this issue, allowing x86-64 systems that cannot use the HPET for the timer interrupt and RTC to still use the HPET as a time source. I've tested this patch on systems without HPET, with HPET but without legacy timer replacement, as well as HPET with legacy timer replacement. This version adds a minor check to cap the HPET counter value in gettimeoffset_hpet to avoid possible time inconsistencies. Please ignore the A2 version I sent to you earlier. Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/time.c | 42 ++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index fb8c809b4cd9..66bf6ddeb0c3 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -64,6 +64,7 @@ static int notsc __initdata = 0; unsigned int cpu_khz; /* TSC clocks / usec, not used here */ static unsigned long hpet_period; /* fsecs / HPET clock */ unsigned long hpet_tick; /* HPET clocks / interrupt */ +static int hpet_use_timer; unsigned long vxtime_hz = PIT_TICK_RATE; int report_lost_ticks; /* command line option */ unsigned long long monotonic_base; @@ -105,7 +106,9 @@ static inline unsigned int do_gettimeoffset_tsc(void) static inline unsigned int do_gettimeoffset_hpet(void) { - return ((hpet_readl(HPET_COUNTER) - vxtime.last) * vxtime.quot) >> 32; + /* cap counter read to one tick to avoid inconsistencies */ + unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; + return (min(counter,hpet_tick) * vxtime.quot) >> 32; } unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; @@ -301,7 +304,7 @@ unsigned long long monotonic_clock(void) last_offset = vxtime.last; base = monotonic_base; - this_offset = hpet_readl(HPET_T0_CMP) - hpet_tick; + this_offset = hpet_readl(HPET_COUNTER); } while (read_seqretry(&xtime_lock, seq)); offset = (this_offset - last_offset); @@ -377,7 +380,14 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) write_seqlock(&xtime_lock); - if (vxtime.hpet_address) { + if (vxtime.hpet_address) + offset = hpet_readl(HPET_COUNTER); + + if (hpet_use_timer) { + /* if we're using the hpet timer functionality, + * we can more accurately know the counter value + * when the timer interrupt occurred. + */ offset = hpet_readl(HPET_T0_CMP) - hpet_tick; delay = hpet_readl(HPET_COUNTER) - offset; } else { @@ -803,17 +813,18 @@ static int hpet_timer_stop_set_go(unsigned long tick) * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, * and period also hpet_tick. */ - - hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | + if (hpet_use_timer) { + hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT, HPET_T0_CFG); - hpet_writel(hpet_tick, HPET_T0_CMP); - hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice?
*/ - + hpet_writel(hpet_tick, HPET_T0_CMP); + hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */ + cfg |= HPET_CFG_LEGACY; + } /* * Go! */ - cfg |= HPET_CFG_ENABLE | HPET_CFG_LEGACY; + cfg |= HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); return 0; @@ -834,8 +845,7 @@ static int hpet_init(void) id = hpet_readl(HPET_ID); - if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER) || - !(id & HPET_ID_LEGSUP)) + if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) return -1; hpet_period = hpet_readl(HPET_PERIOD); @@ -845,6 +855,8 @@ static int hpet_init(void) hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) / hpet_period; + hpet_use_timer = (id & HPET_ID_LEGSUP); + return hpet_timer_stop_set_go(hpet_tick); } @@ -901,9 +913,11 @@ void __init time_init(void) set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - if (!hpet_init()) { + if (!hpet_init()) vxtime_hz = (1000000000000000L + hpet_period / 2) / hpet_period; + + if (hpet_use_timer) { cpu_khz = hpet_calibrate_tsc(); timename = "HPET"; #ifdef CONFIG_X86_PM_TIMER @@ -968,7 +982,7 @@ void __init time_init_gtod(void) if (unsynchronized_tsc()) notsc = 1; if (vxtime.hpet_address && notsc) { - timetype = "HPET"; + timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; vxtime.mode = VXTIME_HPET; do_gettimeoffset = do_gettimeoffset_hpet; @@ -983,7 +997,7 @@ void __init time_init_gtod(void) printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n"); #endif } else { - timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC"; + timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC"; vxtime.mode = VXTIME_TSC; } From 0928d6ef7f204979749fb241a90a04a35dae133a Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 23 Jun 2005 00:08:37 -0700 Subject: [PATCH 085/199] [PATCH] x86_64: never block forced SIGSEGV This is the x86_64 version of the signal fix I just posted for i386. This problem was first noticed on PPC and has already been fixed there. But the exact same issue applies to other platforms in the same way. The signal blocking for sa_mask and the handled signal takes place after the handler setup. When the stack is bogus, the handler setup forces a SIGSEGV. But then this will be blocked, and returning to user mode will fault again and iterate. This patch fixes the problem by checking whether signal handler setup failed, and not doing the signal-blocking if so. This copies what was done in the ppc code. I think all architectures' signal handler setup code follows this pattern and needs the change. 
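The essence of the fix, repeated below for each per-architecture frame-setup variant, is that setup now reports success and the caller commits the signal-blocking side effects only when setup worked. A condensed sketch of the pattern (simplified from the x86-64 diff that follows; the real functions take a few more arguments):

/* frame setup returns 1 on success, 0 if it forced a SIGSEGV instead */
static int handle_signal(int sig, struct k_sigaction *ka,
			 sigset_t *oldset, struct pt_regs *regs)
{
	int ret = setup_rt_frame(sig, ka, oldset, regs);

	/*
	 * Block sa_mask and the handled signal only if the frame was
	 * actually set up; a blocked, forced SIGSEGV would otherwise
	 * fault again on return to user mode and iterate forever.
	 */
	if (ret && !(ka->sa.sa_flags & SA_NODEFER)) {
		spin_lock_irq(&current->sighand->siglock);
		sigorsets(&current->blocked, &current->blocked,
			  &ka->sa.sa_mask);
		sigaddset(&current->blocked, sig);
		recalc_sigpending();
		spin_unlock_irq(&current->sighand->siglock);
	}
	return ret;
}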
Signed-off-by: Roland McGrath Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/ia32/ia32_signal.c | 15 ++++++++------- arch/x86_64/kernel/signal.c | 26 +++++++++++++++----------- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c index fbd09b5126ce..66e2821533db 100644 --- a/arch/x86_64/ia32/ia32_signal.c +++ b/arch/x86_64/ia32/ia32_signal.c @@ -428,8 +428,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) return (void __user *)((rsp - frame_size) & -8UL); } -void ia32_setup_frame(int sig, struct k_sigaction *ka, - compat_sigset_t *set, struct pt_regs * regs) +int ia32_setup_frame(int sig, struct k_sigaction *ka, + compat_sigset_t *set, struct pt_regs * regs) { struct sigframe __user *frame; int err = 0; @@ -514,14 +514,15 @@ void ia32_setup_frame(int sig, struct k_sigaction *ka, current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return; + return 1; give_sigsegv: force_sigsegv(sig, current); + return 0; } -void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - compat_sigset_t *set, struct pt_regs * regs) +int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + compat_sigset_t *set, struct pt_regs * regs) { struct rt_sigframe __user *frame; int err = 0; @@ -613,9 +614,9 @@ void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return; + return 1; give_sigsegv: force_sigsegv(sig, current); + return 0; } - diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 429c0269dc4e..3c5f30893715 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -34,9 +34,9 @@ #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) -void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs * regs); -void ia32_setup_frame(int sig, struct k_sigaction *ka, +int ia32_setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, struct pt_regs * regs); asmlinkage long @@ -238,7 +238,7 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) return (void __user *)round_down(rsp - size, 16); } -static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs * regs) { struct rt_sigframe __user *frame; @@ -327,20 +327,23 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return; + return 1; give_sigsegv: force_sigsegv(sig, current); + return 0; } /* * OK, we're invoking a handler */ -static void +static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, sigset_t *oldset, struct pt_regs *regs) { + int ret; + #ifdef DEBUG_SIG printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", current->pid, sig, @@ -384,20 +387,22 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) { if (ka->sa.sa_flags & SA_SIGINFO) - ia32_setup_rt_frame(sig, ka, info, oldset, regs); + ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs); else - ia32_setup_frame(sig, ka, oldset, regs); + ret = ia32_setup_frame(sig, ka, 
oldset, regs); } else #endif - setup_rt_frame(sig, ka, info, oldset, regs); + ret = setup_rt_frame(sig, ka, info, oldset, regs); - if (!(ka->sa.sa_flags & SA_NODEFER)) { + if (ret && !(ka->sa.sa_flags & SA_NODEFER)) { spin_lock_irq(&current->sighand->siglock); sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask); sigaddset(&current->blocked,sig); recalc_sigpending(); spin_unlock_irq(&current->sighand->siglock); } + + return ret; } /* @@ -437,8 +442,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) asm volatile("movq %0,%%db7" : : "r" (current->thread.debugreg7)); /* Whee! Actually deliver the signal. */ - handle_signal(signr, &info, &ka, oldset, regs); - return 1; + return handle_signal(signr, &info, &ka, oldset, regs); } no_signal: From 32ecd42b6f94d3ee320a22827b46bd19ccf924e5 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 23 Jun 2005 00:08:38 -0700 Subject: [PATCH 086/199] [PATCH] eliminate duplicate rdpmc definition Eliminate duplicate definition of rdpmc in x86-64's msr.h. Signed-off-by: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-x86_64/msr.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/asm-x86_64/msr.h b/include/asm-x86_64/msr.h index 513e52c71821..bc700232728d 100644 --- a/include/asm-x86_64/msr.h +++ b/include/asm-x86_64/msr.h @@ -57,11 +57,6 @@ (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \ } while(0) -#define rdpmc(counter,low,high) \ - __asm__ __volatile__("rdpmc" \ - : "=a" (low), "=d" (high) \ - : "c" (counter)) - #define write_tsc(val1,val2) wrmsr(0x10, val1, val2) #define rdpmc(counter,low,high) \ From 701067c4661ebcdc155cc8f696acb24c016c058b Mon Sep 17 00:00:00 2001 From: Natalie Protasevich Date: Thu, 23 Jun 2005 00:08:41 -0700 Subject: [PATCH 087/199] [PATCH] x86_64: avoid wasting IRQs I suggest changing the way IRQs are handed out to PCI devices. Currently, each I/O APIC pin gets associated with an IRQ, no matter if the pin is used or not. It is expected that each pin can potentially be engaged by a device inserted into the corresponding PCI slot. However, this imposes a severe limitation on systems whose designs employ many I/O APICs but utilize only a couple of lines on each, such as the P64H2 chipset. It is used in ES7000, and currently, there is no way to boot the system with more than 9 I/O APICs. The simple change below makes it possible to boot a system with say 64 (or more) I/O APICs, each providing 1 slot, which is otherwise impossible because of the IRQ gaps created for unused lines on each I/O APIC. It does not resolve the problem of the number of devices exceeding the number of possible IRQs, but it eases the pressure on IRQs on any large system with a potentially large number of devices. I only implemented this for the ACPI boot, since if the system is this big and uses newer chipsets it is probably (better be!) an ACPI based system :). The change is completely "mechanical" and does not alter any internal structures or the interrupt model/implementation. The patch works for both i386 and x86_64 archs. It works with MSIs just fine, and should not interfere with implementations like shared vectors, when they get worked out and incorporated. A short sketch of the allocation idea follows.
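The core of the change is a lazy translation table: a GSI consumes a compact IRQ number only the first time something actually registers it, instead of every I/O APIC pin reserving an IRQ up front. A simplified sketch of the allocation idea (condensed from the mp_register_gsi() change below; the ACPI checks and legacy-IRQ handling are omitted):

#define MAX_GSI_NUM 4096

static int gsi_to_irq[MAX_GSI_NUM];	/* GSI -> assigned IRQ, 0 = unassigned */
static int pci_irq = 16;		/* IRQs 0-15 stay with ISA/legacy */

static unsigned int register_gsi_sketch(unsigned int gsi)
{
	if (gsi_to_irq[gsi])		/* pin already seen: reuse its IRQ */
		return gsi_to_irq[gsi];
	gsi_to_irq[gsi] = pci_irq++;	/* hand out the next compact IRQ */
	return gsi_to_irq[gsi];
}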
To illustrate, below is the interrupt distribution for 2-cell ES7000 with 20 I/O APICs, and an Ethernet card in the last slot, which should be eth1 and which was not configured because its IRQ exceeded allowable number (it actially turned out huge - 480!): zorro-tb2:~ # cat /proc/interrupts CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7 0: 65716 30012 30007 30002 30009 30010 30010 30010 IO-APIC-edge timer 4: 373 0 725 280 0 0 0 0 IO-APIC-edge serial 8: 0 0 0 0 0 0 0 0 IO-APIC-edge rtc 9: 0 0 0 0 0 0 0 0 IO-APIC-level acpi 14: 39 3 0 0 0 0 0 0 IO-APIC-edge ide0 16: 108 13 0 0 0 0 0 0 IO-APIC-level uhci_hcd:usb1 18: 0 0 0 0 0 0 0 0 IO-APIC-level uhci_hcd:usb3 19: 15 0 0 0 0 0 0 0 IO-APIC-level uhci_hcd:usb2 23: 3 0 0 0 0 0 0 0 IO-APIC-level ehci_hcd:usb4 96: 4240 397 18 0 0 0 0 0 IO-APIC-level aic7xxx 97: 15 0 0 0 0 0 0 0 IO-APIC-level aic7xxx 192: 847 0 0 0 0 0 0 0 IO-APIC-level eth0 NMI: 0 0 0 0 0 0 0 0 LOC: 273423 274528 272829 274228 274092 273761 273827 273694 ERR: 7 MIS: 0 Even though the system doesn't have that many devices, some don't get enabled only because of IRQ numbering model. This is the IRQ picture after the patch was applied: zorro-tb2:~ # cat /proc/interrupts CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7 0: 44169 10004 10004 10001 10004 10003 10004 6135 IO-APIC-edge timer 4: 345 0 0 0 0 244 0 0 IO-APIC-edge serial 8: 0 0 0 0 0 0 0 0 IO-APIC-edge rtc 9: 0 0 0 0 0 0 0 0 IO-APIC-level acpi 14: 39 0 3 0 0 0 0 0 IO-APIC-edge ide0 17: 4425 0 9 0 0 0 0 0 IO-APIC-level aic7xxx 18: 15 0 0 0 0 0 0 0 IO-APIC-level aic7xxx, uhci_hcd:usb3 21: 231 0 0 0 0 0 0 0 IO-APIC-level uhci_hcd:usb1 22: 26 0 0 0 0 0 0 0 IO-APIC-level uhci_hcd:usb2 23: 3 0 0 0 0 0 0 0 IO-APIC-level ehci_hcd:usb4 24: 348 0 0 0 0 0 0 0 IO-APIC-level eth0 25: 6 192 0 0 0 0 0 0 IO-APIC-level eth1 NMI: 0 0 0 0 0 0 0 0 LOC: 107981 107636 108899 108698 108489 108326 108331 108254 ERR: 7 MIS: 0 Not only we see the card in the last I/O APIC, but we are not even close to using up available IRQs, since we didn't waste any. Signed-off-by: Natalie Protasevich Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mpparse.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index ed6a5588146d..9c5aa2a790c7 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -906,11 +906,20 @@ void __init mp_config_acpi_legacy_irqs (void) return; } +#define MAX_GSI_NUM 4096 + int mp_register_gsi(u32 gsi, int edge_level, int active_high_low) { int ioapic = -1; int ioapic_pin = 0; int idx, bit = 0; + static int pci_irq = 16; + /* + * Mapping between Global System Interrupts, which + * represent all possible interrupts, to the IRQs + * assigned to actual devices. + */ + static int gsi_to_irq[MAX_GSI_NUM]; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -945,11 +954,21 @@ int mp_register_gsi(u32 gsi, int edge_level, int active_high_low) if ((1< Date: Thu, 23 Jun 2005 00:08:42 -0700 Subject: [PATCH 088/199] [PATCH] xen: x86: add macro for debugreg Add 2 macros to set and get debugreg on x86. This is useful for Xen because it will need only to redefine each macro to a hypervisor call. 
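Hiding the raw movl behind get_debugreg()/set_debugreg() means a paravirtualized port only has to swap the macro bodies; every call site stays unchanged. A hypothetical illustration of the substitution (HYPERVISOR_set_debugreg is a placeholder name here, not necessarily the real Xen interface):

/* native definition, as introduced by this patch */
#define set_debugreg(value, register)		\
	__asm__("movl %0,%%db" #register	\
		: /* no output */		\
		:"r" (value))

/* what a Xen-style port could substitute (sketch only) */
#undef set_debugreg
#define set_debugreg(value, register)		\
	HYPERVISOR_set_debugreg((register), (unsigned long)(value))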
Signed-off-by: Vincent Hanquez Cc: Ian Pratt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/processor.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 359bb0151742..c76c50e96225 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -501,12 +501,16 @@ static inline void load_esp0(struct tss_struct *tss, struct thread_struct *threa } while (0) /* - * This special macro can be used to load a debugging register + * These special macros can be used to get or set a debugging register */ -#define loaddebug(thread,register) \ - __asm__("movl %0,%%db" #register \ - : /* no output */ \ - :"r" ((thread)->debugreg[register])) +#define get_debugreg(var, register) \ + __asm__("movl %%db" #register ", %0" \ + :"=r" (var)) +#define set_debugreg(value, register) \ + __asm__("movl %0,%%db" #register \ + : /* no output */ \ + :"r" (value)) + /* Forward declaration, a strange C thing */ struct task_struct; From 1cc6f12e03ebc064b74161c684f987284ce9d0cc Mon Sep 17 00:00:00 2001 From: Vincent Hanquez Date: Thu, 23 Jun 2005 00:08:43 -0700 Subject: [PATCH 089/199] [PATCH] xen: x86: Use new macro for debugreg Make use of the 2 new macros set_debugreg and get_debugreg. Signed-off-by: Vincent Hanquez Cc: Ian Pratt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/cpu/common.c | 2 +- arch/i386/kernel/process.c | 12 ++++++------ arch/i386/kernel/signal.c | 2 +- arch/i386/kernel/traps.c | 6 ++---- arch/i386/power/cpu.c | 14 +++++++------- 5 files changed, 17 insertions(+), 19 deletions(-) diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index d199e525680a..b9954248d0aa 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -635,7 +635,7 @@ void __init cpu_init (void) /* Clear all 6 debug registers: */ -#define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) ); +#define CD(register) set_debugreg(0, register) CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7); diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 3c3f245cca53..2468ab70c386 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -633,13 +633,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas * Now maybe reload the debug registers */ if (unlikely(next->debugreg[7])) { - loaddebug(next, 0); - loaddebug(next, 1); - loaddebug(next, 2); - loaddebug(next, 3); + set_debugreg(current->thread.debugreg[0], 0); + set_debugreg(current->thread.debugreg[1], 1); + set_debugreg(current->thread.debugreg[2], 2); + set_debugreg(current->thread.debugreg[3], 3); /* no 4 and 5 */ - loaddebug(next, 6); - loaddebug(next, 7); + set_debugreg(current->thread.debugreg[6], 6); + set_debugreg(current->thread.debugreg[7], 7); } if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index 344400787c39..839d4dc88cd4 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -624,7 +624,7 @@ int fastcall do_signal(struct pt_regs *regs, sigset_t *oldset) * inside the kernel. */ if (unlikely(current->thread.debugreg[7])) { - loaddebug(&current->thread, 7); + set_debugreg(current->thread.debugreg[7], 7); } /* Whee! Actually deliver the signal.
*/ diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 7f729665d292..c01d7ba6d7e8 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -685,7 +685,7 @@ fastcall void do_debug(struct pt_regs * regs, long error_code) unsigned int condition; struct task_struct *tsk = current; - __asm__ __volatile__("movl %%db6,%0" : "=r" (condition)); + get_debugreg(condition, 6); if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) @@ -727,9 +727,7 @@ fastcall void do_debug(struct pt_regs * regs, long error_code) * the signal is delivered. */ clear_dr7: - __asm__("movl %0,%%db7" - : /* no output */ - : "r" (0)); + set_debugreg(0, 7); return; debug_vm86: diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c index cf337c673d92..6f521cf19a13 100644 --- a/arch/i386/power/cpu.c +++ b/arch/i386/power/cpu.c @@ -94,13 +94,13 @@ static void fix_processor_context(void) * Now maybe reload the debug registers */ if (current->thread.debugreg[7]){ - loaddebug(&current->thread, 0); - loaddebug(&current->thread, 1); - loaddebug(&current->thread, 2); - loaddebug(&current->thread, 3); - /* no 4 and 5 */ - loaddebug(&current->thread, 6); - loaddebug(&current->thread, 7); + set_debugreg(current->thread.debugreg[0], 0); + set_debugreg(current->thread.debugreg[1], 1); + set_debugreg(current->thread.debugreg[2], 2); + set_debugreg(current->thread.debugreg[3], 3); + /* no 4 and 5 */ + set_debugreg(current->thread.debugreg[6], 6); + set_debugreg(current->thread.debugreg[7], 7); } } From fa1e1bdf78d405f9905b8290ee9211e7a7bbc99b Mon Sep 17 00:00:00 2001 From: Vincent Hanquez Date: Thu, 23 Jun 2005 00:08:44 -0700 Subject: [PATCH 090/199] [PATCH] xen: x86: Rename usermode macro Rename user_mode to user_mode_vm and add a user_mode macro similar to the x86-64 one. This is useful for Xen because the linux xen kernel does not run at the same privilege level as a vanilla linux kernel, and with this we just need to redefine user_mode(). Signed-off-by: Vincent Hanquez Cc: Ian Pratt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/apic.c | 2 +- arch/i386/kernel/ptrace.c | 2 +- arch/i386/mach-voyager/voyager_smp.c | 2 +- arch/i386/oprofile/backtrace.c | 2 +- include/asm-i386/ptrace.h | 3 ++- include/asm-x86_64/ptrace.h | 1 + 6 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index d509836b70c3..8d993fa71754 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -1133,7 +1133,7 @@ inline void smp_local_timer_interrupt(struct pt_regs * regs) } #ifdef CONFIG_SMP - update_process_times(user_mode(regs)); + update_process_times(user_mode_vm(regs)); #endif } diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index e34f651fa13c..0da59b42843c 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -668,7 +668,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) info.si_code = TRAP_BRKPT; /* User-mode eip? */ - info.si_addr = user_mode(regs) ? (void __user *) regs->eip : NULL; + info.si_addr = user_mode_vm(regs) ?
(void __user *) regs->eip : NULL; /* Send us the fakey SIGTRAP */ force_sig_info(SIGTRAP, &info, tsk); diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c index a6e0ddd65bd0..8c8527593da0 100644 --- a/arch/i386/mach-voyager/voyager_smp.c +++ b/arch/i386/mach-voyager/voyager_smp.c @@ -1288,7 +1288,7 @@ smp_local_timer_interrupt(struct pt_regs * regs) per_cpu(prof_counter, cpu); } - update_process_times(user_mode(regs)); + update_process_times(user_mode_vm(regs)); } if( ((1<ebp; #endif - if (!user_mode(regs)) { + if (!user_mode_vm(regs)) { while (depth-- && valid_kernel_stack(head, regs)) head = dump_backtrace(head); return; diff --git a/include/asm-i386/ptrace.h b/include/asm-i386/ptrace.h index 8618914b3521..eef9f93870d4 100644 --- a/include/asm-i386/ptrace.h +++ b/include/asm-i386/ptrace.h @@ -57,7 +57,8 @@ struct pt_regs { #ifdef __KERNEL__ struct task_struct; extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code); -#define user_mode(regs) ((VM_MASK & (regs)->eflags) || (3 & (regs)->xcs)) +#define user_mode(regs) (3 & (regs)->xcs) +#define user_mode_vm(regs) ((VM_MASK & (regs)->eflags) || user_mode(regs)) #define instruction_pointer(regs) ((regs)->eip) #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) extern unsigned long profile_pc(struct pt_regs *regs); diff --git a/include/asm-x86_64/ptrace.h b/include/asm-x86_64/ptrace.h index 5bbc8d3141c8..ca6f15ff61d4 100644 --- a/include/asm-x86_64/ptrace.h +++ b/include/asm-x86_64/ptrace.h @@ -82,6 +82,7 @@ struct pt_regs { #if defined(__KERNEL__) && !defined(__ASSEMBLY__) #define user_mode(regs) (!!((regs)->cs & 3)) +#define user_mode_vm(regs) user_mode(regs) #define instruction_pointer(regs) ((regs)->rip) extern unsigned long profile_pc(struct pt_regs *regs); void signal_fault(struct pt_regs *regs, void __user *frame, char *where); From 717b594a415bfaf2dbd5e8266636488f2564c689 Mon Sep 17 00:00:00 2001 From: Vincent Hanquez Date: Thu, 23 Jun 2005 00:08:45 -0700 Subject: [PATCH 091/199] [PATCH] xen: x86: Use more usermode macro Use the user_mode macro where it's possible. Signed-off-by: Vincent Hanquez Cc: Ian Pratt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/process.c | 2 +- arch/i386/kernel/signal.c | 2 +- arch/i386/kernel/traps.c | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 2468ab70c386..be3efba7caf7 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -266,7 +266,7 @@ void show_regs(struct pt_regs * regs) printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); print_symbol("EIP is at %s\n", regs->eip); - if (regs->xcs & 3) + if (user_mode(regs)) printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); printk(" EFLAGS: %08lx %s (%s)\n", regs->eflags, print_tainted(), system_utsname.release); diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index 839d4dc88cd4..b9b8f4e20fad 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -605,7 +605,7 @@ int fastcall do_signal(struct pt_regs *regs, sigset_t *oldset) * kernel mode. Just return without doing anything * if so. 
*/ - if ((regs->xcs & 3) != 3) + if (!user_mode(regs)) return 1; if (current->flags & PF_FREEZE) { diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index c01d7ba6d7e8..e4d4e2162c7a 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -210,7 +210,7 @@ void show_registers(struct pt_regs *regs) esp = (unsigned long) (&regs->esp); ss = __KERNEL_DS; - if (regs->xcs & 3) { + if (user_mode(regs)) { in_kernel = 0; esp = regs->esp; ss = regs->xss & 0xffff; @@ -266,7 +266,7 @@ static void handle_BUG(struct pt_regs *regs) char c; unsigned long eip; - if (regs->xcs & 3) + if (user_mode(regs)) goto no_bug; /* Not in kernel */ eip = regs->eip; @@ -354,7 +354,7 @@ void die(const char * str, struct pt_regs * regs, long err) static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) { - if (!(regs->eflags & VM_MASK) && !(3 & regs->xcs)) + if (!user_mode_vm(regs)) die(str, regs, err); } @@ -367,7 +367,7 @@ static void do_trap(int trapnr, int signr, char *str, int vm86, goto trap_signal; } - if (!(regs->xcs & 3)) + if (!user_mode(regs)) goto kernel_trap; trap_signal: { @@ -489,7 +489,7 @@ fastcall void do_general_protection(struct pt_regs * regs, long error_code) if (regs->eflags & VM_MASK) goto gp_in_vm86; - if (!(regs->xcs & 3)) + if (!user_mode(regs)) goto gp_in_kernel; current->thread.error_code = error_code; @@ -716,7 +716,7 @@ fastcall void do_debug(struct pt_regs * regs, long error_code) * check for kernel mode by just checking the CPL * of CS. */ - if ((regs->xcs & 3) == 0) + if (!user_mode(regs)) goto clear_TF_reenable; } From e9129e56e9ec50c0689eb4cf7a3ca132f1e776db Mon Sep 17 00:00:00 2001 From: Vincent Hanquez Date: Thu, 23 Jun 2005 00:08:46 -0700 Subject: [PATCH 092/199] [PATCH] xen: x86_64: Add macro for debugreg Add 2 macros to set and get debugreg on x86_64. This is useful for Xen because it will need only to redefine each macro to a hypervisor call. Signed-off-by: Vincent Hanquez Cc: Ian Pratt Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/signal.c | 2 +- arch/x86_64/kernel/traps.c | 4 ++-- include/asm-x86_64/processor.h | 8 ++++++++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 3c5f30893715..0446e8b96a20 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -439,7 +439,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) * inside the kernel. */ if (current->thread.debugreg7) - asm volatile("movq %0,%%db7" : : "r" (current->thread.debugreg7)); + set_debugreg(current->thread.debugreg7, 7); /* Whee! Actually deliver the signal.
*/ return handle_signal(signr, &info, &ka, oldset, regs); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 3dfec8fdabcd..21706c1a0f95 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -669,7 +669,7 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code) } #endif - asm("movq %%db6,%0" : "=r" (condition)); + get_debugreg(condition, 6); if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) @@ -721,7 +721,7 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code) info.si_addr = (void __user *)regs->rip; force_sig_info(SIGTRAP, &info, tsk); clear_dr7: - asm volatile("movq %0,%%db7"::"r"(0UL)); + set_debugreg(0UL, 7); return; clear_TF_reenable: diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h index 8b55f139968f..106f666517bb 100644 --- a/include/asm-x86_64/processor.h +++ b/include/asm-x86_64/processor.h @@ -280,6 +280,14 @@ struct thread_struct { set_fs(USER_DS); \ } while(0) +#define get_debugreg(var, register) \ + __asm__("movq %%db" #register ", %0" \ + :"=r" (var)) +#define set_debugreg(value, register) \ + __asm__("movq %0,%%db" #register \ + : /* no output */ \ + :"r" (value)) + struct task_struct; struct mm_struct; From 76381fee7e8feb4c22be636aa5d4765dbe4fbf9e Mon Sep 17 00:00:00 2001 From: Vincent Hanquez Date: Thu, 23 Jun 2005 00:08:46 -0700 Subject: [PATCH 093/199] [PATCH] xen: x86_64: use more usermode macro Make use of the user_mode macro where it's possible. This is useful for Xen because it will need only to redefine only the macro to a hypervisor call. Signed-off-by: Vincent Hanquez Cc: Ian Pratt Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/signal.c | 2 +- arch/x86_64/kernel/traps.c | 18 +++++++++--------- arch/x86_64/mm/fault.c | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 0446e8b96a20..98b7ba95d581 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -422,7 +422,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) * kernel mode. Just return without doing anything * if so. 
*/ - if ((regs->cs & 3) != 3) + if (!user_mode(regs)) return 1; if (try_to_freeze(0)) diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 21706c1a0f95..121646fc43f6 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -274,7 +274,7 @@ EXPORT_SYMBOL(dump_stack); void show_registers(struct pt_regs *regs) { int i; - int in_kernel = (regs->cs & 3) == 0; + int in_kernel = !user_mode(regs); unsigned long rsp; const int cpu = safe_smp_processor_id(); struct task_struct *cur = cpu_pda[cpu].pcurrent; @@ -318,7 +318,7 @@ void handle_BUG(struct pt_regs *regs) struct bug_frame f; char tmp; - if (regs->cs & 3) + if (user_mode(regs)) return; if (__copy_from_user(&f, (struct bug_frame *) regs->rip, sizeof(struct bug_frame))) @@ -437,7 +437,7 @@ static void do_trap(int trapnr, int signr, char *str, } #endif - if ((regs->cs & 3) != 0) { + if (user_mode(regs)) { struct task_struct *tsk = current; if (exception_trace && unhandled_signal(tsk, signr)) @@ -522,7 +522,7 @@ asmlinkage void do_general_protection(struct pt_regs * regs, long error_code) } #endif - if ((regs->cs & 3)!=0) { + if (user_mode(regs)) { struct task_struct *tsk = current; if (exception_trace && unhandled_signal(tsk, SIGSEGV)) @@ -638,7 +638,7 @@ asmlinkage struct pt_regs *sync_regs(struct pt_regs *eregs) if (eregs == (struct pt_regs *)eregs->rsp) ; /* Exception from user space */ - else if (eregs->cs & 3) + else if (user_mode(eregs)) regs = ((struct pt_regs *)current->thread.rsp0) - 1; /* Exception from kernel and interrupts are enabled. Move to kernel process stack. */ @@ -697,7 +697,7 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code) * allowing programs to debug themselves without the ptrace() * interface. */ - if ((regs->cs & 3) == 0) + if (!user_mode(regs)) goto clear_TF_reenable; /* * Was the TF flag set by a debugger? If so, clear it now, @@ -715,7 +715,7 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code) info.si_signo = SIGTRAP; info.si_errno = 0; info.si_code = TRAP_BRKPT; - if ((regs->cs & 3) == 0) + if (!user_mode(regs)) goto clear_dr7; info.si_addr = (void __user *)regs->rip; @@ -756,7 +756,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs) unsigned short cwd, swd; conditional_sti(regs); - if ((regs->cs & 3) == 0 && + if (!user_mode(regs) && kernel_math_error(regs, "kernel x87 math error")) return; @@ -822,7 +822,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) unsigned short mxcsr; conditional_sti(regs); - if ((regs->cs & 3) == 0 && + if (!user_mode(regs) && kernel_math_error(regs, "kernel simd math error")) return; diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 57d3ab15a5c7..2f187986f940 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -74,7 +74,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, instr = (unsigned char *)convert_rip_to_linear(current, regs); max_instr = instr + 15; - if ((regs->cs & 3) != 0 && instr >= (unsigned char *)TASK_SIZE) + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) return 0; while (scan_more && instr < max_instr) { @@ -106,7 +106,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, /* Could check the LDT for lm, but for now it's good enough to assume that long mode only uses well known segments or kernel. 
*/ - scan_more = ((regs->cs & 3) == 0) || (regs->cs == __USER_CS); + scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); break; case 0x60: From 15d20bfd606c4b4454aeaa05fc86f77994e48c92 Mon Sep 17 00:00:00 2001 From: Domen Puncer Date: Thu, 23 Jun 2005 00:08:47 -0700 Subject: [PATCH 094/199] [PATCH] ptrace_h8300: condition bugfix Assignment doesn't make much sense here, as the condition would always be true. Signed-off-by: Domen Puncer Signed-off-by: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/h8300/platform/h8300h/ptrace_h8300h.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/h8300/platform/h8300h/ptrace_h8300h.c b/arch/h8300/platform/h8300h/ptrace_h8300h.c index 18e51a7167d3..6ac93c05a1ae 100644 --- a/arch/h8300/platform/h8300h/ptrace_h8300h.c +++ b/arch/h8300/platform/h8300h/ptrace_h8300h.c @@ -245,12 +245,12 @@ static unsigned short *getnextpc(struct task_struct *child, unsigned short *pc) addr = h8300_get_reg(child, regno-1+PT_ER1); return (unsigned short *)addr; case relb: - if ((inst = 0x55) || isbranch(child,inst & 0x0f)) + if (inst == 0x55 || isbranch(child,inst & 0x0f)) pc = (unsigned short *)((unsigned long)pc + ((signed char)(*fetch_p))); return pc+1; /* skip myself */ case relw: - if ((inst = 0x5c) || isbranch(child,(*fetch_p & 0xf0) >> 4)) + if (inst == 0x5c || isbranch(child,(*fetch_p & 0xf0) >> 4)) pc = (unsigned short *)((unsigned long)pc + ((signed short)(*(pc+1)))); return pc+2; /* skip myself */ From 2bf0fdad51c6710bf15d0bf4b9b30b8498fe4ddd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Jun 2005 00:08:48 -0700 Subject: [PATCH 095/199] [PATCH] blk: use find_first_zero_bit() in blk_queue_start_tag() blk_queue_start_tag() hand-coded searching for the first zero bit in the tag map. Replace it with find_first_zero_bit(). With this patch, blk_queue_start_tag() doesn't need to fill the remainder of the tag map with 1s, thus allowing it to work properly with the next remove_real_max_depth patch. Signed-off-by: Tejun Heo Acked-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/ll_rw_blk.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index cd8cf302068c..808390c74200 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -968,8 +968,7 @@ EXPORT_SYMBOL(blk_queue_end_tag); int blk_queue_start_tag(request_queue_t *q, struct request *rq) { struct blk_queue_tag *bqt = q->queue_tags; - unsigned long *map = bqt->tag_map; - int tag = 0; + int tag; if (unlikely((rq->flags & REQ_QUEUED))) { printk(KERN_ERR @@ -978,14 +977,10 @@ int blk_queue_start_tag(request_queue_t *q, struct request *rq) BUG(); } - for (map = bqt->tag_map; *map == -1UL; map++) { - tag += BLK_TAGS_PER_LONG; - - if (tag >= bqt->max_depth) - return 1; - } + tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth); + if (tag >= bqt->max_depth) + return 1; - tag += ffz(*map); __set_bit(tag, bqt->tag_map); rq->flags |= REQ_QUEUED; From fa72b903f75e4f0f0b2c2feed093005167da4023 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Jun 2005 00:08:49 -0700 Subject: [PATCH 096/199] [PATCH] blk: remove blk_queue_tag->real_max_depth optimization blk_queue_tag->real_max_depth was used to optimize out unnecessary allocations/frees on tag resize.
However, the whole thing was very broken: tag_map was never allocated to real_max_depth, resulting in access beyond the end of the map, and bits in [max_depth..real_max_depth] were set when initializing a map and copied when resizing, resulting in pre-occupied tags. As the gain of the optimization is very small, almost nil, remove the whole thing. Signed-off-by: Tejun Heo Acked-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/ll_rw_blk.c | 35 ++++++++++------------------------- include/linux/blkdev.h | 1 - 2 files changed, 10 insertions(+), 26 deletions(-) diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 808390c74200..896d17c28f42 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -717,7 +717,7 @@ struct request *blk_queue_find_tag(request_queue_t *q, int tag) { struct blk_queue_tag *bqt = q->queue_tags; - if (unlikely(bqt == NULL || tag >= bqt->real_max_depth)) + if (unlikely(bqt == NULL || tag >= bqt->max_depth)) return NULL; return bqt->tag_index[tag]; @@ -775,9 +775,9 @@ EXPORT_SYMBOL(blk_queue_free_tags); static int init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth) { - int bits, i; struct request **tag_index; unsigned long *tag_map; + int nr_ulongs; if (depth > q->nr_requests * 2) { depth = q->nr_requests * 2; @@ -789,24 +789,17 @@ init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth) if (!tag_index) goto fail; - bits = (depth / BLK_TAGS_PER_LONG) + 1; - tag_map = kmalloc(bits * sizeof(unsigned long), GFP_ATOMIC); + nr_ulongs = ALIGN(depth, BLK_TAGS_PER_LONG) / BLK_TAGS_PER_LONG; + tag_map = kmalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC); if (!tag_map) goto fail; memset(tag_index, 0, depth * sizeof(struct request *)); - memset(tag_map, 0, bits * sizeof(unsigned long)); + memset(tag_map, 0, nr_ulongs * sizeof(unsigned long)); tags->max_depth = depth; - tags->real_max_depth = bits * BITS_PER_LONG; tags->tag_index = tag_index; tags->tag_map = tag_map; - /* - * set the upper bits if the depth isn't a multiple of the word size - */ - for (i = depth; i < bits * BLK_TAGS_PER_LONG; i++) - __set_bit(i, tag_map); - return 0; fail: kfree(tag_index); @@ -871,32 +864,24 @@ int blk_queue_resize_tags(request_queue_t *q, int new_depth) struct blk_queue_tag *bqt = q->queue_tags; struct request **tag_index; unsigned long *tag_map; - int bits, max_depth; + int max_depth, nr_ulongs; if (!bqt) return -ENXIO; - /* - * don't bother sizing down - */ - if (new_depth <= bqt->real_max_depth) { - bqt->max_depth = new_depth; - return 0; - } - /* * save the old state info, so we can copy it back */ tag_index = bqt->tag_index; tag_map = bqt->tag_map; - max_depth = bqt->real_max_depth; + max_depth = bqt->max_depth; if (init_tag_map(q, bqt, new_depth)) return -ENOMEM; memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *)); - bits = max_depth / BLK_TAGS_PER_LONG; - memcpy(bqt->tag_map, tag_map, bits * sizeof(unsigned long)); + nr_ulongs = ALIGN(max_depth, BLK_TAGS_PER_LONG) / BLK_TAGS_PER_LONG; + memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long)); kfree(tag_index); kfree(tag_map); @@ -926,7 +911,7 @@ void blk_queue_end_tag(request_queue_t *q, struct request *rq) BUG_ON(tag == -1); - if (unlikely(tag >= bqt->real_max_depth)) + if (unlikely(tag >= bqt->max_depth)) return; if (unlikely(!__test_and_clear_bit(tag, bqt->tag_map))) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 235c3414d268..8d7e2f4151d0 100644 ---
a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -294,7 +294,6 @@ struct blk_queue_tag { struct list_head busy_list; /* fifo list of busy tags */ int busy; /* current depth */ int max_depth; /* what we will send to device */ - int real_max_depth; /* what the array can hold */ atomic_t refcnt; /* map can be shared */ }; From f7d37d028dfba90b1b747f8ac685bf0959aeda8b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Jun 2005 00:08:50 -0700 Subject: [PATCH 097/199] [PATCH] blk: remove BLK_TAGS_{PER_LONG|MASK} Replace BLK_TAGS_PER_LONG with BITS_PER_LONG and remove unused BLK_TAGS_MASK. Signed-off-by: Tejun Heo Acked-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/ll_rw_blk.c | 4 ++-- include/linux/blkdev.h | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 896d17c28f42..99afeec1031f 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -789,7 +789,7 @@ init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth) if (!tag_index) goto fail; - nr_ulongs = ALIGN(depth, BLK_TAGS_PER_LONG) / BLK_TAGS_PER_LONG; + nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; tag_map = kmalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC); if (!tag_map) goto fail; @@ -880,7 +880,7 @@ int blk_queue_resize_tags(request_queue_t *q, int new_depth) return -ENOMEM; memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *)); - nr_ulongs = ALIGN(max_depth, BLK_TAGS_PER_LONG) / BLK_TAGS_PER_LONG; + nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG; memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long)); kfree(tag_index); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8d7e2f4151d0..60272141ff19 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -285,9 +285,6 @@ enum blk_queue_state { Queue_up, }; -#define BLK_TAGS_PER_LONG (sizeof(unsigned long) * 8) -#define BLK_TAGS_MASK (BLK_TAGS_PER_LONG - 1) - struct blk_queue_tag { struct request **tag_index; /* map of busy tags */ unsigned long *tag_map; /* bit map of free/busy tags */ From 040c928c47b591d1cd9977cd6431cae213528b45 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Jun 2005 00:08:51 -0700 Subject: [PATCH 098/199] [PATCH] blk: cleanup generic tag support error messages Add KERN_ERR and __FUNCTION__ to generic tag error messages, and add a comment in blk_queue_end_tag() which explains the silent failure path. Signed-off-by: Tejun Heo Acked-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/ll_rw_blk.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 99afeec1031f..c581f9138c47 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -912,10 +912,15 @@ void blk_queue_end_tag(request_queue_t *q, struct request *rq) BUG_ON(tag == -1); if (unlikely(tag >= bqt->max_depth)) + /* + * This can happen after tag depth has been reduced. + * FIXME: how about a warning or info message here? 
+ */ return; if (unlikely(!__test_and_clear_bit(tag, bqt->tag_map))) { - printk("attempt to clear non-busy tag (%d)\n", tag); + printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n", + __FUNCTION__, tag); return; } @@ -924,7 +929,8 @@ void blk_queue_end_tag(request_queue_t *q, struct request *rq) rq->tag = -1; if (unlikely(bqt->tag_index[tag] == NULL)) - printk("tag %d is missing\n", tag); + printk(KERN_ERR "%s: tag %d is missing\n", + __FUNCTION__, tag); bqt->tag_index[tag] = NULL; bqt->busy--; @@ -957,8 +963,9 @@ int blk_queue_start_tag(request_queue_t *q, struct request *rq) if (unlikely((rq->flags & REQ_QUEUED))) { printk(KERN_ERR - "request %p for device [%s] already tagged %d", - rq, rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag); + "%s: request %p for device [%s] already tagged %d", + __FUNCTION__, rq, + rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag); BUG(); } @@ -1001,7 +1008,8 @@ void blk_queue_invalidate_tags(request_queue_t *q) rq = list_entry_rq(tmp); if (rq->tag == -1) { - printk("bad tag found on list\n"); + printk(KERN_ERR + "%s: bad tag found on list\n", __FUNCTION__); list_del_init(&rq->queuelist); rq->flags &= ~REQ_QUEUED; } else From 250dccc00805e755a6d80a73557034253da0831f Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 23 Jun 2005 00:08:52 -0700 Subject: [PATCH 099/199] [PATCH] blk: no memory barrier This memory barrier is not needed because the waitqueue will only get waiters on it in the following situations: rq->count has exceeded the threshold - however all manipulations of ->count are performed under the runqueue lock, and so we will correctly pick up any waiter. Memory allocation for the request fails. In this case, there is no additional help provided by the memory barrier. We are guaranteed to eventually wake up waiters because the request allocation mempool guarantees that if the mem allocation for a request fails, there must be some requests in flight. They will wake up waiters when they are retired. Signed-off-by: Nick Piggin Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/ll_rw_blk.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index c581f9138c47..265de858d4de 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1842,7 +1842,6 @@ static void __freed_request(request_queue_t *q, int rw) clear_queue_congested(q, rw); if (rl->count[rw] + 1 <= q->nr_requests) { - smp_mb(); if (waitqueue_active(&rl->wait[rw])) wake_up(&rl->wait[rw]); From fde6ad22480cdc2eaa102b805a3ed3ee1d36a376 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 23 Jun 2005 00:08:53 -0700 Subject: [PATCH 100/199] [PATCH] blk: branch hints Sprinkle around a few branch hints in the block layer. 
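For illustration only, and not part of this patch: likely()/unlikely() are thin macros over GCC's __builtin_expect(), so the hints above merely tell the compiler which branch to lay out as the straight-line path. A minimal standalone sketch of the idea (the consume() function is hypothetical):

#include <stddef.h>

#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

/* The NULL check is expected to fail almost always, so the compiler
 * keeps the common case on the fall-through path and moves the error
 * return out of line. */
int consume(const char *buf, size_t len)
{
        if (unlikely(buf == NULL))
                return -1;              /* cold error path */
        return (int)len;                /* hot path */
}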
Signed-off-by: Nick Piggin Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/ll_rw_blk.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 265de858d4de..1e847151c3a3 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1439,7 +1439,7 @@ EXPORT_SYMBOL(blk_remove_plug); */ void __generic_unplug_device(request_queue_t *q) { - if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) + if (unlikely(test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))) return; if (!blk_remove_plug(q)) @@ -1763,7 +1763,7 @@ EXPORT_SYMBOL(blk_init_queue_node); int blk_get_queue(request_queue_t *q) { - if (!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { + if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { atomic_inc(&q->refcnt); return 0; } @@ -2584,7 +2584,7 @@ static int __make_request(request_queue_t *q, struct bio *bio) spin_lock_prefetch(q->queue_lock); barrier = bio_barrier(bio); - if (barrier && (q->ordered == QUEUE_ORDERED_NONE)) { + if (unlikely(barrier) && (q->ordered == QUEUE_ORDERED_NONE)) { err = -EOPNOTSUPP; goto end_io; } @@ -2685,7 +2685,7 @@ static int __make_request(request_queue_t *q, struct bio *bio) /* * REQ_BARRIER implies no merging, but lets make it explicit */ - if (barrier) + if (unlikely(barrier)) req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE); req->errors = 0; @@ -2809,7 +2809,7 @@ static inline void block_wait_queue_running(request_queue_t *q) { DEFINE_WAIT(wait); - while (test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags)) { + while (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) { struct request_list *rl = &q->rq; prepare_to_wait_exclusive(&rl->drain, &wait, @@ -2918,7 +2918,7 @@ void generic_make_request(struct bio *bio) goto end_io; } - if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) goto end_io; block_wait_queue_running(q); From bdd646a44672115c986593956aa4ef105485a184 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 23 Jun 2005 00:08:54 -0700 Subject: [PATCH 101/199] [PATCH] blk: unplug later get_request_wait needn't unplug the device immediately. Signed-off-by: Nick Piggin Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/ll_rw_blk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 1e847151c3a3..fd94ea27d594 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1969,7 +1969,6 @@ static struct request *get_request_wait(request_queue_t *q, int rw) DEFINE_WAIT(wait); struct request *rq; - generic_unplug_device(q); do { struct request_list *rl = &q->rq; @@ -1981,6 +1980,7 @@ static struct request *get_request_wait(request_queue_t *q, int rw) if (!rq) { struct io_context *ioc; + generic_unplug_device(q); io_schedule(); /* From 55c888d6d09a0df236adfaf8ccf06ff5d0646775 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jun 2005 00:08:56 -0700 Subject: [PATCH 102/199] [PATCH] timers fixes/improvements This patch tries to solve the following problems: 1. del_timer_sync() is racy. The timer can be fired again after del_timer_sync has checked all cpus and before it rechecks timer_pending(). 2. It has scalability problems. All cpus are scanned to determine if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain del_timer(pending_timer), unless it has to actually wait for completion of the currently running timer. The only restriction is that the recurring timer should not use add_timer_on(). 3. The timers are not serialized wrt themselves. If CPU_0 does mod_timer(jiffies+1) while the timer is currently running on CPU 1, it is quite possible that a local interrupt on CPU_0 will start that timer before it has finished on CPU_1. 4. The timer locking is suboptimal. __mod_timer() takes 3 locks at once and still requires wmb() in del_timer/run_timers. The new implementation takes 2 locks sequentially and does not need memory barriers. Currently ->base != NULL means that the timer is pending. In that case ->base.lock is used to lock the timer. __mod_timer also takes timer->lock because ->base can be == NULL. This patch uses timer->entry.next != NULL as an indication that the timer is pending. So it does __list_del(), entry->next = NULL instead of list_del() when the timer is deleted. The ->base field is used for hashed locking only; it is initialized in init_timer() which sets ->base = per_cpu(tvec_bases). When the tvec_bases.lock is locked, it means that all timers which are tied to this base via timer->base are locked, and the base itself is locked too. So __run_timers/migrate_timers can safely modify all timers which could be found on ->tvX lists (pending timers). When the timer's base is locked, and the timer removed from ->entry list (which means that _run_timers/migrate_timers can't see this timer), it is possible to set timer->base = NULL and drop the lock: the timer remains locked. This patch adds a lock_timer_base() helper, which waits for ->base != NULL, locks the ->base, and checks it is still the same. __mod_timer() schedules the timer on the local CPU and changes its base. However, it does not lock both old and new bases at once. It locks the timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and unlocks the old base. Then __mod_timer() locks new_base, sets ->base = new_base, and adds this timer. This simplifies the code, because AB-BA deadlock is not possible. __mod_timer() also ensures that the timer's base is not changed while the timer's handler is running on the old base. __run_timers(), del_timer() do not change ->base anymore, they only clear the pending flag. So del_timer_sync() can test timer->base->running_timer == timer to detect whether it is running or not. We don't need timer_list->lock anymore; this patch kills it. We also don't need barriers. del_timer() and __run_timers() used smp_wmb() before clearing the timer's pending flag. It was needed because __mod_timer() did not lock old_base if the timer is not pending, so __mod_timer()->list_add() could race with del_timer()->list_del(). With this patch these functions are serialized through base->lock. One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch adds a global struct timer_base_s { spinlock_t lock; struct timer_list *running_timer; } __init_timer_base; which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s struct are replaced by struct timer_base_s t_base. It is indeed ugly. But this can't have scalability problems. The global __init_timer_base.lock is used only when __mod_timer() is called for the first time AND the timer was compile-time initialized. After that the timer migrates to the local CPU.
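Condensed for readability (the real code, including the timer_base_t typedef, appears in the kernel/timer.c hunks below), the lock_timer_base() helper the message describes amounts to:

/* Spin until timer->base is non-NULL and stable under its own lock.
 * A NULL ->base means __mod_timer() is migrating the timer between
 * per-CPU bases; the timer stays logically locked throughout. */
static timer_base_t *lock_timer_base(struct timer_list *timer,
                                     unsigned long *flags)
{
        timer_base_t *base;

        for (;;) {
                base = timer->base;
                if (base != NULL) {
                        spin_lock_irqsave(&base->lock, *flags);
                        if (base == timer->base)
                                return base;    /* locked and still ours */
                        /* the timer migrated while we spun on the lock */
                        spin_unlock_irqrestore(&base->lock, *flags);
                }
                cpu_relax();
        }
}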
Signed-off-by: Oleg Nesterov Acked-by: Ingo Molnar Signed-off-by: Renaud Lienhart Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/timer.h | 30 ++-- kernel/timer.c | 328 ++++++++++++++++++++---------------------- 2 files changed, 166 insertions(+), 192 deletions(-) diff --git a/include/linux/timer.h b/include/linux/timer.h index 90db1cc62ddd..2e78fedfc069 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -6,45 +6,33 @@ #include #include -struct tvec_t_base_s; +struct timer_base_s; struct timer_list { struct list_head entry; unsigned long expires; - spinlock_t lock; unsigned long magic; void (*function)(unsigned long); unsigned long data; - struct tvec_t_base_s *base; + struct timer_base_s *base; }; #define TIMER_MAGIC 0x4b87ad6e +extern struct timer_base_s __init_timer_base; + #define TIMER_INITIALIZER(_function, _expires, _data) { \ .function = (_function), \ .expires = (_expires), \ .data = (_data), \ - .base = NULL, \ + .base = &__init_timer_base, \ .magic = TIMER_MAGIC, \ - .lock = SPIN_LOCK_UNLOCKED, \ } -/*** - * init_timer - initialize a timer. - * @timer: the timer to be initialized - * - * init_timer() must be done to a timer prior calling *any* of the - * other timer functions. - */ -static inline void init_timer(struct timer_list * timer) -{ - timer->base = NULL; - timer->magic = TIMER_MAGIC; - spin_lock_init(&timer->lock); -} +void fastcall init_timer(struct timer_list * timer); /*** * timer_pending - is a timer pending? @@ -58,7 +46,7 @@ static inline void init_timer(struct timer_list * timer) */ static inline int timer_pending(const struct timer_list * timer) { - return timer->base != NULL; + return timer->entry.next != NULL; } extern void add_timer_on(struct timer_list *timer, int cpu); @@ -89,12 +77,12 @@ static inline void add_timer(struct timer_list * timer) #ifdef CONFIG_SMP extern int del_timer_sync(struct timer_list *timer); - extern int del_singleshot_timer_sync(struct timer_list *timer); #else # define del_timer_sync(t) del_timer(t) -# define del_singleshot_timer_sync(t) del_timer(t) #endif +#define del_singleshot_timer_sync(t) del_timer_sync(t) + extern void init_timers(void); extern void run_local_timers(void); extern void it_real_fn(unsigned long); diff --git a/kernel/timer.c b/kernel/timer.c index 207aa4f0aa10..8aadc62efd65 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -57,6 +57,11 @@ static void time_interpolator_update(long delta_nsec); #define TVN_MASK (TVN_SIZE - 1) #define TVR_MASK (TVR_SIZE - 1) +struct timer_base_s { + spinlock_t lock; + struct timer_list *running_timer; +}; + typedef struct tvec_s { struct list_head vec[TVN_SIZE]; } tvec_t; @@ -66,9 +71,8 @@ typedef struct tvec_root_s { } tvec_root_t; struct tvec_t_base_s { - spinlock_t lock; + struct timer_base_s t_base; unsigned long timer_jiffies; - struct timer_list *running_timer; tvec_root_t tv1; tvec_t tv2; tvec_t tv3; @@ -77,18 +81,16 @@ struct tvec_t_base_s { } ____cacheline_aligned_in_smp; typedef struct tvec_t_base_s tvec_base_t; +static DEFINE_PER_CPU(tvec_base_t, tvec_bases); static inline void set_running_timer(tvec_base_t *base, struct timer_list *timer) { #ifdef CONFIG_SMP - base->running_timer = timer; + base->t_base.running_timer = timer; #endif } -/* Fake initialization */ -static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED }; - static void check_timer_failed(struct timer_list *timer) { static int whine_count; @@ -103,7 +105,6 @@ static void check_timer_failed(struct timer_list *timer) /* * Now fix it up */ - 
spin_lock_init(&timer->lock); timer->magic = TIMER_MAGIC; } @@ -156,65 +157,113 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) list_add_tail(&timer->entry, vec); } +typedef struct timer_base_s timer_base_t; +/* + * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases) + * at compile time, and we need timer->base to lock the timer. + */ +timer_base_t __init_timer_base + ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED }; +EXPORT_SYMBOL(__init_timer_base); + +/*** + * init_timer - initialize a timer. + * @timer: the timer to be initialized + * + * init_timer() must be done to a timer prior calling *any* of the + * other timer functions. + */ +void fastcall init_timer(struct timer_list *timer) +{ + timer->entry.next = NULL; + timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; + timer->magic = TIMER_MAGIC; +} +EXPORT_SYMBOL(init_timer); + +static inline void detach_timer(struct timer_list *timer, + int clear_pending) +{ + struct list_head *entry = &timer->entry; + + __list_del(entry->prev, entry->next); + if (clear_pending) + entry->next = NULL; + entry->prev = LIST_POISON2; +} + +/* + * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock + * means that all timers which are tied to this base via timer->base are + * locked, and the base itself is locked too. + * + * So __run_timers/migrate_timers can safely modify all timers which could + * be found on ->tvX lists. + * + * When the timer's base is locked, and the timer removed from list, it is + * possible to set timer->base = NULL and drop the lock: the timer remains + * locked. + */ +static timer_base_t *lock_timer_base(struct timer_list *timer, + unsigned long *flags) +{ + timer_base_t *base; + + for (;;) { + base = timer->base; + if (likely(base != NULL)) { + spin_lock_irqsave(&base->lock, *flags); + if (likely(base == timer->base)) + return base; + /* The timer has migrated to another CPU */ + spin_unlock_irqrestore(&base->lock, *flags); + } + cpu_relax(); + } +} + int __mod_timer(struct timer_list *timer, unsigned long expires) { - tvec_base_t *old_base, *new_base; + timer_base_t *base; + tvec_base_t *new_base; unsigned long flags; int ret = 0; BUG_ON(!timer->function); - check_timer(timer); - spin_lock_irqsave(&timer->lock, flags); + base = lock_timer_base(timer, &flags); + + if (timer_pending(timer)) { + detach_timer(timer, 0); + ret = 1; + } + new_base = &__get_cpu_var(tvec_bases); -repeat: - old_base = timer->base; - /* - * Prevent deadlocks via ordering by old_base < new_base. - */ - if (old_base && (new_base != old_base)) { - if (old_base < new_base) { - spin_lock(&new_base->lock); - spin_lock(&old_base->lock); - } else { - spin_lock(&old_base->lock); - spin_lock(&new_base->lock); - } + if (base != &new_base->t_base) { /* - * The timer base might have been cancelled while we were - * trying to take the lock(s): + * We are trying to schedule the timer on the local CPU. + * However we can't change timer's base while it is running, + * otherwise del_timer_sync() can't detect that the timer's + * handler yet has not finished. This also guarantees that + * the timer is serialized wrt itself. 
*/ - if (timer->base != old_base) { - spin_unlock(&new_base->lock); - spin_unlock(&old_base->lock); - goto repeat; - } - } else { - spin_lock(&new_base->lock); - if (timer->base != old_base) { - spin_unlock(&new_base->lock); - goto repeat; + if (unlikely(base->running_timer == timer)) { + /* The timer remains on a former base */ + new_base = container_of(base, tvec_base_t, t_base); + } else { + /* See the comment in lock_timer_base() */ + timer->base = NULL; + spin_unlock(&base->lock); + spin_lock(&new_base->t_base.lock); + timer->base = &new_base->t_base; } } - /* - * Delete the previous timeout (if there was any), and install - * the new one: - */ - if (old_base) { - list_del(&timer->entry); - ret = 1; - } timer->expires = expires; internal_add_timer(new_base, timer); - timer->base = new_base; - - if (old_base && (new_base != old_base)) - spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - spin_unlock_irqrestore(&timer->lock, flags); + spin_unlock_irqrestore(&new_base->t_base.lock, flags); return ret; } @@ -232,15 +281,15 @@ void add_timer_on(struct timer_list *timer, int cpu) { tvec_base_t *base = &per_cpu(tvec_bases, cpu); unsigned long flags; - + BUG_ON(timer_pending(timer) || !timer->function); check_timer(timer); - spin_lock_irqsave(&base->lock, flags); + spin_lock_irqsave(&base->t_base.lock, flags); + timer->base = &base->t_base; internal_add_timer(base, timer); - timer->base = base; - spin_unlock_irqrestore(&base->lock, flags); + spin_unlock_irqrestore(&base->t_base.lock, flags); } @@ -295,27 +344,22 @@ EXPORT_SYMBOL(mod_timer); */ int del_timer(struct timer_list *timer) { + timer_base_t *base; unsigned long flags; - tvec_base_t *base; + int ret = 0; check_timer(timer); -repeat: - base = timer->base; - if (!base) - return 0; - spin_lock_irqsave(&base->lock, flags); - if (base != timer->base) { + if (timer_pending(timer)) { + base = lock_timer_base(timer, &flags); + if (timer_pending(timer)) { + detach_timer(timer, 1); + ret = 1; + } spin_unlock_irqrestore(&base->lock, flags); - goto repeat; } - list_del(&timer->entry); - /* Need to make sure that anybody who sees a NULL base also sees the list ops */ - smp_wmb(); - timer->base = NULL; - spin_unlock_irqrestore(&base->lock, flags); - return 1; + return ret; } EXPORT_SYMBOL(del_timer); @@ -332,72 +376,39 @@ EXPORT_SYMBOL(del_timer); * Synchronization rules: callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from * interrupt contexts. The caller must not hold locks which would prevent - * completion of the timer's handler. Upon exit the timer is not queued and - * the handler is not running on any CPU. + * completion of the timer's handler. The timer's handler must not call + * add_timer_on(). Upon exit the timer is not queued and the handler is + * not running on any CPU. * * The function returns whether it has deactivated a pending timer or not. - * - * del_timer_sync() is slow and complicated because it copes with timer - * handlers which re-arm the timer (periodic timers). If the timer handler - * is known to not do this (a single shot timer) then use - * del_singleshot_timer_sync() instead. 
*/ int del_timer_sync(struct timer_list *timer) { - tvec_base_t *base; - int i, ret = 0; + timer_base_t *base; + unsigned long flags; + int ret = -1; check_timer(timer); -del_again: - ret += del_timer(timer); + do { + base = lock_timer_base(timer, &flags); - for_each_online_cpu(i) { - base = &per_cpu(tvec_bases, i); - if (base->running_timer == timer) { - while (base->running_timer == timer) { - cpu_relax(); - preempt_check_resched(); - } - break; + if (base->running_timer == timer) + goto unlock; + + ret = 0; + if (timer_pending(timer)) { + detach_timer(timer, 1); + ret = 1; } - } - smp_rmb(); - if (timer_pending(timer)) - goto del_again; +unlock: + spin_unlock_irqrestore(&base->lock, flags); + } while (ret < 0); return ret; } -EXPORT_SYMBOL(del_timer_sync); -/*** - * del_singleshot_timer_sync - deactivate a non-recursive timer - * @timer: the timer to be deactivated - * - * This function is an optimization of del_timer_sync for the case where the - * caller can guarantee the timer does not reschedule itself in its timer - * function. - * - * Synchronization rules: callers must prevent restarting of the timer, - * otherwise this function is meaningless. It must not be called from - * interrupt contexts. The caller must not hold locks which wold prevent - * completion of the timer's handler. Upon exit the timer is not queued and - * the handler is not running on any CPU. - * - * The function returns whether it has deactivated a pending timer or not. - */ -int del_singleshot_timer_sync(struct timer_list *timer) -{ - int ret = del_timer(timer); - - if (!ret) { - ret = del_timer_sync(timer); - BUG_ON(ret); - } - - return ret; -} -EXPORT_SYMBOL(del_singleshot_timer_sync); +EXPORT_SYMBOL(del_timer_sync); #endif static int cascade(tvec_base_t *base, tvec_t *tv, int index) @@ -415,7 +426,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) struct timer_list *tmp; tmp = list_entry(curr, struct timer_list, entry); - BUG_ON(tmp->base != base); + BUG_ON(tmp->base != &base->t_base); curr = curr->next; internal_add_timer(base, tmp); } @@ -437,7 +448,7 @@ static inline void __run_timers(tvec_base_t *base) { struct timer_list *timer; - spin_lock_irq(&base->lock); + spin_lock_irq(&base->t_base.lock); while (time_after_eq(jiffies, base->timer_jiffies)) { struct list_head work_list = LIST_HEAD_INIT(work_list); struct list_head *head = &work_list; @@ -453,8 +464,7 @@ static inline void __run_timers(tvec_base_t *base) cascade(base, &base->tv5, INDEX(3)); ++base->timer_jiffies; list_splice_init(base->tv1.vec + index, &work_list); -repeat: - if (!list_empty(head)) { + while (!list_empty(head)) { void (*fn)(unsigned long); unsigned long data; @@ -462,11 +472,9 @@ static inline void __run_timers(tvec_base_t *base) fn = timer->function; data = timer->data; - list_del(&timer->entry); set_running_timer(base, timer); - smp_wmb(); - timer->base = NULL; - spin_unlock_irq(&base->lock); + detach_timer(timer, 1); + spin_unlock_irq(&base->t_base.lock); { u32 preempt_count = preempt_count(); fn(data); @@ -475,12 +483,11 @@ static inline void __run_timers(tvec_base_t *base) BUG(); } } - spin_lock_irq(&base->lock); - goto repeat; + spin_lock_irq(&base->t_base.lock); } } set_running_timer(base, NULL); - spin_unlock_irq(&base->lock); + spin_unlock_irq(&base->t_base.lock); } #ifdef CONFIG_NO_IDLE_HZ @@ -499,7 +506,7 @@ unsigned long next_timer_interrupt(void) int i, j; base = &__get_cpu_var(tvec_bases); - spin_lock(&base->lock); + spin_lock(&base->t_base.lock); expires = base->timer_jiffies + (LONG_MAX >> 1); list = 
0; @@ -547,7 +554,7 @@ unsigned long next_timer_interrupt(void) expires = nte->expires; } } - spin_unlock(&base->lock); + spin_unlock(&base->t_base.lock); return expires; } #endif @@ -1286,9 +1293,9 @@ static void __devinit init_timers_cpu(int cpu) { int j; tvec_base_t *base; - + base = &per_cpu(tvec_bases, cpu); - spin_lock_init(&base->lock); + spin_lock_init(&base->t_base.lock); for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); INIT_LIST_HEAD(base->tv4.vec + j); @@ -1302,22 +1309,16 @@ static void __devinit init_timers_cpu(int cpu) } #ifdef CONFIG_HOTPLUG_CPU -static int migrate_timer_list(tvec_base_t *new_base, struct list_head *head) +static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) { struct timer_list *timer; while (!list_empty(head)) { timer = list_entry(head->next, struct timer_list, entry); - /* We're locking backwards from __mod_timer order here, beware deadlock. */ - if (!spin_trylock(&timer->lock)) - return 0; - list_del(&timer->entry); + detach_timer(timer, 0); + timer->base = &new_base->t_base; internal_add_timer(new_base, timer); - timer->base = new_base; - spin_unlock(&timer->lock); } - return 1; } static void __devinit migrate_timers(int cpu) @@ -1331,39 +1332,24 @@ static void __devinit migrate_timers(int cpu) new_base = &get_cpu_var(tvec_bases); local_irq_disable(); -again: - /* Prevent deadlocks via ordering by old_base < new_base. */ - if (old_base < new_base) { - spin_lock(&new_base->lock); - spin_lock(&old_base->lock); - } else { - spin_lock(&old_base->lock); - spin_lock(&new_base->lock); - } + spin_lock(&new_base->t_base.lock); + spin_lock(&old_base->t_base.lock); - if (old_base->running_timer) + if (old_base->t_base.running_timer) BUG(); for (i = 0; i < TVR_SIZE; i++) - if (!migrate_timer_list(new_base, old_base->tv1.vec + i)) - goto unlock_again; - for (i = 0; i < TVN_SIZE; i++) - if (!migrate_timer_list(new_base, old_base->tv2.vec + i) - || !migrate_timer_list(new_base, old_base->tv3.vec + i) - || !migrate_timer_list(new_base, old_base->tv4.vec + i) - || !migrate_timer_list(new_base, old_base->tv5.vec + i)) - goto unlock_again; - spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); + migrate_timer_list(new_base, old_base->tv1.vec + i); + for (i = 0; i < TVN_SIZE; i++) { + migrate_timer_list(new_base, old_base->tv2.vec + i); + migrate_timer_list(new_base, old_base->tv3.vec + i); + migrate_timer_list(new_base, old_base->tv4.vec + i); + migrate_timer_list(new_base, old_base->tv5.vec + i); + } + + spin_unlock(&old_base->t_base.lock); + spin_unlock(&new_base->t_base.lock); local_irq_enable(); put_cpu_var(tvec_bases); - return; - -unlock_again: - /* Avoid deadlock with __mod_timer, by backing off. */ - spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - cpu_relax(); - goto again; } #endif /* CONFIG_HOTPLUG_CPU */ From fd450b7318b75343fd76b3d95416853e34e72c95 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jun 2005 00:08:59 -0700 Subject: [PATCH 103/199] [PATCH] timers: introduce try_to_del_timer_sync() This patch splits del_timer_sync() into 2 functions. The new one, try_to_del_timer_sync(), returns -1 when it hits an executing timer. It can be used in interrupt context, or when the caller holds locks which can prevent completion of the timer's handler. NOTE. Currently it can't be used in interrupt context in the UP case, because ->running_timer is used only with CONFIG_SMP. Should the need arise, it is possible to kill #ifdef CONFIG_SMP in set_running_timer(); it is cheap.
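A sketch of the intended calling pattern, using a made-up my_dev structure; the return convention (-1 handler running, 0 not pending, 1 deactivated) is the one introduced above:

/* A caller that holds a lock which the timer handler also takes must
 * not block in del_timer_sync(); instead it drops the lock and retries
 * whenever try_to_del_timer_sync() reports the handler running (-1). */
void stop_dev_timer(struct my_dev *dev)
{
        for (;;) {
                spin_lock_irq(&dev->lock);
                if (try_to_del_timer_sync(&dev->timer) >= 0) {
                        spin_unlock_irq(&dev->lock);
                        return;         /* not queued, not running */
                }
                spin_unlock_irq(&dev->lock);    /* let the handler finish */
                cpu_relax();
        }
}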
Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/timer.h | 4 +++- kernel/timer.c | 53 +++++++++++++++++++++++++++---------------- 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/include/linux/timer.h b/include/linux/timer.h index 2e78fedfc069..221f81ac2002 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -76,9 +76,11 @@ static inline void add_timer(struct timer_list * timer) } #ifdef CONFIG_SMP + extern int try_to_del_timer_sync(struct timer_list *timer); extern int del_timer_sync(struct timer_list *timer); #else -# define del_timer_sync(t) del_timer(t) +# define try_to_del_timer_sync(t) del_timer(t) +# define del_timer_sync(t) del_timer(t) #endif #define del_singleshot_timer_sync(t) del_timer_sync(t) diff --git a/kernel/timer.c b/kernel/timer.c index 8aadc62efd65..1f986c16d89f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -365,6 +365,34 @@ int del_timer(struct timer_list *timer) EXPORT_SYMBOL(del_timer); #ifdef CONFIG_SMP +/* + * This function tries to deactivate a timer. Upon successful (ret >= 0) + * exit the timer is not queued and the handler is not running on any CPU. + * + * It must not be called from interrupt contexts. + */ +int try_to_del_timer_sync(struct timer_list *timer) +{ + timer_base_t *base; + unsigned long flags; + int ret = -1; + + base = lock_timer_base(timer, &flags); + + if (base->running_timer == timer) + goto out; + + ret = 0; + if (timer_pending(timer)) { + detach_timer(timer, 1); + ret = 1; + } +out: + spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} + /*** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated @@ -384,28 +412,13 @@ EXPORT_SYMBOL(del_timer); */ int del_timer_sync(struct timer_list *timer) { - timer_base_t *base; - unsigned long flags; - int ret = -1; - check_timer(timer); - do { - base = lock_timer_base(timer, &flags); - - if (base->running_timer == timer) - goto unlock; - - ret = 0; - if (timer_pending(timer)) { - detach_timer(timer, 1); - ret = 1; - } -unlock: - spin_unlock_irqrestore(&base->lock, flags); - } while (ret < 0); - - return ret; + for (;;) { + int ret = try_to_del_timer_sync(timer); + if (ret >= 0) + return ret; + } } EXPORT_SYMBOL(del_timer_sync); From f972be33ce6a08b5f096ba013c7459a3a82f5f39 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jun 2005 00:09:00 -0700 Subject: [PATCH 104/199] [PATCH] posix-timers: use try_to_del_timer_sync() sys_timer_settime/sys_timer_delete needs to delete k_itimer->real.timer synchronously while holding ->it_lock, which is also locked in posix_timer_fn. This patch removes timer_active/set_timer_inactive which plays with timer_list's internals in favour of using try_to_del_timer_sync(), which was introduced in the previous patch. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index cabb63fc9e16..5b7b4736d82b 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -88,23 +88,6 @@ static kmem_cache_t *posix_timers_cache; static struct idr posix_timers_id; static DEFINE_SPINLOCK(idr_lock); -/* - * Just because the timer is not in the timer list does NOT mean it is - * inactive. It could be in the "fire" routine getting a new expire time. 
- */ -#define TIMER_INACTIVE 1 - -#ifdef CONFIG_SMP -# define timer_active(tmr) \ - ((tmr)->it.real.timer.entry.prev != (void *)TIMER_INACTIVE) -# define set_timer_inactive(tmr) \ - do { \ - (tmr)->it.real.timer.entry.prev = (void *)TIMER_INACTIVE; \ - } while (0) -#else -# define timer_active(tmr) BARFY // error to use outside of SMP -# define set_timer_inactive(tmr) do { } while (0) -#endif /* * we assume that the new SIGEV_THREAD_ID shares no bits with the other * SIGEV values. Here we put out an error if this assumption fails. @@ -226,7 +209,6 @@ static inline int common_timer_create(struct k_itimer *new_timer) init_timer(&new_timer->it.real.timer); new_timer->it.real.timer.data = (unsigned long) new_timer; new_timer->it.real.timer.function = posix_timer_fn; - set_timer_inactive(new_timer); return 0; } @@ -480,7 +462,6 @@ static void posix_timer_fn(unsigned long __data) int do_notify = 1; spin_lock_irqsave(&timr->it_lock, flags); - set_timer_inactive(timr); if (!list_empty(&timr->it.real.abs_timer_entry)) { spin_lock(&abs_list.lock); do { @@ -983,8 +964,8 @@ common_timer_set(struct k_itimer *timr, int flags, * careful here. If smp we could be in the "fire" routine which will * be spinning as we hold the lock. But this is ONLY an SMP issue. */ + if (try_to_del_timer_sync(&timr->it.real.timer) < 0) { #ifdef CONFIG_SMP - if (timer_active(timr) && !del_timer(&timr->it.real.timer)) /* * It can only be active if on an other cpu. Since * we have cleared the interval stuff above, it should @@ -994,11 +975,9 @@ common_timer_set(struct k_itimer *timr, int flags, * a "retry" exit status. */ return TIMER_RETRY; - - set_timer_inactive(timr); -#else - del_timer(&timr->it.real.timer); #endif + } + remove_from_abslist(timr); timr->it_requeue_pending = (timr->it_requeue_pending + 2) & @@ -1083,8 +1062,9 @@ sys_timer_settime(timer_t timer_id, int flags, static inline int common_timer_del(struct k_itimer *timer) { timer->it.real.incr = 0; + + if (try_to_del_timer_sync(&timer->it.real.timer) < 0) { #ifdef CONFIG_SMP - if (timer_active(timer) && !del_timer(&timer->it.real.timer)) /* * It can only be active if on an other cpu. Since * we have cleared the interval stuff above, it should @@ -1094,9 +1074,9 @@ static inline int common_timer_del(struct k_itimer *timer) * a "retry" exit status. */ return TIMER_RETRY; -#else - del_timer(&timer->it.real.timer); #endif + } + remove_from_abslist(timer); return 0; From 991114c6fa6a21d1fa4d544abe78592352860c82 Mon Sep 17 00:00:00 2001 From: Alexander Viro Date: Thu, 23 Jun 2005 00:09:01 -0700 Subject: [PATCH 105/199] [PATCH] fix for prune_icache()/forced final iput() races Based on analysis and a patch from Russ Weight There is a race condition that can occur if an inode is allocated and then released (using iput) during the ->fill_super functions. The race condition is between kswapd and mount. For most filesystems this can only happen in an error path when kswapd is running concurrently. For isofs, however, the error can occur in a more common code path (which is how the bug was found). The logic here is "we want final iput() to free inode *now* instead of letting it sit in cache if fs is going down or had not quite come up". The problem is with kswapd seeing such inodes in the middle of being killed and happily taking over. The clean solution would be to tell kswapd to leave those inodes alone and let our final iput deal with them. I.e. add a new flag (I_FORCED_FREEING), set it before write_inode_now() there and make prune_icache() leave those alone. 
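(The flag lands in the diff below as I_WILL_FREE rather than the I_FORCED_FREEING name floated above.) Condensed from the generic_forget_inode() hunk, the handshake is:

spin_lock(&inode_lock);
inode->i_state |= I_WILL_FREE;  /* prune_icache()/igrab() now skip us */
spin_unlock(&inode_lock);
write_inode_now(inode, 1);      /* flush without kswapd taking over */
spin_lock(&inode_lock);
inode->i_state &= ~I_WILL_FREE;
/* ... unhash and unlist ... */
inode->i_state |= I_FREEING;    /* proceed with the final free */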
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 16 ++++++++++------ include/linux/fs.h | 1 + 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 801fe7f36280..1f9a3a2b89bc 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -500,7 +500,7 @@ static struct inode * find_inode(struct super_block * sb, struct hlist_head *hea continue; if (!test(inode, data)) continue; - if (inode->i_state & (I_FREEING|I_CLEAR)) { + if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; } @@ -525,7 +525,7 @@ static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head continue; if (inode->i_sb != sb) continue; - if (inode->i_state & (I_FREEING|I_CLEAR)) { + if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; } @@ -727,7 +727,7 @@ EXPORT_SYMBOL(iunique); struct inode *igrab(struct inode *inode) { spin_lock(&inode_lock); - if (!(inode->i_state & I_FREEING)) + if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) __iget(inode); else /* @@ -1024,17 +1024,21 @@ static void generic_forget_inode(struct inode *inode) if (!(inode->i_state & (I_DIRTY|I_LOCK))) list_move(&inode->i_list, &inode_unused); inodes_stat.nr_unused++; - spin_unlock(&inode_lock); - if (!sb || (sb->s_flags & MS_ACTIVE)) + if (!sb || (sb->s_flags & MS_ACTIVE)) { + spin_unlock(&inode_lock); return; + } + inode->i_state |= I_WILL_FREE; + spin_unlock(&inode_lock); write_inode_now(inode, 1); spin_lock(&inode_lock); + inode->i_state &= ~I_WILL_FREE; inodes_stat.nr_unused--; hlist_del_init(&inode->i_hash); } list_del_init(&inode->i_list); list_del_init(&inode->i_sb_list); - inode->i_state|=I_FREEING; + inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); if (inode->i_data.nrpages) diff --git a/include/linux/fs.h b/include/linux/fs.h index e5a8db00df29..3622e952e98c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1025,6 +1025,7 @@ struct super_operations { #define I_FREEING 16 #define I_CLEAR 32 #define I_NEW 64 +#define I_WILL_FREE 128 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) From 543537bd922692bc978e2e356fcd8bfc9c2ee7d5 Mon Sep 17 00:00:00 2001 From: Paulo Marques Date: Thu, 23 Jun 2005 00:09:02 -0700 Subject: [PATCH 106/199] [PATCH] create a kstrdup library function This patch creates a new kstrdup library function and changes the "local" implementations in several places to use this function. Most of the changes come from the sound and net subsystems. The sound part had already been acknowledged by Takashi Iwai and the net part by David S. Miller. I left UML alone for now because I would need more time to read the code carefully before making changes there. 
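The helper's contract is simple: a NULL input or an allocation failure yields NULL, otherwise the caller owns the copy and must kfree() it. A typical call site after conversion might look like this (set_label() and struct thing are hypothetical):

static int set_label(struct thing *t, const char *name)
{
        char *copy = kstrdup(name, GFP_KERNEL);

        if (!copy)
                return -ENOMEM;         /* kstrdup() returns NULL on failure */
        kfree(t->label);                /* drop any previous copy */
        t->label = copy;                /* freed again at teardown with kfree() */
        return 0;
}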
Signed-off-by: Paulo Marques Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm-ioctl.c | 14 +++--------- drivers/parport/probe.c | 18 +++++---------- include/linux/netdevice.h | 4 ---- include/linux/string.h | 2 ++ include/sound/core.h | 3 ++- mm/slab.c | 24 ++++++++++++++++++++ net/core/neighbour.c | 3 ++- net/core/sysctl_net_core.c | 15 ------------- net/ipv4/devinet.c | 2 +- net/ipv6/addrconf.c | 3 ++- net/sunrpc/svcauth_unix.c | 11 ++-------- sound/core/info.c | 3 ++- sound/core/info_oss.c | 3 ++- sound/core/memory.c | 41 ++++++++++++----------------------- sound/core/oss/mixer_oss.c | 3 ++- sound/core/oss/pcm_oss.c | 3 ++- sound/core/sound.c | 2 +- sound/core/timer.c | 3 ++- sound/isa/gus/gus_mem.c | 7 +++--- sound/pci/hda/patch_realtek.c | 2 +- sound/synth/emux/emux.c | 3 ++- 21 files changed, 75 insertions(+), 94 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index ee3c869d9701..200a0688f717 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -122,14 +122,6 @@ static struct hash_cell *__get_uuid_cell(const char *str) /*----------------------------------------------------------------- * Inserting, removing and renaming a device. *---------------------------------------------------------------*/ -static inline char *kstrdup(const char *str) -{ - char *r = kmalloc(strlen(str) + 1, GFP_KERNEL); - if (r) - strcpy(r, str); - return r; -} - static struct hash_cell *alloc_cell(const char *name, const char *uuid, struct mapped_device *md) { @@ -139,7 +131,7 @@ static struct hash_cell *alloc_cell(const char *name, const char *uuid, if (!hc) return NULL; - hc->name = kstrdup(name); + hc->name = kstrdup(name, GFP_KERNEL); if (!hc->name) { kfree(hc); return NULL; @@ -149,7 +141,7 @@ static struct hash_cell *alloc_cell(const char *name, const char *uuid, hc->uuid = NULL; else { - hc->uuid = kstrdup(uuid); + hc->uuid = kstrdup(uuid, GFP_KERNEL); if (!hc->uuid) { kfree(hc->name); kfree(hc); @@ -273,7 +265,7 @@ static int dm_hash_rename(const char *old, const char *new) /* * duplicate new. 
*/ - new_name = kstrdup(new); + new_name = kstrdup(new, GFP_KERNEL); if (!new_name) return -ENOMEM; diff --git a/drivers/parport/probe.c b/drivers/parport/probe.c index c94963145e17..6e6f42d01e64 100644 --- a/drivers/parport/probe.c +++ b/drivers/parport/probe.c @@ -48,14 +48,6 @@ static void pretty_print(struct parport *port, int device) printk("\n"); } -static char *strdup(char *str) -{ - int n = strlen(str)+1; - char *s = kmalloc(n, GFP_KERNEL); - if (!s) return NULL; - return strcpy(s, str); -} - static void parse_data(struct parport *port, int device, char *str) { char *txt = kmalloc(strlen(str)+1, GFP_KERNEL); @@ -88,16 +80,16 @@ static void parse_data(struct parport *port, int device, char *str) if (!strcmp(p, "MFG") || !strcmp(p, "MANUFACTURER")) { if (info->mfr) kfree (info->mfr); - info->mfr = strdup(sep); + info->mfr = kstrdup(sep, GFP_KERNEL); } else if (!strcmp(p, "MDL") || !strcmp(p, "MODEL")) { if (info->model) kfree (info->model); - info->model = strdup(sep); + info->model = kstrdup(sep, GFP_KERNEL); } else if (!strcmp(p, "CLS") || !strcmp(p, "CLASS")) { int i; if (info->class_name) kfree (info->class_name); - info->class_name = strdup(sep); + info->class_name = kstrdup(sep, GFP_KERNEL); for (u = sep; *u; u++) *u = toupper(*u); for (i = 0; classes[i].token; i++) { @@ -112,7 +104,7 @@ static void parse_data(struct parport *port, int device, char *str) !strcmp(p, "COMMAND SET")) { if (info->cmdset) kfree (info->cmdset); - info->cmdset = strdup(sep); + info->cmdset = kstrdup(sep, GFP_KERNEL); /* if it speaks printer language, it's probably a printer */ if (strstr(sep, "PJL") || strstr(sep, "PCL")) @@ -120,7 +112,7 @@ static void parse_data(struct parport *port, int device, char *str) } else if (!strcmp(p, "DES") || !strcmp(p, "DESCRIPTION")) { if (info->description) kfree (info->description); - info->description = strdup(sep); + info->description = kstrdup(sep, GFP_KERNEL); } } rock_on: diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d6afd440cf7b..d89816ad642f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -925,10 +925,6 @@ extern int skb_checksum_help(struct sk_buff *skb, int inward); extern void net_enable_timestamp(void); extern void net_disable_timestamp(void); -#ifdef CONFIG_SYSCTL -extern char *net_sysctl_strdup(const char *s); -#endif - #endif /* __KERNEL__ */ #endif /* _LINUX_DEV_H */ diff --git a/include/linux/string.h b/include/linux/string.h index b9fc59469956..93994c613095 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -88,6 +88,8 @@ extern int memcmp(const void *,const void *,__kernel_size_t); extern void * memchr(const void *,int,__kernel_size_t); #endif +extern char *kstrdup(const char *s, int gfp); + #ifdef __cplusplus } #endif diff --git a/include/sound/core.h b/include/sound/core.h index 9117c23e3a01..f8c4ef0aa352 100644 --- a/include/sound/core.h +++ b/include/sound/core.h @@ -292,6 +292,7 @@ void *snd_hidden_kcalloc(size_t n, size_t size, int flags); void snd_hidden_kfree(const void *obj); void *snd_hidden_vmalloc(unsigned long size); void snd_hidden_vfree(void *obj); +char *snd_hidden_kstrdup(const char *s, int flags); #define kmalloc(size, flags) snd_hidden_kmalloc(size, flags) #define kcalloc(n, size, flags) snd_hidden_kcalloc(n, size, flags) #define kfree(obj) snd_hidden_kfree(obj) @@ -301,6 +302,7 @@ void snd_hidden_vfree(void *obj); #define vmalloc_nocheck(size) snd_wrapper_vmalloc(size) #define kfree_nocheck(obj) snd_wrapper_kfree(obj) #define vfree_nocheck(obj) 
snd_wrapper_vfree(obj) +#define kstrdup(s, flags) snd_hidden_kstrdup(s, flags) #else #define snd_memory_init() /*NOP*/ #define snd_memory_done() /*NOP*/ @@ -311,7 +313,6 @@ void snd_hidden_vfree(void *obj); #define kfree_nocheck(obj) kfree(obj) #define vfree_nocheck(obj) vfree(obj) #endif -char *snd_kmalloc_strdup(const char *string, int flags); int copy_to_user_fromio(void __user *dst, const volatile void __iomem *src, size_t count); int copy_from_user_toio(volatile void __iomem *dst, const void __user *src, size_t count); diff --git a/mm/slab.c b/mm/slab.c index 93cbbbb39f42..122d031baab2 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -92,6 +92,7 @@ #include #include #include +#include #include #include @@ -3082,3 +3083,26 @@ unsigned int ksize(const void *objp) return size; } + + +/* + * kstrdup - allocate space for and copy an existing string + * + * @s: the string to duplicate + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + */ +char *kstrdup(const char *s, int gfp) +{ + size_t len; + char *buf; + + if (!s) + return NULL; + + len = strlen(s) + 1; + buf = kmalloc(len, gfp); + if (buf) + memcpy(buf, s, len); + return buf; +} +EXPORT_SYMBOL(kstrdup); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f6bdcad47da6..851eb927ed97 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -32,6 +32,7 @@ #include #include #include +#include #define NEIGH_DEBUG 1 @@ -2592,7 +2593,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, t->neigh_vars[17].extra1 = dev; } - dev_name = net_sysctl_strdup(dev_name_source); + dev_name = kstrdup(dev_name_source, GFP_KERNEL); if (!dev_name) { err = -ENOBUFS; goto free; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index c8be646cb191..880a88815211 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -35,19 +35,6 @@ extern int sysctl_somaxconn; extern char sysctl_divert_version[]; #endif /* CONFIG_NET_DIVERT */ -/* - * This strdup() is used for creating copies of network - * device names to be handed over to sysctl. - */ - -char *net_sysctl_strdup(const char *s) -{ - char *rv = kmalloc(strlen(s)+1, GFP_KERNEL); - if (rv) - strcpy(rv, s); - return rv; -} - ctl_table core_table[] = { #ifdef CONFIG_NET { @@ -177,6 +164,4 @@ ctl_table core_table[] = { { .ctl_name = 0 } }; -EXPORT_SYMBOL(net_sysctl_strdup); - #endif diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 650dcb12d9a1..d8a10e3dd77d 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1471,7 +1471,7 @@ static void devinet_sysctl_register(struct in_device *in_dev, * by sysctl and we wouldn't want anyone to change it under our feet * (see SIOCSIFNAME). */ - dev_name = net_sysctl_strdup(dev_name); + dev_name = kstrdup(dev_name, GFP_KERNEL); if (!dev_name) goto free; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 14f5c53235fe..a54d4ef3fd35 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -57,6 +57,7 @@ #endif #include #include +#include #include #include @@ -3437,7 +3438,7 @@ static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf * by sysctl and we wouldn't want anyone to change it under our feet * (see SIOCSIFNAME). 
*/ - dev_name = net_sysctl_strdup(dev_name); + dev_name = kstrdup(dev_name, GFP_KERNEL); if (!dev_name) goto free; diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 2b99b4028d31..d6baf6fdf8a9 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -8,6 +8,7 @@ #include #include #include +#include #define RPCDBG_FACILITY RPCDBG_AUTH @@ -20,14 +21,6 @@ */ -static char *strdup(char *s) -{ - char *rv = kmalloc(strlen(s)+1, GFP_KERNEL); - if (rv) - strcpy(rv, s); - return rv; -} - struct unix_domain { struct auth_domain h; int addr_changes; @@ -55,7 +48,7 @@ struct auth_domain *unix_domain_find(char *name) if (new == NULL) return NULL; cache_init(&new->h.h); - new->h.name = strdup(name); + new->h.name = kstrdup(name, GFP_KERNEL); new->h.flavour = RPC_AUTH_UNIX; new->addr_changes = 0; new->h.h.expiry_time = NEVER; diff --git a/sound/core/info.c b/sound/core/info.c index 31faffe01cb0..5e122bbe7c92 100644 --- a/sound/core/info.c +++ b/sound/core/info.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -754,7 +755,7 @@ static snd_info_entry_t *snd_info_create_entry(const char *name) entry = kcalloc(1, sizeof(*entry), GFP_KERNEL); if (entry == NULL) return NULL; - entry->name = snd_kmalloc_strdup(name, GFP_KERNEL); + entry->name = kstrdup(name, GFP_KERNEL); if (entry->name == NULL) { kfree(entry); return NULL; diff --git a/sound/core/info_oss.c b/sound/core/info_oss.c index f9e4ce443454..12107968d402 100644 --- a/sound/core/info_oss.c +++ b/sound/core/info_oss.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -51,7 +52,7 @@ int snd_oss_info_register(int dev, int num, char *string) x = NULL; } } else { - x = snd_kmalloc_strdup(string, GFP_KERNEL); + x = kstrdup(string, GFP_KERNEL); if (x == NULL) { up(&strings); return -ENOMEM; diff --git a/sound/core/memory.c b/sound/core/memory.c index 20860fec9364..c1fb28e84330 100644 --- a/sound/core/memory.c +++ b/sound/core/memory.c @@ -184,6 +184,20 @@ void snd_hidden_vfree(void *obj) snd_wrapper_vfree(obj); } +char *snd_hidden_kstrdup(const char *s, int flags) +{ + int len; + char *buf; + + if (!s) return NULL; + + len = strlen(s) + 1; + buf = _snd_kmalloc(len, flags); + if (buf) + memcpy(buf, s, len); + return buf; +} + static void snd_memory_info_read(snd_info_entry_t *entry, snd_info_buffer_t * buffer) { snd_iprintf(buffer, "kmalloc: %li bytes\n", snd_alloc_kmalloc); @@ -214,35 +228,8 @@ int __exit snd_memory_info_done(void) return 0; } -#else - -#define _snd_kmalloc kmalloc - #endif /* CONFIG_SND_DEBUG_MEMORY */ -/** - * snd_kmalloc_strdup - copy the string - * @string: the original string - * @flags: allocation conditions, GFP_XXX - * - * Allocates a memory chunk via kmalloc() and copies the string to it. - * - * Returns the pointer, or NULL if no enoguh memory. 
- */ -char *snd_kmalloc_strdup(const char *string, int flags) -{ - size_t len; - char *ptr; - - if (!string) - return NULL; - len = strlen(string) + 1; - ptr = _snd_kmalloc(len, flags); - if (ptr) - memcpy(ptr, string, len); - return ptr; -} - /** * copy_to_user_fromio - copy data from mmio-space to user-space * @dst: the destination pointer on user-space diff --git a/sound/core/oss/mixer_oss.c b/sound/core/oss/mixer_oss.c index 98ed9a9f0da6..98fc0766f885 100644 --- a/sound/core/oss/mixer_oss.c +++ b/sound/core/oss/mixer_oss.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1137,7 +1138,7 @@ static void snd_mixer_oss_proc_write(snd_info_entry_t *entry, goto __unlock; } tbl->oss_id = ch; - tbl->name = snd_kmalloc_strdup(str, GFP_KERNEL); + tbl->name = kstrdup(str, GFP_KERNEL); if (! tbl->name) { kfree(tbl); goto __unlock; diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index cab30977e7c0..de7444c586f9 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -2360,7 +2361,7 @@ static void snd_pcm_oss_proc_write(snd_info_entry_t *entry, for (setup1 = pstr->oss.setup_list; setup1->next; setup1 = setup1->next); setup1->next = setup; } - template.task_name = snd_kmalloc_strdup(task_name, GFP_KERNEL); + template.task_name = kstrdup(task_name, GFP_KERNEL); } else { buffer->error = -ENOMEM; } diff --git a/sound/core/sound.c b/sound/core/sound.c index 0815fadeb3ec..7612884f530b 100644 --- a/sound/core/sound.c +++ b/sound/core/sound.c @@ -399,8 +399,8 @@ EXPORT_SYMBOL(snd_hidden_kcalloc); EXPORT_SYMBOL(snd_hidden_kfree); EXPORT_SYMBOL(snd_hidden_vmalloc); EXPORT_SYMBOL(snd_hidden_vfree); +EXPORT_SYMBOL(snd_hidden_kstrdup); #endif -EXPORT_SYMBOL(snd_kmalloc_strdup); EXPORT_SYMBOL(copy_to_user_fromio); EXPORT_SYMBOL(copy_from_user_toio); /* init.c */ diff --git a/sound/core/timer.c b/sound/core/timer.c index b498e5482d77..cfaccd415b3b 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -100,7 +101,7 @@ static snd_timer_instance_t *snd_timer_instance_new(char *owner, snd_timer_t *ti timeri = kcalloc(1, sizeof(*timeri), GFP_KERNEL); if (timeri == NULL) return NULL; - timeri->owner = snd_kmalloc_strdup(owner, GFP_KERNEL); + timeri->owner = kstrdup(owner, GFP_KERNEL); if (! 
timeri->owner) { kfree(timeri); return NULL; diff --git a/sound/isa/gus/gus_mem.c b/sound/isa/gus/gus_mem.c index 609838e8ef67..5eb766dd564b 100644 --- a/sound/isa/gus/gus_mem.c +++ b/sound/isa/gus/gus_mem.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -213,7 +214,7 @@ snd_gf1_mem_block_t *snd_gf1_mem_alloc(snd_gf1_mem_t * alloc, int owner, if (share_id != NULL) memcpy(&block.share_id, share_id, sizeof(block.share_id)); block.owner = owner; - block.name = snd_kmalloc_strdup(name, GFP_KERNEL); + block.name = kstrdup(name, GFP_KERNEL); nblock = snd_gf1_mem_xalloc(alloc, &block); snd_gf1_mem_lock(alloc, 1); return nblock; @@ -253,13 +254,13 @@ int snd_gf1_mem_init(snd_gus_card_t * gus) if (gus->gf1.enh_mode) { block.ptr = 0; block.size = 1024; - block.name = snd_kmalloc_strdup("InterWave LFOs", GFP_KERNEL); + block.name = kstrdup("InterWave LFOs", GFP_KERNEL); if (snd_gf1_mem_xalloc(alloc, &block) == NULL) return -ENOMEM; } block.ptr = gus->gf1.default_voice_address; block.size = 4; - block.name = snd_kmalloc_strdup("Voice default (NULL's)", GFP_KERNEL); + block.name = kstrdup("Voice default (NULL's)", GFP_KERNEL); if (snd_gf1_mem_xalloc(alloc, &block) == NULL) return -ENOMEM; #ifdef CONFIG_SND_DEBUG diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 9edd558d6bd3..bab89843d850 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -1781,7 +1781,7 @@ static int add_control(struct alc_spec *spec, int type, const char *name, unsign knew = &spec->kctl_alloc[spec->num_kctl_used]; *knew = alc880_control_templates[type]; - knew->name = snd_kmalloc_strdup(name, GFP_KERNEL); + knew->name = kstrdup(name, GFP_KERNEL); if (! knew->name) return -ENOMEM; knew->private_value = val; diff --git a/sound/synth/emux/emux.c b/sound/synth/emux/emux.c index 16f3b461627a..60d0b2c66698 100644 --- a/sound/synth/emux/emux.c +++ b/sound/synth/emux/emux.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -76,7 +77,7 @@ int snd_emux_register(snd_emux_t *emu, snd_card_t *card, int index, char *name) snd_assert(name != NULL, return -EINVAL); emu->card = card; - emu->name = snd_kmalloc_strdup(name, GFP_KERNEL); + emu->name = kstrdup(name, GFP_KERNEL); emu->voices = kcalloc(emu->max_voices, sizeof(snd_emux_voice_t), GFP_KERNEL); if (emu->voices == NULL) return -ENOMEM; From dfe52244e004f5103478966cd88351feb5c50d79 Mon Sep 17 00:00:00 2001 From: Robert Love Date: Thu, 23 Jun 2005 00:09:04 -0700 Subject: [PATCH 107/199] [PATCH] kstrdup: convert a few existing implementations Convert a bunch of strdup() implementations and their callers to the new kstrdup(). A few remain, for example see sound/core, and there are tons of open coded strdup()'s around. Sigh. But this is a start. 
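For illustration, every conversion in this series follows the same call-site pattern (a minimal sketch with hypothetical names; the exact type of kstrdup()'s gfp argument has varied between trees):

	#include <linux/slab.h>
	#include <linux/string.h>

	/* before: each caller open-codes the allocate-and-copy dance */
	static char *copy_name_old(const char *src)
	{
		char *p = kmalloc(strlen(src) + 1, GFP_KERNEL);
		if (p)
			strcpy(p, src);
		return p;		/* oopses if src is NULL */
	}

	/* after: one shared helper, which also tolerates a NULL source */
	static char *copy_name_new(const char *src)
	{
		return kstrdup(src, GFP_KERNEL);
	}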
Signed-off-by: Robert Love Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/kernel/process_kern.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/um/kernel/process_kern.c b/arch/um/kernel/process_kern.c index 804c6bbdf67c..157584ae4792 100644 --- a/arch/um/kernel/process_kern.c +++ b/arch/um/kernel/process_kern.c @@ -8,6 +8,7 @@ #include "linux/kernel.h" #include "linux/sched.h" #include "linux/interrupt.h" +#include "linux/string.h" #include "linux/mm.h" #include "linux/slab.h" #include "linux/utsname.h" @@ -322,12 +323,7 @@ void do_uml_exitcalls(void) char *uml_strdup(char *string) { - char *new; - - new = kmalloc(strlen(string) + 1, GFP_KERNEL); - if(new == NULL) return(NULL); - strcpy(new, string); - return(new); + return kstrdup(string, GFP_KERNEL); } int copy_to_user_proc(void __user *to, void *from, int size) From ab4af03a4054bd78bcabfb2214c9597201beae35 Mon Sep 17 00:00:00 2001 From: Greg Edwards Date: Thu, 23 Jun 2005 00:09:05 -0700 Subject: [PATCH 108/199] [PATCH] CON_CONSDEV bit not set correctly on last console According to include/linux/console.h, CON_CONSDEV flag should be set on the last console specified on the boot command line: 86 #define CON_PRINTBUFFER (1) 87 #define CON_CONSDEV (2) /* Last on the command line */ 88 #define CON_ENABLED (4) 89 #define CON_BOOT (8) This does not currently happen if there is more than one console specified on the boot commandline. Instead, it gets set on the first console on the command line. This can cause problems for things like kdb that look for the CON_CONSDEV flag to see if the console is valid. Additionally, it doesn't look like CON_CONSDEV is reassigned to the next preferred console at unregister time if the console being unregistered currently has that bit set. Example (from sn2 ia64): elilo vmlinuz root= console=ttyS0 console=ttySG0 in this case, the flags on ttySG console struct will be 0x4 (should be 0x6). Attached patch against bk fixes both issues for the cases I looked at. It uses selected_console (which gets incremented for each console specified on the command line) as the indicator of which console to set CON_CONSDEV on. When adding the console to the list, if the previous one had CON_CONSDEV set, it masks it out. Tested on ia64 and x86. The problem with the current behavior is it breaks overriding the default from the boot line. In the ia64 case, there may be a global append line defining console=a in elilo.conf. Then you want to boot your kernel, and want to override the default by passing console=b on the boot line. elilo constructs the kernel cmdline by starting with the value of the global append line, then tacks on whatever else you specify, which puts console=b last.
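To make the breakage concrete, a consumer in the style of kdb looks roughly like this (a sketch, not kdb's actual code; a real walker must hold console_sem while traversing the list):

	#include <linux/console.h>

	/* The driver behind /dev/console is the one flagged CON_CONSDEV.
	 * With the fix this is the last console= argument; before it,
	 * the first one registered kept the flag. */
	static struct console *find_consdev(void)
	{
		struct console *c;

		for (c = console_drivers; c; c = c->next)
			if (c->flags & CON_CONSDEV)
				break;
		return c;
	}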
Signed-off-by: Greg Edwards Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/printk.c b/kernel/printk.c index 01b58d7d17ff..3a442bfb8bee 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -876,8 +876,10 @@ void register_console(struct console * console) break; console->flags |= CON_ENABLED; console->index = console_cmdline[i].index; - if (i == preferred_console) + if (i == selected_console) { console->flags |= CON_CONSDEV; + preferred_console = selected_console; + } break; } @@ -897,6 +899,8 @@ void register_console(struct console * console) if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { console->next = console_drivers; console_drivers = console; + if (console->next) + console->next->flags &= ~CON_CONSDEV; } else { console->next = console_drivers->next; console_drivers->next = console; @@ -937,10 +941,14 @@ int unregister_console(struct console * console) /* If last console is removed, we re-enable picking the first * one that gets registered. Without that, pmac early boot console * would prevent fbcon from taking over. + * + * If this isn't the last console and it has CON_CONSDEV set, we + * need to set it on the next preferred console. */ if (console_drivers == NULL) preferred_console = selected_console; - + else if (console->flags & CON_CONSDEV) + console_drivers->flags |= CON_CONSDEV; release_console_sem(); return res; From 35a82d1a53e1a9ad54efafcc940f9335beaed5c3 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 23 Jun 2005 00:09:06 -0700 Subject: [PATCH 109/199] [PATCH] optimise loop driver a bit Looks like locking can be optimised quite a lot. Increase lock widths slightly so lo_lock is taken fewer times per request. Also it was quite trivial to cover lo_pending with that lock, and remove the atomic requirement. This also makes memory ordering explicitly correct, which is nice (not that I particularly saw any mem ordering bugs). Test was reading 4 250MB files in parallel on ext2-on-tmpfs filesystem (1K block size, 4K page size). System is 2 socket Xeon with HT (4 thread). 
intel:/home/npiggin# umount /dev/loop0 ; mount /dev/loop0 /mnt/loop ; /usr/bin/time ./mtloop.sh Before: 0.24user 5.51system 0:02.84elapsed 202%CPU (0avgtext+0avgdata 0maxresident)k 0.19user 5.52system 0:02.88elapsed 198%CPU (0avgtext+0avgdata 0maxresident)k 0.19user 5.57system 0:02.89elapsed 198%CPU (0avgtext+0avgdata 0maxresident)k 0.22user 5.51system 0:02.90elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k 0.19user 5.44system 0:02.91elapsed 193%CPU (0avgtext+0avgdata 0maxresident)k After: 0.07user 2.34system 0:01.68elapsed 143%CPU (0avgtext+0avgdata 0maxresident)k 0.06user 2.37system 0:01.68elapsed 144%CPU (0avgtext+0avgdata 0maxresident)k 0.06user 2.39system 0:01.68elapsed 145%CPU (0avgtext+0avgdata 0maxresident)k 0.06user 2.36system 0:01.68elapsed 144%CPU (0avgtext+0avgdata 0maxresident)k 0.06user 2.42system 0:01.68elapsed 147%CPU (0avgtext+0avgdata 0maxresident)k Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/loop.c | 81 +++++++++++++++++++++----------------------- include/linux/loop.h | 2 +- 2 files changed, 39 insertions(+), 44 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6f011d0d8e97..b35e08876dd4 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -472,17 +472,11 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) */ static void loop_add_bio(struct loop_device *lo, struct bio *bio) { - unsigned long flags; - - spin_lock_irqsave(&lo->lo_lock, flags); if (lo->lo_biotail) { lo->lo_biotail->bi_next = bio; lo->lo_biotail = bio; } else lo->lo_bio = lo->lo_biotail = bio; - spin_unlock_irqrestore(&lo->lo_lock, flags); - - up(&lo->lo_bh_mutex); } /* @@ -492,14 +486,12 @@ static struct bio *loop_get_bio(struct loop_device *lo) { struct bio *bio; - spin_lock_irq(&lo->lo_lock); if ((bio = lo->lo_bio)) { if (bio == lo->lo_biotail) lo->lo_biotail = NULL; lo->lo_bio = bio->bi_next; bio->bi_next = NULL; } - spin_unlock_irq(&lo->lo_lock); return bio; } @@ -509,35 +501,28 @@ static int loop_make_request(request_queue_t *q, struct bio *old_bio) struct loop_device *lo = q->queuedata; int rw = bio_rw(old_bio); - if (!lo) - goto out; + if (rw == READA) + rw = READ; + + BUG_ON(!lo || (rw != READ && rw != WRITE)); spin_lock_irq(&lo->lo_lock); if (lo->lo_state != Lo_bound) - goto inactive; - atomic_inc(&lo->lo_pending); - spin_unlock_irq(&lo->lo_lock); - - if (rw == WRITE) { - if (lo->lo_flags & LO_FLAGS_READ_ONLY) - goto err; - } else if (rw == READA) { - rw = READ; - } else if (rw != READ) { - printk(KERN_ERR "loop: unknown command (%x)\n", rw); - goto err; - } + goto out; + if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY))) + goto out; + lo->lo_pending++; loop_add_bio(lo, old_bio); + spin_unlock_irq(&lo->lo_lock); + up(&lo->lo_bh_mutex); return 0; -err: - if (atomic_dec_and_test(&lo->lo_pending)) - up(&lo->lo_bh_mutex); + out: + if (lo->lo_pending == 0) + up(&lo->lo_bh_mutex); + spin_unlock_irq(&lo->lo_lock); bio_io_error(old_bio, old_bio->bi_size); return 0; -inactive: - spin_unlock_irq(&lo->lo_lock); - goto out; } /* @@ -560,13 +545,11 @@ static void do_loop_switch(struct loop_device *, struct switch_request *); static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio) { - int ret; - if (unlikely(!bio->bi_bdev)) { do_loop_switch(lo, bio->bi_private); bio_put(bio); } else { - ret = do_bio_filebacked(lo, bio); + int ret = do_bio_filebacked(lo, bio); bio_endio(bio, bio->bi_size, ret); } } @@ -594,7 +577,7 @@ static int loop_thread(void *data) 
set_user_nice(current, -20); lo->lo_state = Lo_bound; - atomic_inc(&lo->lo_pending); + lo->lo_pending = 1; /* * up sem, we are running @@ -602,26 +585,37 @@ */ up(&lo->lo_sem); for (;;) { - down_interruptible(&lo->lo_bh_mutex); + int pending; + /* - * could be upped because of tear-down, not because of - * pending work + * interruptible just to not contribute to load avg */ - if (!atomic_read(&lo->lo_pending)) + if (down_interruptible(&lo->lo_bh_mutex)) + continue; + + spin_lock_irq(&lo->lo_lock); + + /* + * could be upped because of tear-down, not pending work + */ + if (unlikely(!lo->lo_pending)) { + spin_unlock_irq(&lo->lo_lock); break; + } bio = loop_get_bio(lo); - if (!bio) { - printk("loop: missing bio\n"); - continue; - } + lo->lo_pending--; + pending = lo->lo_pending; + spin_unlock_irq(&lo->lo_lock); + + BUG_ON(!bio); loop_handle_bio(lo, bio); /* * upped both for pending work and tear-down, lo_pending * will hit zero then */ - if (atomic_dec_and_test(&lo->lo_pending)) + if (unlikely(!pending)) break; } @@ -900,7 +894,8 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev) spin_lock_irq(&lo->lo_lock); lo->lo_state = Lo_rundown; - if (atomic_dec_and_test(&lo->lo_pending)) + lo->lo_pending--; + if (!lo->lo_pending) up(&lo->lo_bh_mutex); spin_unlock_irq(&lo->lo_lock); diff --git a/include/linux/loop.h b/include/linux/loop.h index 8220d9c9da00..53fa51595443 100644 --- a/include/linux/loop.h +++ b/include/linux/loop.h @@ -61,7 +61,7 @@ struct loop_device { struct semaphore lo_sem; struct semaphore lo_ctl_mutex; struct semaphore lo_bh_mutex; - atomic_t lo_pending; + int lo_pending; request_queue_t *lo_queue; }; From dcd497f99a1ef29a7c5e76142965be77e9dacabd Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 23 Jun 2005 00:09:07 -0700 Subject: [PATCH 110/199] [PATCH] streamline preempt_count type across archs The preempt_count member of struct thread_info is currently either defined as int, unsigned int or __s32 depending on arch. This patch makes the type of preempt_count an int on all archs. Having preempt_count be an unsigned type prevents the catching of preempt_count < 0 bugs, and using int on some archs and __s32 on others is not exactly "neat" - much nicer when it's just int all over. A previous version of this patch was already ACK'ed by Robert Love, and the only change in this version of the patch compared to the one he ACK'ed is that this one also makes sure the preempt_count member is consistently commented.
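The underflow argument is easy to demonstrate in isolation (a stand-alone C sketch, not kernel code):

	#include <assert.h>

	int main(void)
	{
		unsigned int u = 0;
		int s = 0;

		u--;			/* wraps to UINT_MAX */
		s--;			/* becomes -1 */

		assert(!(u < 0));	/* vacuously true: an unsigned count never
					 * tests negative, so a preempt_count < 0
					 * check can never catch the underflow */
		assert(s < 0);		/* a signed count exposes the bug */
		return 0;
	}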
Signed-off-by: Jesper Juhl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-arm/thread_info.h | 2 +- include/asm-arm26/thread_info.h | 2 +- include/asm-cris/thread_info.h | 2 +- include/asm-frv/thread_info.h | 2 +- include/asm-h8300/thread_info.h | 2 +- include/asm-i386/thread_info.h | 2 +- include/asm-ia64/thread_info.h | 2 +- include/asm-m32r/thread_info.h | 2 +- include/asm-m68k/thread_info.h | 2 +- include/asm-m68knommu/thread_info.h | 2 +- include/asm-mips/thread_info.h | 2 +- include/asm-parisc/thread_info.h | 2 +- include/asm-ppc/thread_info.h | 3 ++- include/asm-ppc64/thread_info.h | 2 +- include/asm-s390/thread_info.h | 2 +- include/asm-sh/thread_info.h | 2 +- include/asm-sh64/thread_info.h | 2 +- include/asm-sparc/thread_info.h | 4 ++-- include/asm-sparc64/thread_info.h | 2 +- include/asm-um/thread_info.h | 2 +- include/asm-v850/thread_info.h | 3 ++- include/asm-x86_64/thread_info.h | 2 +- 22 files changed, 25 insertions(+), 23 deletions(-) diff --git a/include/asm-arm/thread_info.h b/include/asm-arm/thread_info.h index 66c585c50cf9..8252a4cd860f 100644 --- a/include/asm-arm/thread_info.h +++ b/include/asm-arm/thread_info.h @@ -49,7 +49,7 @@ struct cpu_context_save { */ struct thread_info { unsigned long flags; /* low level flags */ - __s32 preempt_count; /* 0 => preemptable, <0 => bug */ + int preempt_count; /* 0 => preemptable, <0 => bug */ mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ diff --git a/include/asm-arm26/thread_info.h b/include/asm-arm26/thread_info.h index 50f41b50268a..aff3e5699c64 100644 --- a/include/asm-arm26/thread_info.h +++ b/include/asm-arm26/thread_info.h @@ -44,7 +44,7 @@ struct cpu_context_save { */ struct thread_info { unsigned long flags; /* low level flags */ - __s32 preempt_count; /* 0 => preemptable, <0 => bug */ + int preempt_count; /* 0 => preemptable, <0 => bug */ mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ diff --git a/include/asm-cris/thread_info.h b/include/asm-cris/thread_info.h index 53193feb0826..5ba4b7865cc5 100644 --- a/include/asm-cris/thread_info.h +++ b/include/asm-cris/thread_info.h @@ -31,7 +31,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user-thead diff --git a/include/asm-frv/thread_info.h b/include/asm-frv/thread_info.h index b80a97f50af6..c8cba7836f0d 100644 --- a/include/asm-frv/thread_info.h +++ b/include/asm-frv/thread_info.h @@ -33,7 +33,7 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long status; /* thread-synchronous flags */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user-thead diff --git a/include/asm-h8300/thread_info.h b/include/asm-h8300/thread_info.h index b07c9344776f..bfcc755c3bb1 100644 --- a/include/asm-h8300/thread_info.h +++ b/include/asm-h8300/thread_info.h @@ -23,7 +23,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ int 
cpu; /* cpu we're on */ - int preempt_count; /* 0 => preemptable, <0 => BUG*/ + int preempt_count; /* 0 => preemptable, <0 => BUG */ struct restart_block restart_block; }; diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h index 2cd57271801d..95add81237ea 100644 --- a/include/asm-i386/thread_info.h +++ b/include/asm-i386/thread_info.h @@ -31,7 +31,7 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long status; /* thread-synchronous flags */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: diff --git a/include/asm-ia64/thread_info.h b/include/asm-ia64/thread_info.h index 8d5b7e77028c..7dc8951708a3 100644 --- a/include/asm-ia64/thread_info.h +++ b/include/asm-ia64/thread_info.h @@ -25,7 +25,7 @@ struct thread_info { __u32 flags; /* thread_info flags (see TIF_*) */ __u32 cpu; /* current CPU */ mm_segment_t addr_limit; /* user-level address space limit */ - __s32 preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ + int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ struct restart_block restart_block; struct { int signo; diff --git a/include/asm-m32r/thread_info.h b/include/asm-m32r/thread_info.h index 9f3a0fcf6e2b..7a6be7727a92 100644 --- a/include/asm-m32r/thread_info.h +++ b/include/asm-m32r/thread_info.h @@ -28,7 +28,7 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long status; /* thread-synchronous flags */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user-thread diff --git a/include/asm-m68k/thread_info.h b/include/asm-m68k/thread_info.h index 5f58939c59db..2aed24f6fd2e 100644 --- a/include/asm-m68k/thread_info.h +++ b/include/asm-m68k/thread_info.h @@ -8,7 +8,7 @@ struct thread_info { struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ __u32 cpu; /* should always be 0 on m68k */ struct restart_block restart_block; diff --git a/include/asm-m68knommu/thread_info.h b/include/asm-m68knommu/thread_info.h index c8153b7c1f5c..7b9a3fa3af5d 100644 --- a/include/asm-m68knommu/thread_info.h +++ b/include/asm-m68knommu/thread_info.h @@ -36,7 +36,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ int cpu; /* cpu we're on */ - int preempt_count; /* 0 => preemptable, <0 => BUG*/ + int preempt_count; /* 0 => preemptable, <0 => BUG */ struct restart_block restart_block; }; diff --git a/include/asm-mips/thread_info.h b/include/asm-mips/thread_info.h index 768900305e2f..42fcd6f2c206 100644 --- a/include/asm-mips/thread_info.h +++ b/include/asm-mips/thread_info.h @@ -27,7 +27,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user-thead diff --git a/include/asm-parisc/thread_info.h b/include/asm-parisc/thread_info.h index fe9b7f8ae4c6..57bbb76cb6c1 100644 --- 
a/include/asm-parisc/thread_info.h +++ b/include/asm-parisc/thread_info.h @@ -12,7 +12,7 @@ struct thread_info { unsigned long flags; /* thread_info flags (see TIF_*) */ mm_segment_t addr_limit; /* user-level address space limit */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ + int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ struct restart_block restart_block; }; diff --git a/include/asm-ppc/thread_info.h b/include/asm-ppc/thread_info.h index e3b5284a6f91..27903db42efc 100644 --- a/include/asm-ppc/thread_info.h +++ b/include/asm-ppc/thread_info.h @@ -20,7 +20,8 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long local_flags; /* non-racy flags */ int cpu; /* cpu we're on */ - int preempt_count; + int preempt_count; /* 0 => preemptable, + <0 => BUG */ struct restart_block restart_block; }; diff --git a/include/asm-ppc64/thread_info.h b/include/asm-ppc64/thread_info.h index 48b7900e90ec..0494df6fca74 100644 --- a/include/asm-ppc64/thread_info.h +++ b/include/asm-ppc64/thread_info.h @@ -24,7 +24,7 @@ struct thread_info { struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ int cpu; /* cpu we're on */ - int preempt_count; + int preempt_count; /* 0 => preemptable, <0 => BUG */ struct restart_block restart_block; /* set by force_successful_syscall_return */ unsigned char syscall_noerror; diff --git a/include/asm-s390/thread_info.h b/include/asm-s390/thread_info.h index aade85c53a63..fe101d41e849 100644 --- a/include/asm-s390/thread_info.h +++ b/include/asm-s390/thread_info.h @@ -50,7 +50,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ unsigned int cpu; /* current CPU */ - unsigned int preempt_count; /* 0 => preemptable */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ struct restart_block restart_block; }; diff --git a/include/asm-sh/thread_info.h b/include/asm-sh/thread_info.h index 4bbbd9f3c37e..46080cefaff8 100644 --- a/include/asm-sh/thread_info.h +++ b/include/asm-sh/thread_info.h @@ -20,7 +20,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ __u32 flags; /* low level flags */ __u32 cpu; - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ struct restart_block restart_block; __u8 supervisor_stack[0]; }; diff --git a/include/asm-sh64/thread_info.h b/include/asm-sh64/thread_info.h index 8a32d6bd0b79..10f024c6a2e3 100644 --- a/include/asm-sh64/thread_info.h +++ b/include/asm-sh64/thread_info.h @@ -22,7 +22,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ /* Put the 4 32-bit fields together to make asm offsetting easier. 
*/ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ __u16 cpu; mm_segment_t addr_limit; diff --git a/include/asm-sparc/thread_info.h b/include/asm-sparc/thread_info.h index 104f03c55416..ff6ccb3d24c6 100644 --- a/include/asm-sparc/thread_info.h +++ b/include/asm-sparc/thread_info.h @@ -30,9 +30,9 @@ struct thread_info { struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ - int cpu; /* cpu we're on */ - int preempt_count; + int preempt_count; /* 0 => preemptable, + <0 => BUG */ int softirq_count; int hardirq_count; diff --git a/include/asm-sparc64/thread_info.h b/include/asm-sparc64/thread_info.h index 517caaba1c87..0cd652956929 100644 --- a/include/asm-sparc64/thread_info.h +++ b/include/asm-sparc64/thread_info.h @@ -46,7 +46,7 @@ struct thread_info { unsigned long fault_address; struct pt_regs *kregs; struct exec_domain *exec_domain; - int preempt_count; + int preempt_count; /* 0 => preemptable, <0 => BUG */ int __pad; unsigned long *utraps; diff --git a/include/asm-um/thread_info.h b/include/asm-um/thread_info.h index 1feaaf148ef1..97267f059ef5 100644 --- a/include/asm-um/thread_info.h +++ b/include/asm-um/thread_info.h @@ -17,7 +17,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0 => preemptable, + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user diff --git a/include/asm-v850/thread_info.h b/include/asm-v850/thread_info.h index e2ef44593752..e4cfad94a553 100644 --- a/include/asm-v850/thread_info.h +++ b/include/asm-v850/thread_info.h @@ -30,7 +30,8 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ int cpu; /* cpu we're on */ - int preempt_count; + int preempt_count; /* 0 => preemptable, + <0 => BUG */ struct restart_block restart_block; }; diff --git a/include/asm-x86_64/thread_info.h b/include/asm-x86_64/thread_info.h index f4b3b249639c..08eb6e4f3737 100644 --- a/include/asm-x86_64/thread_info.h +++ b/include/asm-x86_64/thread_info.h @@ -29,7 +29,7 @@ struct thread_info { __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ - int preempt_count; + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; struct restart_block restart_block; From be5b4fbd017d12e0d09ea0528a5839ce2ed2c8c8 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 23 Jun 2005 00:09:09 -0700 Subject: [PATCH 111/199] [PATCH] preempt_count is int - remove cast and don't assign to unsigned type In kernel/sched.c the return value from preempt_count() is cast to an int. That made sense when preempt_count was defined as different types on different archs, but now that it is an int on all archs the cast is not needed and should go away. The patch removes the cast. In kernel/timer.c the return value from preempt_count() is assigned to a variable of type u32 and then that unsigned value is later compared to preempt_count(). Since preempt_count() returns an int, an int is what should be used to store its return value. Storing the result in an unsigned 32bit integer made a tiny bit of sense back when preempt_count was different types on different archs, but no more - let's not play signed vs unsigned comparison games when we don't have to.
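For instance, a mixed comparison can quietly change meaning (again a stand-alone sketch, not the kernel code itself):

	#include <stdio.h>

	int main(void)
	{
		unsigned int saved = -1;	/* think: an underflowed count */
		int now = 0;

		/* 'now' is converted to unsigned by the usual arithmetic
		 * conversions, so this compares 0 < 0xffffffff and is taken,
		 * the opposite of the signed comparison 0 < -1 */
		if (now < saved)
			printf("comparison went unsigned\n");
		return 0;
	}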
The patch modifies the code to use an int to hold the value. While I was around that bit of code I also made two changes to a nearby (related) printk() - I modified it to specify the loglevel explicitly and also broke the line into a few pieces to avoid it being longer than 80 chars and clarified the text a bit. Signed-off-by: Jesper Juhl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- kernel/timer.c | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index deca041fc364..6ee4515d5a20 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2576,7 +2576,7 @@ void fastcall add_preempt_count(int val) /* * Underflow? */ - BUG_ON(((int)preempt_count() < 0)); + BUG_ON((preempt_count() < 0)); preempt_count() += val; /* * Spinlock count overflowing soon? diff --git a/kernel/timer.c b/kernel/timer.c index 1f986c16d89f..51ff917c9590 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -489,10 +489,14 @@ static inline void __run_timers(tvec_base_t *base) detach_timer(timer, 1); spin_unlock_irq(&base->t_base.lock); { - u32 preempt_count = preempt_count(); + int preempt_count = preempt_count(); fn(data); if (preempt_count != preempt_count()) { - printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count()); + printk(KERN_WARNING "huh, entered %p " + "with preempt_count %08x, exited" + " with %08x?\n", + fn, preempt_count, + preempt_count()); BUG(); } } From 3bc1ee3e8f1c05c0f64a479c6d56eb34a6190599 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 23 Jun 2005 00:09:09 -0700 Subject: [PATCH 112/199] [PATCH] remove redundant vm_flags clearing from madvise.c This patch removes redundant VM_ClearReadHint from mm/madvise.c which was left there by Prasanna's patch. Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/madvise.c b/mm/madvise.c index e3108054733c..54a5d3bc55d5 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -65,7 +65,6 @@ static long madvise_behavior(struct vm_area_struct * vma, /* * vm_flags is protected by the mmap_sem held in write mode.
*/ - VM_ClearReadHint(vma); vma->vm_flags = new_flags; out: From ac20427ef6aa63da663bdc88b71d16f7394f5e23 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 23 Jun 2005 00:09:11 -0700 Subject: [PATCH 113/199] [PATCH] add check to /proc/devices read routines Patch to add check to get_chrdev_list and get_blkdev_list to prevent reads of /proc/devices from spilling over the provided page if more than 4096 bytes of string data are generated from all the registered character and block devices in a system. Signed-off-by: Neil Horman Cc: Christoph Hellwig Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/genhd.c | 12 ++++++++++-- fs/char_dev.c | 13 ++++++++++++- fs/proc/proc_misc.c | 2 +- include/linux/genhd.h | 2 +- 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/drivers/block/genhd.c b/drivers/block/genhd.c index 43805e4d31e9..47fd3659a061 100644 --- a/drivers/block/genhd.c +++ b/drivers/block/genhd.c @@ -40,7 +40,7 @@ static inline int major_to_index(int major) #ifdef CONFIG_PROC_FS /* get block device names in somewhat random order */ -int get_blkdev_list(char *p) +int get_blkdev_list(char *p, int used) { struct blk_major_name *n; int i, len; @@ -49,10 +49,18 @@ int get_blkdev_list(char *p) down(&block_subsys_sem); for (i = 0; i < ARRAY_SIZE(major_names); i++) { - for (n = major_names[i]; n; n = n->next) + for (n = major_names[i]; n; n = n->next) { + /* + * If the current string plus the 5 extra characters + * in the line would run us off the page, then we're done + */ + if ((len + used + strlen(n->name) + 5) >= PAGE_SIZE) + goto page_full; len += sprintf(p+len, "%3d %s\n", n->major, n->name); + } } +page_full: up(&block_subsys_sem); return len; diff --git a/fs/char_dev.c b/fs/char_dev.c index c1e3537909fc..e82aac9cc2f5 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -56,10 +56,21 @@ int get_chrdev_list(char *page) down(&chrdevs_lock); for (i = 0; i < ARRAY_SIZE(chrdevs) ; i++) { - for (cd = chrdevs[i]; cd; cd = cd->next) + for (cd = chrdevs[i]; cd; cd = cd->next) { + /* + * if the current name, plus the 5 extra characters + * in the device line for this entry + * would run us off the page, we're done + */ + if ((len+strlen(cd->name) + 5) >= PAGE_SIZE) + goto page_full; + + len += sprintf(page+len, "%3d %s\n", cd->major, cd->name); + } } +page_full: up(&chrdevs_lock); return len; diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 63a9fbf1ac51..94b570ad037d 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -451,7 +451,7 @@ static int devices_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { int len = get_chrdev_list(page); - len += get_blkdev_list(page+len); + len += get_blkdev_list(page+len, len); return proc_calc_metrics(page, start, off, count, eof, len); } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index af26dc718ef6..01796c41c951 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -224,7 +224,7 @@ static inline void free_disk_stats(struct gendisk *disk) extern void disk_round_stats(struct gendisk *disk); /* drivers/block/genhd.c */ -extern int get_blkdev_list(char *); +extern int get_blkdev_list(char *, int); extern void add_disk(struct gendisk *disk); extern void del_gendisk(struct gendisk *gp); extern void unlink_gendisk(struct gendisk *gp); From 5f45f1a78fbac3cc859ec10c5366e97d20d40fa2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 Jun 2005 00:09:12 -0700 Subject: [PATCH 114/199] [PATCH] remove duplicate get_dentry functions in various
places Various filesystem drivers have grown a get_dentry() function that's a duplicate of lookup_one_len, except that it doesn't take a maximum length argument and doesn't check for \0 or / in the passed in filename. Switch all these places to use lookup_one_len. Signed-off-by: Christoph Hellwig Cc: Greg KH Cc: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/usb/core/inode.c | 13 +------------ fs/debugfs/inode.c | 12 +----------- fs/sysfs/dir.c | 5 +++-- fs/sysfs/file.c | 5 +++-- fs/sysfs/group.c | 4 +++- fs/sysfs/inode.c | 10 ---------- fs/sysfs/sysfs.h | 1 - kernel/cpuset.c | 8 +------- 8 files changed, 12 insertions(+), 46 deletions(-) diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c index f9f9561c6bad..c3e3a95d3804 100644 --- a/drivers/usb/core/inode.c +++ b/drivers/usb/core/inode.c @@ -453,17 +453,6 @@ static int usbfs_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static struct dentry * get_dentry(struct dentry *parent, const char *name) -{ - struct qstr qstr; - - qstr.name = name; - qstr.len = strlen(name); - qstr.hash = full_name_hash(name,qstr.len); - return lookup_hash(&qstr,parent); -} - - /* * fs_create_by_name - create a file, given a name * @name: name of file @@ -496,7 +485,7 @@ static int fs_create_by_name (const char *name, mode_t mode, *dentry = NULL; down(&parent->d_inode->i_sem); - *dentry = get_dentry (parent, name); + *dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry)) { if ((mode & S_IFMT) == S_IFDIR) error = usbfs_mkdir (parent->d_inode, *dentry, mode); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index b529786699e7..a86ac4aeaedb 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -110,16 +110,6 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent) return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); } -static struct dentry * get_dentry(struct dentry *parent, const char *name) -{ - struct qstr qstr; - - qstr.name = name; - qstr.len = strlen(name); - qstr.hash = full_name_hash(name,qstr.len); - return lookup_hash(&qstr,parent); -} - static struct super_block *debug_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) @@ -157,7 +147,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode, *dentry = NULL; down(&parent->d_inode->i_sem); - *dentry = get_dentry (parent, name); + *dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry)) { if ((mode & S_IFMT) == S_IFDIR) error = debugfs_mkdir(parent->d_inode, *dentry, mode); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 37d7a6875d86..59734ba1ee60 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "sysfs.h" DECLARE_RWSEM(sysfs_rename_sem); @@ -99,7 +100,7 @@ static int create_dir(struct kobject * k, struct dentry * p, umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; down(&p->d_inode->i_sem); - *d = sysfs_get_dentry(p,n); + *d = lookup_one_len(n, p, strlen(n)); if (!IS_ERR(*d)) { error = sysfs_make_dirent(p->d_fsdata, *d, k, mode, SYSFS_DIR); if (!error) { @@ -315,7 +316,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name) down(&parent->d_inode->i_sem); - new_dentry = sysfs_get_dentry(parent, new_name); + new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); if (!IS_ERR(new_dentry)) { if (!new_dentry->d_inode) { error = kobject_set_name(kobj, "%s", new_name); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 
849aac115460..e9cfa39f4099 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -400,7 +401,7 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr) int res = -ENOENT; down(&dir->d_inode->i_sem); - victim = sysfs_get_dentry(dir, attr->name); + victim = lookup_one_len(attr->name, dir, strlen(attr->name)); if (!IS_ERR(victim)) { /* make sure dentry is really there */ if (victim->d_inode && @@ -443,7 +444,7 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) int res = -ENOENT; down(&dir->d_inode->i_sem); - victim = sysfs_get_dentry(dir, attr->name); + victim = lookup_one_len(attr->name, dir, strlen(attr->name)); if (!IS_ERR(victim)) { if (victim->d_inode && (victim->d_parent->d_inode == dir->d_inode)) { diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index f11ac5ea7021..122145b0895c 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "sysfs.h" @@ -68,7 +69,8 @@ void sysfs_remove_group(struct kobject * kobj, struct dentry * dir; if (grp->name) - dir = sysfs_get_dentry(kobj->dentry,grp->name); + dir = lookup_one_len(grp->name, kobj->dentry, + strlen(grp->name)); else dir = dget(kobj->dentry); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 565cac1d4200..8de13bafaa76 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -166,16 +166,6 @@ int sysfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *)) return error; } -struct dentry * sysfs_get_dentry(struct dentry * parent, const char * name) -{ - struct qstr qstr; - - qstr.name = name; - qstr.len = strlen(name); - qstr.hash = full_name_hash(name,qstr.len); - return lookup_hash(&qstr,parent); -} - /* * Get the name for corresponding element represented by the given sysfs_dirent */ diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 29da6f5f07c8..3f8953e0e5d0 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -7,7 +7,6 @@ extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *)); extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *, umode_t, int); -extern struct dentry * sysfs_get_dentry(struct dentry *, const char *); extern int sysfs_add_file(struct dentry *, const struct attribute *, int); extern void sysfs_hash_and_remove(struct dentry * dir, const char * name); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 00e8f2575512..79dd929f4084 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -228,13 +228,7 @@ static struct dentry_operations cpuset_dops = { static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name) { - struct qstr qstr; - struct dentry *d; - - qstr.name = name; - qstr.len = strlen(name); - qstr.hash = full_name_hash(name, qstr.len); - d = lookup_hash(&qstr, parent); + struct dentry *d = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(d)) d->d_op = &cpuset_dops; return d; From df164db5fd16888ddbe2a63a47b2f6dda9a428b5 Mon Sep 17 00:00:00 2001 From: Alexander Nyberg Date: Thu, 23 Jun 2005 00:09:13 -0700 Subject: [PATCH 115/199] [PATCH] avoid recursive oopses Prevent recursive faults in do_exit() by leaving the task alone and waiting for reboot. This may allow a more graceful shutdown and possibly save the original oops.
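The failure chain being defended against, and the parking idiom the fix uses, look roughly like this (an illustrative sketch; the real guard is in the hunk below):

	/* hypothetical chain:
	 *
	 *   do_exit()
	 *     -> some teardown step oopses
	 *       -> die() -> do_exit() again, PF_EXITING already set
	 *
	 * on the second entry the task parks itself instead of recursing: */
	set_current_state(TASK_UNINTERRUPTIBLE);
	schedule();		/* never woken, preserving state for inspection */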
Signed-off-by: Alexander Nyberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/exit.c b/kernel/exit.c index 2ef2ad540201..c2bdf6fb61a5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -793,6 +793,17 @@ fastcall NORET_TYPE void do_exit(long code) ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); } + /* + * We're taking recursive faults here in do_exit. Safest is to just + * leave this task alone and wait for reboot. + */ + if (unlikely(tsk->flags & PF_EXITING)) { + printk(KERN_ALERT + "Fixing recursive fault but reboot is needed!\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + } + tsk->flags |= PF_EXITING; /* From 84de856ed30c568c2bb7b9ac0679772bd2737d9b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 Jun 2005 00:09:16 -0700 Subject: [PATCH 116/199] [PATCH] quota: consolidate code surrounding vfs_quota_on_mount Move some code duplicated in both callers into vfs_quota_on_mount Signed-off-by: Christoph Hellwig Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dquot.c | 23 +++++++++++++++++++---- fs/ext3/super.c | 18 ++---------------- fs/reiserfs/super.c | 21 +++------------------ include/linux/quotaops.h | 3 ++- 4 files changed, 26 insertions(+), 39 deletions(-) diff --git a/fs/dquot.c b/fs/dquot.c index 3995ce7907cc..343c03655619 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -1519,14 +1519,29 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path) * This function is used when filesystem needs to initialize quotas * during mount time. */ -int vfs_quota_on_mount(int type, int format_id, struct dentry *dentry) +int vfs_quota_on_mount(struct super_block *sb, char *qf_name, + int format_id, int type) { + struct qstr name = {.name = qf_name, .hash = 0, .len = strlen(qf_name)}; + struct dentry *dentry; int error; + dentry = lookup_hash(&name, sb->s_root); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + error = security_quota_on(dentry); - if (error) - return error; - return vfs_quota_on_inode(dentry->d_inode, type, format_id); + if (!error) + error = vfs_quota_on_inode(dentry->d_inode, type, format_id); + + /* + * Now invalidate and put the dentry - quota got its own reference + * to inode and dentry has at least wrong hash so we had better + * throw it away.
+ */ + d_invalidate(dentry); + dput(dentry); + return error; } /* Generic routine for getting common part of quota structure */ diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 981ccb233ef5..9630fbfdc24a 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2348,22 +2348,8 @@ static int ext3_write_info(struct super_block *sb, int type) */ static int ext3_quota_on_mount(struct super_block *sb, int type) { - int err; - struct dentry *dentry; - struct qstr name = { .name = EXT3_SB(sb)->s_qf_names[type], - .hash = 0, - .len = strlen(EXT3_SB(sb)->s_qf_names[type])}; - - dentry = lookup_hash(&name, sb->s_root); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - err = vfs_quota_on_mount(type, EXT3_SB(sb)->s_jquota_fmt, dentry); - /* Now invalidate and put the dentry - quota got its own reference - * to inode and dentry has at least wrong hash so we had better - * throw it away */ - d_invalidate(dentry); - dput(dentry); - return err; + return vfs_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], + EXT3_SB(sb)->s_jquota_fmt, type); } /* diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index b35b87744983..aae0779ed5b4 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1932,27 +1932,12 @@ static int reiserfs_write_info(struct super_block *sb, int type) } /* - * Turn on quotas during mount time - we need to find - * the quota file and such... + * Turn on quotas during mount time - we need to find the quota file and such... */ static int reiserfs_quota_on_mount(struct super_block *sb, int type) { - int err; - struct dentry *dentry; - struct qstr name = { .name = REISERFS_SB(sb)->s_qf_names[type], - .hash = 0, - .len = strlen(REISERFS_SB(sb)->s_qf_names[type])}; - - dentry = lookup_hash(&name, sb->s_root); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - err = vfs_quota_on_mount(type, REISERFS_SB(sb)->s_jquota_fmt, dentry); - /* Now invalidate and put the dentry - quota got its own reference - * to inode and dentry has at least wrong hash so we had better - * throw it away */ - d_invalidate(dentry); - dput(dentry); - return err; + return vfs_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type], + REISERFS_SB(sb)->s_jquota_fmt, type); } /* diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index e57baa85e744..d211507ab246 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -39,7 +39,8 @@ extern int dquot_commit_info(struct super_block *sb, int type); extern int dquot_mark_dquot_dirty(struct dquot *dquot); extern int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path); -extern int vfs_quota_on_mount(int type, int format_id, struct dentry *dentry); +extern int vfs_quota_on_mount(struct super_block *sb, char *qf_name, + int format_id, int type); extern int vfs_quota_off(struct super_block *sb, int type); #define vfs_quota_off_mount(sb, type) vfs_quota_off(sb, type) extern int vfs_quota_sync(struct super_block *sb, int type); From 2fa389c5eb8c97d621653184d2adf5fdbd4a3167 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 Jun 2005 00:09:16 -0700 Subject: [PATCH 117/199] [PATCH] quota: sanitize dentry handling in vfs_quota_on_mount Use lookup_one_len instead of opencoding a simplified lookup using lookup_hash with a fake hash. Also there's no need anymore for the d_invalidate as we have a completely valid dentry now. 
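For context, lookup_one_len() does essentially what the removed open-coded helpers did, plus the validation they skipped (a paraphrased sketch, not the exact VFS source of the time):

	/* roughly what lookup_one_len(name, base, len) amounts to */
	struct qstr this = { .name = name, .len = len };

	if (!len || memchr(name, '/', len) || memchr(name, '\0', len))
		return ERR_PTR(-EACCES);	/* checks the old helpers lacked */
	this.hash = full_name_hash(name, len);	/* a real hash, not a fake one */
	return lookup_hash(&this, base);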
Signed-off-by: Christoph Hellwig Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dquot.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/dquot.c b/fs/dquot.c index 343c03655619..37212b039a4a 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -1522,11 +1522,10 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path) int vfs_quota_on_mount(struct super_block *sb, char *qf_name, int format_id, int type) { - struct qstr name = {.name = qf_name, .hash = 0, .len = strlen(qf_name)}; struct dentry *dentry; int error; - dentry = lookup_hash(&name, sb->s_root); + dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name)); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -1534,12 +1533,6 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name, if (!error) error = vfs_quota_on_inode(dentry->d_inode, type, format_id); - /* - * Now invalidate and put the dentry - quota got its own reference - * to inode and dentry has at least wrong hash so we had better - * throw it away. - */ - d_invalidate(dentry); dput(dentry); return error; } From b94cce926b2b902b79380ccba370d6f9f2980de0 Mon Sep 17 00:00:00 2001 From: Hien Nguyen Date: Thu, 23 Jun 2005 00:09:19 -0700 Subject: [PATCH 118/199] [PATCH] kprobes: function-return probes This patch adds function-return probes to kprobes for the i386 architecture. This enables you to establish a handler to be run when a function returns. 1. API Two new functions are added to kprobes: int register_kretprobe(struct kretprobe *rp); void unregister_kretprobe(struct kretprobe *rp); 2. Registration and unregistration 2.1 Register To register a function-return probe, the user populates the following fields in a kretprobe object and calls register_kretprobe() with the kretprobe address as an argument: kp.addr - the function's address handler - this function is run after the ret instruction executes, but before control returns to the return address in the caller. maxactive - The maximum number of instances of the probed function that can be active concurrently. For example, if the function is non- recursive and is called with a spinlock or mutex held, maxactive = 1 should be enough. If the function is non-recursive and can never relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should be enough. maxactive is used to determine how many kretprobe_instance objects to allocate for this particular probed function. If maxactive <= 0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 * NR_CPUS) else maxactive=NR_CPUS) For example: struct kretprobe rp; rp.kp.addr = /* entrypoint address */ rp.handler = /*return probe handler */ rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */ register_kretprobe(&rp); The following field may also be of interest: nmissed - Initialized to zero when the function-return probe is registered, and incremented every time the probed function is entered but there is no kretprobe_instance object available for establishing the function-return probe (i.e., because maxactive was set too low). 2.2 Unregister To unregister a function-return probe, the user calls unregister_kretprobe() with the same kretprobe object as registered previously. If a probed function is running when the return probe is unregistered, the function will return as expected, but the handler won't be run. 3. Limitations 3.1 This patch supports only the i386 architecture, but patches for x86_64 and ppc64 are anticipated soon.
3.2 Return probes operate by replacing the return address in the stack (or in a known register, such as the lr register for ppc). This may cause __builtin_return_address(0), when invoked from the return-probed function, to return the address of the return-probes trampoline. 3.3 This implementation uses the "Multiprobes at an address" feature in 2.6.12-rc3-mm3. 3.4 Due to a limitation in multi-probes, you cannot currently establish a return probe and a jprobe on the same function. A patch to remove this limitation is being tested. This feature is required by SystemTap (http://sourceware.org/systemtap), and reflects ideas contributed by several SystemTap developers, including Will Cohen and Ananth Mavinakayanahalli. Signed-off-by: Hien Nguyen Signed-off-by: Prasanna S Panchamukhi Signed-off-by: Frederik Deweerdt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/kprobes.c | 102 +++++++++++++++++- arch/i386/kernel/process.c | 15 +++ include/asm-i386/kprobes.h | 3 + include/linux/kprobes.h | 90 +++++++++++++++- kernel/kprobes.c | 213 ++++++++++++++++++++++++++++++++++++- 5 files changed, 415 insertions(+), 8 deletions(-) diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index 59ff9b455069..048f754bbe23 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -23,6 +23,9 @@ * Rusty Russell). * 2004-July Suparna Bhattacharya added jumper probes * interface to access function arguments. + * 2005-May Hien Nguyen , Jim Keniston + * and Prasanna S Panchamukhi + * added function-return probes. */ #include @@ -91,6 +94,53 @@ static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->eip = (unsigned long)&p->ainsn.insn; } +struct task_struct *arch_get_kprobe_task(void *ptr) +{ + return ((struct thread_info *) (((unsigned long) ptr) & + (~(THREAD_SIZE -1))))->task; +} + +void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) +{ + unsigned long *sara = (unsigned long *)&regs->esp; + struct kretprobe_instance *ri; + static void *orig_ret_addr; + + /* + * Save the return address when the return probe hits + * the first time, and use it to populate the (kretprobe + * instance)->ret_addr for subsequent return probes at + * the same address since stack address would have + * the kretprobe_trampoline by then. + */ + if (((void*) *sara) != kretprobe_trampoline) + orig_ret_addr = (void*) *sara; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->stack_addr = sara; + ri->ret_addr = orig_ret_addr; + add_rp_inst(ri); + /* Replace the return addr with trampoline addr */ + *sara = (unsigned long) &kretprobe_trampoline; + } else { + rp->nmissed++; + } +} + +void arch_kprobe_flush_task(struct task_struct *tk, spinlock_t *kp_lock) +{ + unsigned long flags = 0; + struct kretprobe_instance *ri; + spin_lock_irqsave(kp_lock, flags); + while ((ri = get_rp_inst_tsk(tk)) != NULL) { + *((unsigned long *)(ri->stack_addr)) = + (unsigned long) ri->ret_addr; + recycle_rp_inst(ri); + } + spin_unlock_irqrestore(kp_lock, flags); +} + /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they * remain disabled thorough out this function. @@ -183,6 +233,55 @@ static int kprobe_handler(struct pt_regs *regs) return ret; } +/* + * For function-return probes, init_kprobes() establishes a probepoint + * here. When a retprobed function returns, this probe is hit and + * trampoline_probe_handler() runs, calling the kretprobe's handler.
+ */ void kretprobe_trampoline_holder(void) { asm volatile ( ".global kretprobe_trampoline\n" "kretprobe_trampoline: \n" "nop\n"); } + +/* + * Called when we hit the probe point at kretprobe_trampoline + */ +int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct task_struct *tsk; + struct kretprobe_instance *ri; + struct hlist_head *head; + struct hlist_node *node; + unsigned long *sara = ((unsigned long *) &regs->esp) - 1; + + tsk = arch_get_kprobe_task(sara); + head = kretprobe_inst_table_head(tsk); + + hlist_for_each_entry(ri, node, head, hlist) { + if (ri->stack_addr == sara && ri->rp) { + if (ri->rp->handler) + ri->rp->handler(ri, regs); + } + } + return 0; +} + +void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) +{ + struct kretprobe_instance *ri; + /* RA already popped */ + unsigned long *sara = ((unsigned long *)&regs->esp) - 1; + + while ((ri = get_rp_inst(sara))) { + regs->eip = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri); + } + regs->eflags &= ~TF_MASK; +} + /* * Called after single-stepping. p->addr is the address of the * instruction whose first byte has been replaced by the "int 3" @@ -266,7 +365,8 @@ static inline int post_kprobe_handler(struct pt_regs *regs) if (current_kprobe->post_handler) current_kprobe->post_handler(current_kprobe, regs, 0); - resume_execution(current_kprobe, regs); + if (current_kprobe->post_handler != trampoline_post_handler) + resume_execution(current_kprobe, regs); regs->eflags |= kprobe_saved_eflags; unlock_kprobes(); diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index be3efba7caf7..aea2ce1145df 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -339,6 +340,13 @@ void exit_thread(void) struct task_struct *tsk = current; struct thread_struct *t = &tsk->thread; + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(tsk); + /* The process may have allocated an io port bitmap... nuke it. */ if (unlikely(NULL != t->io_bitmap_ptr)) { int cpu = get_cpu(); @@ -362,6 +370,13 @@ void flush_thread(void) { struct task_struct *tsk = current; + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(tsk); + memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* diff --git a/include/asm-i386/kprobes.h b/include/asm-i386/kprobes.h index 4092f68d123a..8b6d3a90cd78 100644 --- a/include/asm-i386/kprobes.h +++ b/include/asm-i386/kprobes.h @@ -39,6 +39,9 @@ typedef u8 kprobe_opcode_t; : (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR))) #define JPROBE_ENTRY(pentry) (kprobe_opcode_t *)pentry +#define ARCH_SUPPORTS_KRETPROBES + +void kretprobe_trampoline(void); /* Architecture specific copy of original instruction*/ struct arch_specific_insn { diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 99ddba5a4e00..fba39f87efec 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -25,21 +25,31 @@ * Rusty Russell).
* 2004-July Suparna Bhattacharya added jumper probes * interface to access function arguments. + * 2005-May Hien Nguyen and Jim Keniston + * and Prasanna S Panchamukhi + * added function-return probes. */ #include #include #include #include +#include + #include struct kprobe; struct pt_regs; +struct kretprobe; +struct kretprobe_instance; typedef int (*kprobe_pre_handler_t) (struct kprobe *, struct pt_regs *); typedef int (*kprobe_break_handler_t) (struct kprobe *, struct pt_regs *); typedef void (*kprobe_post_handler_t) (struct kprobe *, struct pt_regs *, unsigned long flags); typedef int (*kprobe_fault_handler_t) (struct kprobe *, struct pt_regs *, int trapnr); +typedef int (*kretprobe_handler_t) (struct kretprobe_instance *, + struct pt_regs *); + struct kprobe { struct hlist_node hlist; @@ -85,6 +95,62 @@ struct jprobe { kprobe_opcode_t *entry; /* probe handling code to jump to */ }; +#ifdef ARCH_SUPPORTS_KRETPROBES +extern int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs); +extern void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags); +extern struct task_struct *arch_get_kprobe_task(void *ptr); +extern void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs); +extern void arch_kprobe_flush_task(struct task_struct *tk, spinlock_t *kp_lock); +#else /* ARCH_SUPPORTS_KRETPROBES */ +static inline void kretprobe_trampoline(void) +{ +} +static inline int trampoline_probe_handler(struct kprobe *p, + struct pt_regs *regs) +{ + return 0; +} +static inline void trampoline_post_handler(struct kprobe *p, + struct pt_regs *regs, unsigned long flags) +{ +} +static inline void arch_prepare_kretprobe(struct kretprobe *rp, + struct pt_regs *regs) +{ +} +static inline void arch_kprobe_flush_task(struct task_struct *tk) +{ +} +#define arch_get_kprobe_task(ptr) ((struct task_struct *)NULL) +#endif /* ARCH_SUPPORTS_KRETPROBES */ +/* + * Function-return probe - + * Note: + * User needs to provide a handler function, and initialize maxactive. + * maxactive - The maximum number of instances of the probed function that + * can be active concurrently. + * nmissed - tracks the number of times the probed function's return was + * ignored, due to maxactive being too low. + * + */ +struct kretprobe { + struct kprobe kp; + kretprobe_handler_t handler; + int maxactive; + int nmissed; + struct hlist_head free_instances; + struct hlist_head used_instances; +}; + +struct kretprobe_instance { + struct hlist_node uflist; /* either on free list or used list */ + struct hlist_node hlist; + struct kretprobe *rp; + void *ret_addr; + void *stack_addr; +}; + #ifdef CONFIG_KPROBES /* Locks kprobe: irq must be disabled */ void lock_kprobes(void); @@ -104,6 +170,7 @@ extern void show_registers(struct pt_regs *regs); /* Get the kprobe at this addr (if any). 
Must have called lock_kprobes */ struct kprobe *get_kprobe(void *addr); +struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk); int register_kprobe(struct kprobe *p); void unregister_kprobe(struct kprobe *p); @@ -113,7 +180,16 @@ int register_jprobe(struct jprobe *p); void unregister_jprobe(struct jprobe *p); void jprobe_return(void); -#else +int register_kretprobe(struct kretprobe *rp); +void unregister_kretprobe(struct kretprobe *rp); + +struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp); +struct kretprobe_instance *get_rp_inst(void *sara); +struct kretprobe_instance *get_rp_inst_tsk(struct task_struct *tk); +void add_rp_inst(struct kretprobe_instance *ri); +void kprobe_flush_task(struct task_struct *tk); +void recycle_rp_inst(struct kretprobe_instance *ri); +#else /* CONFIG_KPROBES */ static inline int kprobe_running(void) { return 0; @@ -135,5 +211,15 @@ static inline void unregister_jprobe(struct jprobe *p) static inline void jprobe_return(void) { } -#endif +static inline int register_kretprobe(struct kretprobe *rp) +{ + return -ENOSYS; +} +static inline void unregister_kretprobe(struct kretprobe *rp) +{ +} +static inline void kprobe_flush_task(struct task_struct *tk) +{ +} +#endif /* CONFIG_KPROBES */ #endif /* _LINUX_KPROBES_H */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 037142b72a49..692fbf75ab49 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -27,6 +27,9 @@ * interface to access function arguments. * 2004-Sep Prasanna S Panchamukhi Changed Kprobes * exceptions notifier to be first on the priority list. + * 2005-May Hien Nguyen , Jim Keniston + * and Prasanna S Panchamukhi + * added function-return probes. */ #include #include @@ -41,6 +44,7 @@ #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; +static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; unsigned int kprobe_cpu = NR_CPUS; static DEFINE_SPINLOCK(kprobe_lock); @@ -78,7 +82,7 @@ struct kprobe *get_kprobe(void *addr) * Aggregate handlers for multiple kprobes support - these handlers * take care of invoking the individual kprobe handlers on p->list */ -int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) +static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *kp; @@ -92,8 +96,8 @@ int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) return 0; } -void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) +static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) { struct kprobe *kp; @@ -107,7 +111,8 @@ void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, return; } -int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr) +static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, + int trapnr) { /* * if we faulted "during" the execution of a user specified @@ -120,6 +125,135 @@ int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr) return 0; } +struct kprobe trampoline_p = { + .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .pre_handler = trampoline_probe_handler, + .post_handler = trampoline_post_handler +}; + +struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) +{ + struct hlist_node *node; + struct kretprobe_instance *ri; + hlist_for_each_entry(ri, node, &rp->free_instances, uflist) + return ri; + return NULL; +} + +static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) +{ + struct hlist_node *node; + 
struct kretprobe_instance *ri; + hlist_for_each_entry(ri, node, &rp->used_instances, uflist) + return ri; + return NULL; +} + +struct kretprobe_instance *get_rp_inst(void *sara) +{ + struct hlist_head *head; + struct hlist_node *node; + struct task_struct *tsk; + struct kretprobe_instance *ri; + + tsk = arch_get_kprobe_task(sara); + head = &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; + hlist_for_each_entry(ri, node, head, hlist) { + if (ri->stack_addr == sara) + return ri; + } + return NULL; +} + +void add_rp_inst(struct kretprobe_instance *ri) +{ + struct task_struct *tsk; + /* + * Remove rp inst off the free list - + * Add it back when probed function returns + */ + hlist_del(&ri->uflist); + tsk = arch_get_kprobe_task(ri->stack_addr); + /* Add rp inst onto table */ + INIT_HLIST_NODE(&ri->hlist); + hlist_add_head(&ri->hlist, + &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]); + + /* Also add this rp inst to the used list. */ + INIT_HLIST_NODE(&ri->uflist); + hlist_add_head(&ri->uflist, &ri->rp->used_instances); +} + +void recycle_rp_inst(struct kretprobe_instance *ri) +{ + /* remove rp inst off the rprobe_inst_table */ + hlist_del(&ri->hlist); + if (ri->rp) { + /* remove rp inst off the used list */ + hlist_del(&ri->uflist); + /* put rp inst back onto the free list */ + INIT_HLIST_NODE(&ri->uflist); + hlist_add_head(&ri->uflist, &ri->rp->free_instances); + } else + /* Unregistering */ + kfree(ri); +} + +struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) +{ + return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; +} + +struct kretprobe_instance *get_rp_inst_tsk(struct task_struct *tk) +{ + struct task_struct *tsk; + struct hlist_head *head; + struct hlist_node *node; + struct kretprobe_instance *ri; + + head = &kretprobe_inst_table[hash_ptr(tk, KPROBE_HASH_BITS)]; + + hlist_for_each_entry(ri, node, head, hlist) { + tsk = arch_get_kprobe_task(ri->stack_addr); + if (tsk == tk) + return ri; + } + return NULL; +} + +/* + * This function is called from do_exit or do_execv when task tk's stack is + * about to be recycled. Recycle any function-return probe instances + * associated with this task. These represent probed functions that have + * been called but may never return. + */ +void kprobe_flush_task(struct task_struct *tk) +{ + arch_kprobe_flush_task(tk, &kprobe_lock); +} + +/* + * This kprobe pre_handler is registered with every kretprobe. When probe + * hits it will set up the return probe. + */ +static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) +{ + struct kretprobe *rp = container_of(p, struct kretprobe, kp); + + /*TODO: consider to only swap the RA after the last pre_handler fired */ + arch_prepare_kretprobe(rp, regs); + return 0; +} + +static inline void free_rp_inst(struct kretprobe *rp) +{ + struct kretprobe_instance *ri; + while ((ri = get_free_rp_inst(rp)) != NULL) { + hlist_del(&ri->uflist); + kfree(ri); + } +} + /* * Fill in the required fields of the "manager kprobe". 
Replace the * earlier kprobe in the hlist with the manager kprobe @@ -257,16 +391,82 @@ void unregister_jprobe(struct jprobe *jp) unregister_kprobe(&jp->kp); } +#ifdef ARCH_SUPPORTS_KRETPROBES + +int register_kretprobe(struct kretprobe *rp) +{ + int ret = 0; + struct kretprobe_instance *inst; + int i; + + rp->kp.pre_handler = pre_handler_kretprobe; + + /* Pre-allocate memory for max kretprobe instances */ + if (rp->maxactive <= 0) { +#ifdef CONFIG_PREEMPT + rp->maxactive = max(10, 2 * NR_CPUS); +#else + rp->maxactive = NR_CPUS; +#endif + } + INIT_HLIST_HEAD(&rp->used_instances); + INIT_HLIST_HEAD(&rp->free_instances); + for (i = 0; i < rp->maxactive; i++) { + inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL); + if (inst == NULL) { + free_rp_inst(rp); + return -ENOMEM; + } + INIT_HLIST_NODE(&inst->uflist); + hlist_add_head(&inst->uflist, &rp->free_instances); + } + + rp->nmissed = 0; + /* Establish function entry probe point */ + if ((ret = register_kprobe(&rp->kp)) != 0) + free_rp_inst(rp); + return ret; +} + +#else /* ARCH_SUPPORTS_KRETPROBES */ + +int register_kretprobe(struct kretprobe *rp) +{ + return -ENOSYS; +} + +#endif /* ARCH_SUPPORTS_KRETPROBES */ + +void unregister_kretprobe(struct kretprobe *rp) +{ + unsigned long flags; + struct kretprobe_instance *ri; + + unregister_kprobe(&rp->kp); + /* No race here */ + spin_lock_irqsave(&kprobe_lock, flags); + free_rp_inst(rp); + while ((ri = get_used_rp_inst(rp)) != NULL) { + ri->rp = NULL; + hlist_del(&ri->uflist); + } + spin_unlock_irqrestore(&kprobe_lock, flags); +} + static int __init init_kprobes(void) { int i, err = 0; /* FIXME allocate the probe table, currently defined statically */ /* initialize all list heads */ - for (i = 0; i < KPROBE_TABLE_SIZE; i++) + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { INIT_HLIST_HEAD(&kprobe_table[i]); + INIT_HLIST_HEAD(&kretprobe_inst_table[i]); + } err = register_die_notifier(&kprobe_exceptions_nb); + /* Register the trampoline probe for return probe */ + register_kprobe(&trampoline_p); return err; } @@ -277,3 +477,6 @@ EXPORT_SYMBOL_GPL(unregister_kprobe); EXPORT_SYMBOL_GPL(register_jprobe); EXPORT_SYMBOL_GPL(unregister_jprobe); EXPORT_SYMBOL_GPL(jprobe_return); +EXPORT_SYMBOL_GPL(register_kretprobe); +EXPORT_SYMBOL_GPL(unregister_kretprobe); + From 73649dab0fd524cb8545a8cb83c6eaf77b107105 Mon Sep 17 00:00:00 2001 From: Rusty Lynch Date: Thu, 23 Jun 2005 00:09:23 -0700 Subject: [PATCH 119/199] [PATCH] x86_64 specific function return probes The following patch adds the x86_64 architecture specific implementation for function return probes. Function return probes is a mechanism built on top of kprobes that allows a caller to register a handler to be called when a given function exits. 
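As a sketch (assuming the x86_64 struct pt_regs layout of this tree, where the probed function's integer return value is still in regs->rax at the return site), a handler can also inspect the saved register state; the handler name and message here are illustrative only:

static int my_exit_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	/* the handler runs at the return site, so %rax holds the return value */
	printk("probed function returned %ld\n", (long)regs->rax);
	return 0;
}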
For example, to instrument the return path of sys_mkdir: static int sys_mkdir_exit(struct kretprobe_instance *i, struct pt_regs *regs) { printk("sys_mkdir exited\n"); return 0; } static struct kretprobe return_probe = { .handler = sys_mkdir_exit, }; return_probe.kp.addr = (kprobe_opcode_t *) kallsyms_lookup_name("sys_mkdir"); if (register_kretprobe(&return_probe)) { printk(KERN_DEBUG "Unable to register return probe!\n"); /* do error path */ } unregister_kretprobe(&return_probe); The way this works is that: * At system initialization time, kernel/kprobes.c installs a kprobe on a function called kretprobe_trampoline() that is implemented in arch/x86_64/kernel/kprobes.c (More on this later) * When a return probe is registered using register_kretprobe(), kernel/kprobes.c will install a kprobe on the first instruction of the targeted function with the pre handler set to arch_prepare_kretprobe() which is implemented in arch/x86_64/kernel/kprobes.c. * arch_prepare_kretprobe() will prepare a kretprobe instance that stores: - nodes for hanging this instance in an empty or free list - a pointer to the return probe - the original return address - a pointer to the stack address With all this stowed away, arch_prepare_kretprobe() then sets the return address for the targeted function to a special trampoline function called kretprobe_trampoline() implemented in arch/x86_64/kernel/kprobes.c * The kprobe completes as normal, with control passing back to the target function that executes as normal, and eventually returns to our trampoline function. * Since a kprobe was installed on kretprobe_trampoline() during system initialization, control passes back to kprobes via the architecture specific function trampoline_probe_handler() which will look up the instance in an hlist maintained by kernel/kprobes.c, and then call the handler function. * When trampoline_probe_handler() is done, the kprobes infrastructure single steps the original instruction (in this case just a nop), and then calls trampoline_post_handler(). trampoline_post_handler() then looks up the instance again, puts the instance back on the free list, and then makes a long jump back to the original return instruction. So to recap, to instrument the exit path of a function this implementation will cause four interruptions: - A breakpoint at the very beginning of the function allowing us to switch out the return address - A single step interruption to execute the original instruction that we replaced with the break instruction (normal kprobe flow) - A breakpoint in the trampoline function where our instrumented function returned to - A single step interruption to execute the original instruction that we replaced with the break instruction (normal kprobe flow) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/kprobes.c | 98 +++++++++++++++++++++++++++++++++++- arch/x86_64/kernel/process.c | 16 ++++++ include/asm-x86_64/kprobes.h | 3 ++ 3 files changed, 116 insertions(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index f77f8a0ff187..203672ca7401 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -27,6 +27,8 @@ * adapted for x86_64 * 2005-Mar Roland McGrath * Fixed to handle %rip-relative addressing mode correctly.
+ * 2005-May Rusty Lynch + * Added function return probes functionality */ #include @@ -240,6 +242,50 @@ static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->rip = (unsigned long)p->ainsn.insn; } +struct task_struct *arch_get_kprobe_task(void *ptr) +{ + return ((struct thread_info *) (((unsigned long) ptr) & + (~(THREAD_SIZE -1))))->task; +} + +void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) +{ + unsigned long *sara = (unsigned long *)regs->rsp; + struct kretprobe_instance *ri; + static void *orig_ret_addr; + + /* + * Save the return address when the return probe hits + * the first time, and use it to populate the (kretprobe + * instance)->ret_addr for subsequent return probes at + * the same address since the stack address would have + * the kretprobe_trampoline by then. + */ + if (((void*) *sara) != kretprobe_trampoline) + orig_ret_addr = (void*) *sara; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->stack_addr = sara; + ri->ret_addr = orig_ret_addr; + add_rp_inst(ri); + /* Replace the return addr with trampoline addr */ + *sara = (unsigned long) &kretprobe_trampoline; + } else { + rp->nmissed++; + } +} + +void arch_kprobe_flush_task(struct task_struct *tk) +{ + struct kretprobe_instance *ri; + while ((ri = get_rp_inst_tsk(tk)) != NULL) { + *((unsigned long *)(ri->stack_addr)) = + (unsigned long) ri->ret_addr; + recycle_rp_inst(ri); + } +} + /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they * remain disabled throughout this function. @@ -316,6 +362,55 @@ int kprobe_handler(struct pt_regs *regs) return ret; } +/* + * For function-return probes, init_kprobes() establishes a probepoint + * here. When a retprobed function returns, this probe is hit and + * trampoline_probe_handler() runs, calling the kretprobe's handler. + */ + void kretprobe_trampoline_holder(void) + { + asm volatile ( ".global kretprobe_trampoline\n" + "kretprobe_trampoline: \n" + "nop\n"); + } + +/* + * Called when we hit the probe point at kretprobe_trampoline + */ +int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct task_struct *tsk; + struct kretprobe_instance *ri; + struct hlist_head *head; + struct hlist_node *node; + unsigned long *sara = (unsigned long *)regs->rsp - 1; + + tsk = arch_get_kprobe_task(sara); + head = kretprobe_inst_table_head(tsk); + + hlist_for_each_entry(ri, node, head, hlist) { + if (ri->stack_addr == sara && ri->rp) { + if (ri->rp->handler) + ri->rp->handler(ri, regs); + } + } + return 0; +} + +void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) +{ + struct kretprobe_instance *ri; + /* RA already popped */ + unsigned long *sara = ((unsigned long *)regs->rsp) - 1; + + while ((ri = get_rp_inst(sara))) { + regs->rip = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri); + } + regs->eflags &= ~TF_MASK; +} + /* * Called after single-stepping.
p->addr is the address of the * instruction whose first byte has been replaced by the "int 3" @@ -404,7 +499,8 @@ int post_kprobe_handler(struct pt_regs *regs) if (current_kprobe->post_handler) current_kprobe->post_handler(current_kprobe, regs, 0); - resume_execution(current_kprobe, regs); + if (current_kprobe->post_handler != trampoline_post_handler) + resume_execution(current_kprobe, regs); regs->eflags |= kprobe_saved_rflags; unlock_kprobes(); diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index dce8bab4306c..e59d1f9d6163 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -293,6 +294,14 @@ void exit_thread(void) { struct task_struct *me = current; struct thread_struct *t = &me->thread; + + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(me); + if (me->thread.io_bitmap_ptr) { struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); @@ -312,6 +321,13 @@ void flush_thread(void) struct task_struct *tsk = current; struct thread_info *t = current_thread_info(); + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(tsk); + if (t->flags & _TIF_ABI_PENDING) t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); diff --git a/include/asm-x86_64/kprobes.h b/include/asm-x86_64/kprobes.h index bfea52d516f8..6d6d883fdf6d 100644 --- a/include/asm-x86_64/kprobes.h +++ b/include/asm-x86_64/kprobes.h @@ -38,6 +38,9 @@ typedef u8 kprobe_opcode_t; : (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR))) #define JPROBE_ENTRY(pentry) (kprobe_opcode_t *)pentry +#define ARCH_SUPPORTS_KRETPROBES + +void kretprobe_trampoline(void); /* Architecture specific copy of original instruction*/ struct arch_specific_insn { From 7e1048b11c5afe79aac46a42e3ccec86b8365c6d Mon Sep 17 00:00:00 2001 From: Rusty Lynch Date: Thu, 23 Jun 2005 00:09:25 -0700 Subject: [PATCH 120/199] [PATCH] Move kprobe [dis]arming into arch specific code The architecture independent code of the current kprobes implementation arms and disarms kprobes at registration time. The problem is that the code is assuming that arming and disarming is just done by a simple write of some magic value to an address. This is problematic for ia64 where our instructions look more like structures, and we cannot insert breakpoints by just doing something like: *p->addr = BREAKPOINT_INSTRUCTION; The following patch to 2.6.12-rc4-mm2 adds two new architecture dependent functions: * void arch_arm_kprobe(struct kprobe *p) * void arch_disarm_kprobe(struct kprobe *p) and then adds the new functions for each of the architectures that already implement kprobes (sparc64/ppc64/i386/x86_64). I thought arch_[dis]arm_kprobe was the most descriptive of what was really happening, but each of the architectures already had a disarm_kprobe() function that was really a "disarm and do some other clean-up items as needed when you stumble across a recursive kprobe." So... I took the liberty of changing the code that was calling disarm_kprobe() to call arch_disarm_kprobe(), and then do the cleanup in the block of code dealing with the recursive kprobe case.
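To make the split concrete, the arming sequence that register_kprobe() used to open-code (lifted from the kernel/kprobes.c hunk below):

	p->opcode = *p->addr;
	*p->addr = BREAKPOINT_INSTRUCTION;
	flush_icache_range((unsigned long) p->addr,
			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));

now collapses into a single call:

	arch_arm_kprobe(p);

leaving each architecture free to do whatever instruction surgery it needs behind that hook.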
So far this patch has been tested on i386, x86_64, and ppc64, but still needs to be tested on sparc64. Signed-off-by: Rusty Lynch Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/kprobes.c | 19 +++++++++++++---- arch/ppc64/kernel/kprobes.c | 19 +++++++++++++---- arch/sparc64/kernel/kprobes.c | 31 ++++++++++++++++++------------- arch/x86_64/kernel/kprobes.c | 26 ++++++++++++++++++-------- include/linux/kprobes.h | 2 ++ kernel/kprobes.c | 12 ++++-------- 6 files changed, 72 insertions(+), 37 deletions(-) diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index 048f754bbe23..2314d8d306fd 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -71,16 +72,25 @@ int arch_prepare_kprobe(struct kprobe *p) void arch_copy_kprobe(struct kprobe *p) { memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + p->opcode = *p->addr; } -void arch_remove_kprobe(struct kprobe *p) +void arch_arm_kprobe(struct kprobe *p) { + *p->addr = BREAKPOINT_INSTRUCTION; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); } -static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs) +void arch_disarm_kprobe(struct kprobe *p) { *p->addr = p->opcode; - regs->eip = (unsigned long)p->addr; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); +} + +void arch_remove_kprobe(struct kprobe *p) +{ } static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) @@ -177,7 +187,8 @@ static int kprobe_handler(struct pt_regs *regs) unlock_kprobes(); goto no_kprobe; } - disarm_kprobe(p, regs); + arch_disarm_kprobe(p); + regs->eip = (unsigned long)p->addr; ret = 1; } else { p = current_kprobe; diff --git a/arch/ppc64/kernel/kprobes.c b/arch/ppc64/kernel/kprobes.c index e950a2058a19..8c0920a6d03e 100644 --- a/arch/ppc64/kernel/kprobes.c +++ b/arch/ppc64/kernel/kprobes.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -61,16 +62,25 @@ int arch_prepare_kprobe(struct kprobe *p) void arch_copy_kprobe(struct kprobe *p) { memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + p->opcode = *p->addr; } -void arch_remove_kprobe(struct kprobe *p) +void arch_arm_kprobe(struct kprobe *p) { + *p->addr = BREAKPOINT_INSTRUCTION; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); } -static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs) +void arch_disarm_kprobe(struct kprobe *p) { *p->addr = p->opcode; - regs->nip = (unsigned long)p->addr; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); +} + +void arch_remove_kprobe(struct kprobe *p) +{ } static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) @@ -101,7 +111,8 @@ static inline int kprobe_handler(struct pt_regs *regs) unlock_kprobes(); goto no_kprobe; } - disarm_kprobe(p, regs); + arch_disarm_kprobe(p); + regs->nip = (unsigned long)p->addr; ret = 1; } else { p = current_kprobe; diff --git a/arch/sparc64/kernel/kprobes.c b/arch/sparc64/kernel/kprobes.c index 7066d7ba667a..d67195ba3fa2 100644 --- a/arch/sparc64/kernel/kprobes.c +++ b/arch/sparc64/kernel/kprobes.c @@ -6,7 +6,6 @@ #include #include #include - #include #include @@ -47,6 +46,19 @@ void arch_copy_kprobe(struct kprobe *p) { p->ainsn.insn[0] =
*p->addr; p->ainsn.insn[1] = BREAKPOINT_INSTRUCTION_2; + p->opcode = *p->addr; +} + +void arch_arm_kprobe(struct kprobe *p) +{ + *p->addr = BREAKPOINT_INSTRUCTION; + flushi(p->addr); +} + +void arch_disarm_kprobe(struct kprobe *p) +{ + *p->addr = p->opcode; + flushi(p->addr); } void arch_remove_kprobe(struct kprobe *p) @@ -78,17 +90,6 @@ static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) } } -static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs) -{ - *p->addr = p->opcode; - flushi(p->addr); - - regs->tpc = (unsigned long) p->addr; - regs->tnpc = current_kprobe_orig_tnpc; - regs->tstate = ((regs->tstate & ~TSTATE_PIL) | - current_kprobe_orig_tstate_pil); -} - static int kprobe_handler(struct pt_regs *regs) { struct kprobe *p; @@ -109,7 +110,11 @@ static int kprobe_handler(struct pt_regs *regs) unlock_kprobes(); goto no_kprobe; } - disarm_kprobe(p, regs); + arch_disarm_kprobe(p); + regs->tpc = (unsigned long) p->addr; + regs->tnpc = current_kprobe_orig_tnpc; + regs->tstate = ((regs->tstate & ~TSTATE_PIL) | + current_kprobe_orig_tstate_pil); ret = 1; } else { p = current_kprobe; diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index 203672ca7401..324bf57925a9 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -39,7 +39,7 @@ #include #include #include - +#include #include #include @@ -216,19 +216,28 @@ void arch_copy_kprobe(struct kprobe *p) BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ *ripdisp = disp; } + p->opcode = *p->addr; } -void arch_remove_kprobe(struct kprobe *p) +void arch_arm_kprobe(struct kprobe *p) { - up(&kprobe_mutex); - free_insn_slot(p->ainsn.insn); - down(&kprobe_mutex); + *p->addr = BREAKPOINT_INSTRUCTION; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); } -static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs) +void arch_disarm_kprobe(struct kprobe *p) { *p->addr = p->opcode; - regs->rip = (unsigned long)p->addr; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); +} + +void arch_remove_kprobe(struct kprobe *p) +{ + up(&kprobe_mutex); + free_insn_slot(p->ainsn.insn); + down(&kprobe_mutex); } static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) @@ -311,7 +320,8 @@ int kprobe_handler(struct pt_regs *regs) unlock_kprobes(); goto no_kprobe; } - disarm_kprobe(p, regs); + arch_disarm_kprobe(p); + regs->rip = (unsigned long)p->addr; ret = 1; } else { p = current_kprobe; diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index fba39f87efec..0f90466fb8b0 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -165,6 +165,8 @@ static inline int kprobe_running(void) extern int arch_prepare_kprobe(struct kprobe *p); extern void arch_copy_kprobe(struct kprobe *p); +extern void arch_arm_kprobe(struct kprobe *p); +extern void arch_disarm_kprobe(struct kprobe *p); extern void arch_remove_kprobe(struct kprobe *p); extern void show_registers(struct pt_regs *regs); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 692fbf75ab49..e8e0ae8a6e14 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -261,7 +261,7 @@ static inline void free_rp_inst(struct kretprobe *rp) static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) { ap->addr = p->addr; - ap->opcode = p->opcode; + memcpy(&ap->opcode, &p->opcode, sizeof(kprobe_opcode_t)); memcpy(&ap->ainsn, &p->ainsn, sizeof(struct arch_specific_insn)); ap->pre_handler = 
aggr_pre_handler; @@ -304,10 +304,8 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) /* kprobe removal house-keeping routines */ static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) { - *p->addr = p->opcode; + arch_disarm_kprobe(p); hlist_del(&p->hlist); - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); spin_unlock_irqrestore(&kprobe_lock, flags); arch_remove_kprobe(p); } @@ -344,10 +342,8 @@ int register_kprobe(struct kprobe *p) hlist_add_head(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); - p->opcode = *p->addr; - *p->addr = BREAKPOINT_INSTRUCTION; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + arch_arm_kprobe(p); + out: spin_unlock_irqrestore(&kprobe_lock, flags); rm_kprobe: From 0aa55e4d7db822059fe8132fe9f2b7773c48216c Mon Sep 17 00:00:00 2001 From: Hien Nguyen Date: Thu, 23 Jun 2005 00:09:26 -0700 Subject: [PATCH 121/199] [PATCH] kprobes: moves lock-unlock to non-arch kprobe_flush_task This patch moves the lock/unlock of the arch specific kprobe_flush_task() to the non-arch specific kprobe_flush_task(). Signed-off-by: Hien Nguyen Acked-by: Prasanna S Panchamukhi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/kprobes.c | 5 +---- include/linux/kprobes.h | 3 +-- kernel/kprobes.c | 5 ++++- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index 2314d8d306fd..b8e2bae0ab4f 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -138,17 +138,14 @@ void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) } } -void arch_kprobe_flush_task(struct task_struct *tk, spinlock_t *kp_lock) +void arch_kprobe_flush_task(struct task_struct *tk) { - unsigned long flags = 0; struct kretprobe_instance *ri; - spin_lock_irqsave(kp_lock, flags); while ((ri = get_rp_inst_tsk(tk)) != NULL) { *((unsigned long *)(ri->stack_addr)) = (unsigned long) ri->ret_addr; recycle_rp_inst(ri); } - spin_unlock_irqrestore(kp_lock, flags); } /* diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 0f90466fb8b0..461391decc46 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -33,7 +33,6 @@ #include #include #include -#include #include @@ -101,7 +100,7 @@ extern void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs, unsigned long flags); extern struct task_struct *arch_get_kprobe_task(void *ptr); extern void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs); -extern void arch_kprobe_flush_task(struct task_struct *tk, spinlock_t *kp_lock); +extern void arch_kprobe_flush_task(struct task_struct *tk); #else /* ARCH_SUPPORTS_KRETPROBES */ static inline void kretprobe_trampoline(void) { diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e8e0ae8a6e14..dd42e717dd35 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -229,7 +229,10 @@ struct kretprobe_instance *get_rp_inst_tsk(struct task_struct *tk) */ void kprobe_flush_task(struct task_struct *tk) { - arch_kprobe_flush_task(tk, &kprobe_lock); + unsigned long flags = 0; + spin_lock_irqsave(&kprobe_lock, flags); + arch_kprobe_flush_task(tk); + spin_unlock_irqrestore(&kprobe_lock, flags); } /* From 7213b2521889eb087eed8abaa48d1a692575da3e Mon Sep 17 00:00:00 2001 From: Anil S Keshavamurthy Date: Thu, 23 Jun 2005 00:09:27 -0700 Subject: [PATCH 122/199] [PATCH] Kprobes/IA64: kdebug die notification mechanism As many of you know, kprobes exist in the mainline kernel for various architectures including i386, x86_64, ppc64 and sparc64. The patches following this mail are a port of Kprobes and Jprobes to IA64. I have tested these patches for kprobes and Jprobes and they seem to work fine. I have tested this patch by inserting kprobes on various slots and various templates including various types of branch instructions. I have also tested this patch using the tool http://marc.theaimsgroup.com/?l=linux-kernel&m=111657358022586&w=2 and kprobes for IA64 work great. Here is a list of TODO items; patches for these will appear soon. 1) Support kprobes on "mov r1=ip" type of instruction 2) Support Kprobes and Jprobes to exist on the same address 3) Support return probes 4) Architecture independent cleanup of kprobes This patch adds the kdebug die notification mechanism needed by Kprobes. For a break instruction on a Branch type slot, imm21 is ignored and the value zero is placed in the IIM register, hence we need to handle kprobes for switch case zero. Signed-off-by: Anil S Keshavamurthy Signed-off-by: Rusty Lynch From: Rusty Lynch At the point in traps.c where we receive a break with a zero value, we cannot say if the break was a result of a kprobe or some other debug facility. This simple patch changes the informational string to a more correct "break 0" value, and applies to the 2.6.12-rc2-mm2 tree with all the kprobes patches that were just recently included for the next mm cut. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/traps.c | 33 ++++++++++++++++++++- arch/ia64/mm/fault.c | 8 +++++ include/asm-ia64/break.h | 2 ++ include/asm-ia64/kdebug.h | 61 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 include/asm-ia64/kdebug.h diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c index 1861173bd4f6..e7e520d90f03 100644 --- a/arch/ia64/kernel/traps.c +++ b/arch/ia64/kernel/traps.c @@ -21,12 +21,26 @@ #include #include #include +#include extern spinlock_t timerlist_lock; fpswa_interface_t *fpswa_interface; EXPORT_SYMBOL(fpswa_interface); +struct notifier_block *ia64die_chain; +static DEFINE_SPINLOCK(die_notifier_lock); + +int register_die_notifier(struct notifier_block *nb) +{ + int err = 0; + unsigned long flags; + spin_lock_irqsave(&die_notifier_lock, flags); + err = notifier_chain_register(&ia64die_chain, nb); + spin_unlock_irqrestore(&die_notifier_lock, flags); + return err; +} + void __init trap_init (void) { @@ -137,6 +151,10 @@ ia64_bad_break (unsigned long break_num, struct pt_regs *regs) switch (break_num) { case 0: /* unknown error (used by GCC for __builtin_abort()) */ + if (notify_die(DIE_BREAK, "break 0", regs, break_num, TRAP_BRKPT, SIGTRAP) + == NOTIFY_STOP) { + return; + } die_if_kernel("bugcheck!", regs, break_num); sig = SIGILL; code = ILL_ILLOPC; break; @@ -189,6 +207,15 @@ ia64_bad_break (unsigned long break_num, struct pt_regs *regs) sig = SIGILL; code = __ILL_BNDMOD; break; + case 0x80200: + case 0x80300: + if (notify_die(DIE_BREAK, "kprobe", regs, break_num, TRAP_BRKPT, SIGTRAP) + == NOTIFY_STOP) { + return; + } + sig = SIGTRAP; code = TRAP_BRKPT; + break; + default: if (break_num < 0x40000 || break_num > 0x100000) die_if_kernel("Bad break", regs, break_num); @@ -548,7 +575,11 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, #endif break; case 35: siginfo.si_code = TRAP_BRANCH; ifa = 0; break; - case 36: siginfo.si_code = TRAP_TRACE; ifa = 0; break; + case
36: + if (notify_die(DIE_SS, "ss", &regs, vector, + vector, SIGTRAP) == NOTIFY_STOP) + return; + siginfo.si_code = TRAP_TRACE; ifa = 0; break; } siginfo.si_signo = SIGTRAP; siginfo.si_errno = 0; diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 4174ec999dde..ff62551eb3a1 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -14,6 +14,7 @@ #include #include #include +#include extern void die (char *, struct pt_regs *, long); @@ -102,6 +103,13 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re goto bad_area_no_up; #endif + /* + * This is to handle the kprobes on user space access instructions + */ + if (notify_die(DIE_PAGE_FAULT, "page fault", regs, code, TRAP_BRKPT, + SIGSEGV) == NOTIFY_STOP) + return; + down_read(&mm->mmap_sem); vma = find_vma_prev(mm, address, &prev_vma); diff --git a/include/asm-ia64/break.h b/include/asm-ia64/break.h index 97c7b2d79600..8167828edc4b 100644 --- a/include/asm-ia64/break.h +++ b/include/asm-ia64/break.h @@ -12,6 +12,8 @@ * OS-specific debug break numbers: */ #define __IA64_BREAK_KDB 0x80100 +#define __IA64_BREAK_KPROBE 0x80200 +#define __IA64_BREAK_JPROBE 0x80300 /* * OS-specific break numbers: diff --git a/include/asm-ia64/kdebug.h b/include/asm-ia64/kdebug.h new file mode 100644 index 000000000000..4d376e1663f7 --- /dev/null +++ b/include/asm-ia64/kdebug.h @@ -0,0 +1,61 @@ +#ifndef _IA64_KDEBUG_H +#define _IA64_KDEBUG_H 1 +/* + * include/asm-ia64/kdebug.h + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * + * Copyright (C) Intel Corporation, 2005 + * + * 2005-Apr Rusty Lynch and Anil S Keshavamurthy + * adopted from + * include/asm-x86_64/kdebug.h + */ +#include + +struct pt_regs; + +struct die_args { + struct pt_regs *regs; + const char *str; + long err; + int trapnr; + int signr; +}; + +int register_die_notifier(struct notifier_block *nb); +extern struct notifier_block *ia64die_chain; + +enum die_val { + DIE_BREAK = 1, + DIE_SS, + DIE_PAGE_FAULT, +}; + +static inline int notify_die(enum die_val val, char *str, struct pt_regs *regs, + long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig + }; + + return notifier_call_chain(&ia64die_chain, val, &args); +} + +#endif From fd7b231ff98578308d8f5fb76a25a369ce1074ae Mon Sep 17 00:00:00 2001 From: Anil S Keshavamurthy Date: Thu, 23 Jun 2005 00:09:28 -0700 Subject: [PATCH 123/199] [PATCH] Kprobes/IA64: arch specific handling This is an IA64 arch specific handling of Kprobes Signed-off-by: Anil S Keshavamurthy Signed-off-by: Rusty Lynch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/Kconfig.debug | 11 ++ arch/ia64/kernel/Makefile | 1 + arch/ia64/kernel/kprobes.c | 332 +++++++++++++++++++++++++++++++++++++ include/asm-ia64/kprobes.h | 72 ++++++++ 4 files changed, 416 insertions(+) create mode 100644 arch/ia64/kernel/kprobes.c create mode 100644 include/asm-ia64/kprobes.h diff --git a/arch/ia64/Kconfig.debug b/arch/ia64/Kconfig.debug index de9d507ba0fd..fda67ac993d7 100644 --- a/arch/ia64/Kconfig.debug +++ b/arch/ia64/Kconfig.debug @@ -2,6 +2,17 @@ menu "Kernel hacking" source "lib/Kconfig.debug" +config KPROBES + bool "Kprobes" + depends on DEBUG_KERNEL + help + Kprobes allows you to trap at almost any kernel address and + execute a callback function. register_kprobe() establishes + a probepoint and specifies the callback. Kprobes is useful + for kernel debugging, non-intrusive instrumentation and testing. + If in doubt, say "N". + + choice prompt "Physical memory granularity" default IA64_GRANULE_64MB diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 4c73d8ba2e3d..5290fa4c3371 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_SMP) += smp.o smpboot.o domain.o obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o obj-$(CONFIG_IA64_CYCLONE) += cyclone.o obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o +obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR) += uncached.o mca_recovery-y += mca_drv.o mca_drv_asm.o diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c new file mode 100644 index 000000000000..81a53f99a573 --- /dev/null +++ b/arch/ia64/kernel/kprobes.c @@ -0,0 +1,332 @@ +/* + * Kernel Probes (KProbes) + * arch/ia64/kernel/kprobes.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * Copyright (C) Intel Corporation, 2005 + * + * 2005-Apr Rusty Lynch and Anil S Keshavamurthy + * adapted from i386 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* kprobe_status settings */ +#define KPROBE_HIT_ACTIVE 0x00000001 +#define KPROBE_HIT_SS 0x00000002 + +static struct kprobe *current_kprobe; +static unsigned long kprobe_status; + +enum instruction_type {A, I, M, F, B, L, X, u}; +static enum instruction_type bundle_encoding[32][3] = { + { M, I, I }, /* 00 */ + { M, I, I }, /* 01 */ + { M, I, I }, /* 02 */ + { M, I, I }, /* 03 */ + { M, L, X }, /* 04 */ + { M, L, X }, /* 05 */ + { u, u, u }, /* 06 */ + { u, u, u }, /* 07 */ + { M, M, I }, /* 08 */ + { M, M, I }, /* 09 */ + { M, M, I }, /* 0A */ + { M, M, I }, /* 0B */ + { M, F, I }, /* 0C */ + { M, F, I }, /* 0D */ + { M, M, F }, /* 0E */ + { M, M, F }, /* 0F */ + { M, I, B }, /* 10 */ + { M, I, B }, /* 11 */ + { M, B, B }, /* 12 */ + { M, B, B }, /* 13 */ + { u, u, u }, /* 14 */ + { u, u, u }, /* 15 */ + { B, B, B }, /* 16 */ + { B, B, B }, /* 17 */ + { M, M, B }, /* 18 */ + { M, M, B }, /* 19 */ + { u, u, u }, /* 1A */ + { u, u, u }, /* 1B */ + { M, F, B }, /* 1C */ + { M, F, B }, /* 1D */ + { u, u, u }, /* 1E */ + { u, u, u }, /* 1F */ +}; + +int arch_prepare_kprobe(struct kprobe *p) +{ + unsigned long addr = (unsigned long) p->addr; + unsigned long bundle_addr = addr & ~0xFULL; + unsigned long slot = addr & 0xf; + bundle_t bundle; + unsigned long template; + + /* + * TODO: Verify that a probe is not being inserted + * in sensitive regions of code + * TODO: Verify that the memory holding the probe is rwx + * TODO: verify this is a kernel address + */ + memcpy(&bundle, (unsigned long *)bundle_addr, sizeof(bundle_t)); + template = bundle.quad0.template; + if (((bundle_encoding[template][1] == L) && slot > 1) || (slot > 2)) { + printk(KERN_WARNING "Attempting to insert unaligned kprobe at 0x%lx\n", addr); + return -EINVAL; + } + return 0; +} + +void arch_copy_kprobe(struct kprobe *p) +{ + unsigned long addr = (unsigned long)p->addr; + unsigned long bundle_addr = addr & ~0xFULL; + + memcpy(&p->ainsn.insn.bundle, (unsigned long *)bundle_addr, + sizeof(bundle_t)); + memcpy(&p->opcode.bundle, &p->ainsn.insn.bundle, sizeof(bundle_t)); +} + +void arch_arm_kprobe(struct kprobe *p) +{ + unsigned long addr = (unsigned long)p->addr; + unsigned long arm_addr = addr & ~0xFULL; + unsigned long slot = addr & 0xf; + unsigned long template; + bundle_t bundle; + + memcpy(&bundle, &p->ainsn.insn.bundle, sizeof(bundle_t)); + + template = bundle.quad0.template; + if (slot == 1 && bundle_encoding[template][1] == L) + slot = 2; + switch (slot) { + case 0: + bundle.quad0.slot0 = BREAK_INST; + break; + case 1: + bundle.quad0.slot1_p0 = BREAK_INST; + bundle.quad1.slot1_p1 = (BREAK_INST >> (64-46)); + break; + case 2: + bundle.quad1.slot2 = BREAK_INST; + break; + } + + /* Flush icache for the instruction at the emulated address */ + flush_icache_range((unsigned long)&p->ainsn.insn.bundle, + (unsigned long)&p->ainsn.insn.bundle + + sizeof(bundle_t)); + /* + * Patch the original instruction with the probe instruction + * and flush the instruction cache + */ + memcpy((char *) arm_addr, (char *) &bundle, sizeof(bundle_t)); + 
flush_icache_range(arm_addr, arm_addr + sizeof(bundle_t)); +} + +void arch_disarm_kprobe(struct kprobe *p) +{ + unsigned long addr = (unsigned long)p->addr; + unsigned long arm_addr = addr & ~0xFULL; + + /* p->opcode contains the original unaltered bundle */ + memcpy((char *) arm_addr, (char *) &p->opcode.bundle, sizeof(bundle_t)); + flush_icache_range(arm_addr, arm_addr + sizeof(bundle_t)); +} + +void arch_remove_kprobe(struct kprobe *p) +{ +} + +/* + * We are resuming execution after a single step fault, so the pt_regs + * structure reflects the register state after we executed the instruction + * located in the kprobe (p->ainsn.insn.bundle). We still need to adjust + * the ip to point back to the original stack address, and if we see that + * the slot has incremented back to zero, then we need to point to the next + * slot location. + */ +static void resume_execution(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long bundle = (unsigned long)p->addr & ~0xFULL; + + /* + * TODO: Handle cases where kprobe was inserted on a branch instruction + */ + + if (!ia64_psr(regs)->ri) + regs->cr_iip = bundle + 0x10; + else + regs->cr_iip = bundle; + + ia64_psr(regs)->ss = 0; +} + +static void prepare_ss(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long bundle_addr = (unsigned long) &p->ainsn.insn.bundle; + unsigned long slot = (unsigned long)p->addr & 0xf; + + /* Update instruction pointer (IIP) and slot number (IPSR.ri) */ + regs->cr_iip = bundle_addr & ~0xFULL; + + if (slot > 2) + slot = 0; + + ia64_psr(regs)->ri = slot; + + /* turn on single stepping */ + ia64_psr(regs)->ss = 1; +} + +static int pre_kprobes_handler(struct pt_regs *regs) +{ + struct kprobe *p; + int ret = 0; + kprobe_opcode_t *addr = (kprobe_opcode_t *)instruction_pointer(regs); + + preempt_disable(); + + /* Handle recursion cases */ + if (kprobe_running()) { + p = get_kprobe(addr); + if (p) { + if (kprobe_status == KPROBE_HIT_SS) { + unlock_kprobes(); + goto no_kprobe; + } + arch_disarm_kprobe(p); + ret = 1; + } else { + /* + * jprobe instrumented function just completed + */ + p = current_kprobe; + if (p->break_handler && p->break_handler(p, regs)) { + goto ss_probe; + } + } + } + + lock_kprobes(); + p = get_kprobe(addr); + if (!p) { + unlock_kprobes(); + goto no_kprobe; + } + + kprobe_status = KPROBE_HIT_ACTIVE; + current_kprobe = p; + + if (p->pre_handler && p->pre_handler(p, regs)) + /* + * Our pre-handler is specifically requesting that we just + * do a return. This is handling the case where the + * pre-handler is really our special jprobe pre-handler. 
+ */ + return 1; + +ss_probe: + prepare_ss(p, regs); + kprobe_status = KPROBE_HIT_SS; + return 1; + +no_kprobe: + preempt_enable_no_resched(); + return ret; +} + +static int post_kprobes_handler(struct pt_regs *regs) +{ + if (!kprobe_running()) + return 0; + + if (current_kprobe->post_handler) + current_kprobe->post_handler(current_kprobe, regs, 0); + + resume_execution(current_kprobe, regs); + + unlock_kprobes(); + preempt_enable_no_resched(); + return 1; +} + +static int kprobes_fault_handler(struct pt_regs *regs, int trapnr) +{ + if (!kprobe_running()) + return 0; + + if (current_kprobe->fault_handler && + current_kprobe->fault_handler(current_kprobe, regs, trapnr)) + return 1; + + if (kprobe_status & KPROBE_HIT_SS) { + resume_execution(current_kprobe, regs); + unlock_kprobes(); + preempt_enable_no_resched(); + } + + return 0; +} + +int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, + void *data) +{ + struct die_args *args = (struct die_args *)data; + switch(val) { + case DIE_BREAK: + if (pre_kprobes_handler(args->regs)) + return NOTIFY_STOP; + break; + case DIE_SS: + if (post_kprobes_handler(args->regs)) + return NOTIFY_STOP; + break; + case DIE_PAGE_FAULT: + if (kprobes_fault_handler(args->regs, args->trapnr)) + return NOTIFY_STOP; + default: + break; + } + return NOTIFY_DONE; +} + +int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + printk(KERN_WARNING "Jprobes is not supported\n"); + return 0; +} + +void jprobe_return(void) +{ +} + +int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + return 0; +} diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h new file mode 100644 index 000000000000..2a6f2a148890 --- /dev/null +++ b/include/asm-ia64/kprobes.h @@ -0,0 +1,72 @@ +#ifndef _ASM_KPROBES_H +#define _ASM_KPROBES_H +/* + * Kernel Probes (KProbes) + * include/asm-ia64/kprobes.h + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2002, 2004 + * Copyright (C) Intel Corporation, 2005 + * + * 2005-Apr Rusty Lynch and Anil S Keshavamurthy + * adapted from i386 + */ +#include +#include +#include + +#define BREAK_INST (long)(__IA64_BREAK_KPROBE << 6) + +typedef struct _bundle { + struct { + unsigned long long template : 5; + unsigned long long slot0 : 41; + unsigned long long slot1_p0 : 64-46; + } quad0; + struct { + unsigned long long slot1_p1 : 41 - (64-46); + unsigned long long slot2 : 41; + } quad1; +} __attribute__((__aligned__(16))) bundle_t; + +#define JPROBE_ENTRY(pentry) (kprobe_opcode_t *)pentry + +typedef struct kprobe_opcode { + bundle_t bundle; +} kprobe_opcode_t; + +struct fnptr { + unsigned long ip; + unsigned long gp; +}; + +/* Architecture specific copy of original instruction*/ +struct arch_specific_insn { + /* copy of the original instruction */ + kprobe_opcode_t insn; +}; + +#ifdef CONFIG_KPROBES +extern int kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data); +#else /* !CONFIG_KPROBES */ +static inline int kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif +#endif /* _ASM_KPROBES_H */ From b2761dc262b428475890fffd979687051beb12ba Mon Sep 17 00:00:00 2001 From: Anil S Keshavamurthy Date: Thu, 23 Jun 2005 00:09:28 -0700 Subject: [PATCH 124/199] [PATCH] Kprobes/IA64: architecture specific JProbes support This patch adds IA64 architecture specific JProbes support on top of Kprobes Signed-off-by: Anil S Keshavamurthy Signed-off-by: Rusty Lynch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/Makefile | 2 +- arch/ia64/kernel/jprobes.S | 61 ++++++++++++++++++++++++++++++++++++++ arch/ia64/kernel/kprobes.c | 28 +++++++++++++---- include/asm-ia64/kprobes.h | 5 ++++ 4 files changed, 89 insertions(+), 7 deletions(-) create mode 100644 arch/ia64/kernel/jprobes.S diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 5290fa4c3371..b2e2f6509eb0 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -20,7 +20,7 @@ obj-$(CONFIG_SMP) += smp.o smpboot.o domain.o obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o obj-$(CONFIG_IA64_CYCLONE) += cyclone.o obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o -obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_KPROBES) += kprobes.o jprobes.o obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR) += uncached.o mca_recovery-y += mca_drv.o mca_drv_asm.o diff --git a/arch/ia64/kernel/jprobes.S b/arch/ia64/kernel/jprobes.S new file mode 100644 index 000000000000..b7fa3ccd2b0f --- /dev/null +++ b/arch/ia64/kernel/jprobes.S @@ -0,0 +1,61 @@ +/* + * Jprobe specific operations + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) Intel Corporation, 2005 + * + * 2005-May Rusty Lynch and Anil S Keshavamurthy + * initial implementation + * + * Jprobes (a.k.a. "jump probes" which is built on-top of kprobes) allow a + * probe to be inserted into the beginning of a function call. The fundamental + * difference between a jprobe and a kprobe is the jprobe handler is executed + * in the same context as the target function, while the kprobe handlers + * are executed in interrupt context. + * + * For jprobes we initially gain control by placing a break point in the + * first instruction of the targeted function. When we catch that specific + * break, we: + * * set the return address to our jprobe_inst_return() function + * * jump to the jprobe handler function + * + * Since we fixed up the return address, the jprobe handler will return to our + * jprobe_inst_return() function, giving us control again. At this point we + * are back in the parents frame marker, so we do yet another call to our + * jprobe_break() function to fix up the frame marker as it would normally + * exist in the target function. + * + * Our jprobe_return function then transfers control back to kprobes.c by + * executing a break instruction using one of our reserved numbers. When we + * catch that break in kprobes.c, we continue like we do for a normal kprobe + * by single stepping the emulated instruction, and then returning execution + * to the correct location. + */ +#include + + /* + * void jprobe_break(void) + */ +ENTRY(jprobe_break) + break.m 0x80300 +END(jprobe_break) + + /* + * void jprobe_inst_return(void) + */ +GLOBAL_ENTRY(jprobe_inst_return) + br.call.sptk.many b0=jprobe_break +END(jprobe_inst_return) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 81a53f99a573..6683a44f419f 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -35,12 +35,15 @@ #include #include +extern void jprobe_inst_return(void); + /* kprobe_status settings */ #define KPROBE_HIT_ACTIVE 0x00000001 #define KPROBE_HIT_SS 0x00000002 static struct kprobe *current_kprobe; static unsigned long kprobe_status; +static struct pt_regs jprobe_saved_regs; enum instruction_type {A, I, M, F, B, L, X, u}; static enum instruction_type bundle_encoding[32][3] = { @@ -318,15 +321,28 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) { - printk(KERN_WARNING "Jprobes is not supported\n"); - return 0; -} + struct jprobe *jp = container_of(p, struct jprobe, kp); + unsigned long addr = ((struct fnptr *)(jp->entry))->ip; -void jprobe_return(void) -{ + /* save architectural state */ + jprobe_saved_regs = *regs; + + /* after rfi, execute the jprobe instrumented function */ + regs->cr_iip = addr & ~0xFULL; + ia64_psr(regs)->ri = addr & 0xf; + regs->r1 = ((struct fnptr *)(jp->entry))->gp; + + /* + * fix the return address to our jprobe_inst_return() function + * in the jprobes.S file + */ + regs->b0 = ((struct fnptr *)(jprobe_inst_return))->ip; + + return 1; } int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) { - return 0; + *regs = jprobe_saved_regs; + return 1; } diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h index 2a6f2a148890..fec3506e53f8 100644 --- a/include/asm-ia64/kprobes.h +++ b/include/asm-ia64/kprobes.h @@ -59,6 +59,11 @@ struct arch_specific_insn { kprobe_opcode_t insn; }; +/* ia64 does not need this */ +static inline void jprobe_return(void) +{ +} + #ifdef CONFIG_KPROBES extern int 
kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); From cd2675bf65455a45b54228b7acc0c6a26a164cb6 Mon Sep 17 00:00:00 2001 From: Anil S Keshavamurthy Date: Thu, 23 Jun 2005 00:09:29 -0700 Subject: [PATCH 125/199] [PATCH] Kprobes/IA64: support kprobe on branch/call instructions This patch is required to support kprobe on branch/call instructions. Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/kprobes.c | 131 ++++++++++++++++++++++++++++++++----- include/asm-ia64/kprobes.h | 17 ++++- 2 files changed, 132 insertions(+), 16 deletions(-) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 6683a44f419f..20a250e2e9b2 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -120,25 +120,75 @@ void arch_arm_kprobe(struct kprobe *p) unsigned long arm_addr = addr & ~0xFULL; unsigned long slot = addr & 0xf; unsigned long template; + unsigned long major_opcode = 0; + unsigned long lx_type_inst = 0; + unsigned long kprobe_inst = 0; bundle_t bundle; + p->ainsn.inst_flag = 0; + p->ainsn.target_br_reg = 0; + memcpy(&bundle, &p->ainsn.insn.bundle, sizeof(bundle_t)); + template = bundle.quad0.template; + if (slot == 1 && bundle_encoding[template][1] == L) { + lx_type_inst = 1; + slot = 2; + } + - template = bundle.quad0.template; - if (slot == 1 && bundle_encoding[template][1] == L) - slot = 2; switch (slot) { case 0: + major_opcode = (bundle.quad0.slot0 >> SLOT0_OPCODE_SHIFT); + kprobe_inst = bundle.quad0.slot0; bundle.quad0.slot0 = BREAK_INST; break; case 1: + major_opcode = (bundle.quad1.slot1_p1 >> SLOT1_p1_OPCODE_SHIFT); + kprobe_inst = (bundle.quad0.slot1_p0 | + (bundle.quad1.slot1_p1 << (64-46))); bundle.quad0.slot1_p0 = BREAK_INST; bundle.quad1.slot1_p1 = (BREAK_INST >> (64-46)); break; case 2: + major_opcode = (bundle.quad1.slot2 >> SLOT2_OPCODE_SHIFT); + kprobe_inst = bundle.quad1.slot2; bundle.quad1.slot2 = BREAK_INST; break; } + /* + * Look for IP relative Branches, IP relative call or + * IP relative predicate instructions + */ + if (bundle_encoding[template][slot] == B) { + switch (major_opcode) { + case INDIRECT_CALL_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; + case IP_RELATIVE_PREDICT_OPCODE: + case IP_RELATIVE_BRANCH_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; + break; + case IP_RELATIVE_CALL_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; + p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; + default: + /* Do nothing */ + break; + } + } else if (lx_type_inst) { + switch (major_opcode) { + case LONG_CALL_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; + default: + /* Do nothing */ + break; + } + } /* Flush icache for the instruction at the emulated address */ flush_icache_range((unsigned long)&p->ainsn.insn.bundle, @@ -170,24 +220,75 @@ void arch_remove_kprobe(struct kprobe *p) * We are resuming execution after a single step fault, so the pt_regs * structure reflects the register state after we executed the instruction * located in the kprobe (p->ainsn.insn.bundle). We still need to adjust - * the ip to point back to the original stack address, and if we see that - * the slot has incremented back to zero, then we need to point to the next - * slot location. 
+ * the ip to point back to the original stack address. To set the IP address + * to original stack address, handle the case where we need to fixup the + * relative IP address and/or fixup branch register. */ static void resume_execution(struct kprobe *p, struct pt_regs *regs) { - unsigned long bundle = (unsigned long)p->addr & ~0xFULL; + unsigned long bundle_addr = ((unsigned long) (&p->ainsn.insn.bundle)) & ~0xFULL; + unsigned long resume_addr = (unsigned long)p->addr & ~0xFULL; + unsigned long template; + int slot = ((unsigned long)p->addr & 0xf); - /* - * TODO: Handle cases where kprobe was inserted on a branch instruction - */ + template = p->opcode.bundle.quad0.template; + + if (slot == 1 && bundle_encoding[template][1] == L) + slot = 2; + + if (p->ainsn.inst_flag) { - if (!ia64_psr(regs)->ri) - regs->cr_iip = bundle + 0x10; - else - regs->cr_iip = bundle; + if (p->ainsn.inst_flag & INST_FLAG_FIX_RELATIVE_IP_ADDR) { + /* Fix relative IP address */ + regs->cr_iip = (regs->cr_iip - bundle_addr) + resume_addr; + } + + if (p->ainsn.inst_flag & INST_FLAG_FIX_BRANCH_REG) { + /* + * Fix target branch register, software convention is + * to use either b0 or b6 or b7, so just checking + * only those registers + */ + switch (p->ainsn.target_br_reg) { + case 0: + if ((regs->b0 == bundle_addr) || + (regs->b0 == bundle_addr + 0x10)) { + regs->b0 = (regs->b0 - bundle_addr) + + resume_addr; + } + break; + case 6: + if ((regs->b6 == bundle_addr) || + (regs->b6 == bundle_addr + 0x10)) { + regs->b6 = (regs->b6 - bundle_addr) + + resume_addr; + } + break; + case 7: + if ((regs->b7 == bundle_addr) || + (regs->b7 == bundle_addr + 0x10)) { + regs->b7 = (regs->b7 - bundle_addr) + + resume_addr; + } + break; + } /* end switch */ + } + goto turn_ss_off; + } - ia64_psr(regs)->ss = 0; + if (slot == 2) { + if (regs->cr_iip == bundle_addr + 0x10) { + regs->cr_iip = resume_addr + 0x10; + } + } else { + if (regs->cr_iip == bundle_addr) { + regs->cr_iip = resume_addr; + } + } + +turn_ss_off: + /* Turn off Single Step bit */ + ia64_psr(regs)->ss = 0; } static void prepare_ss(struct kprobe *p, struct pt_regs *regs) diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h index fec3506e53f8..d30af77a0b11 100644 --- a/include/asm-ia64/kprobes.h +++ b/include/asm-ia64/kprobes.h @@ -44,6 +44,17 @@ typedef struct _bundle { #define JPROBE_ENTRY(pentry) (kprobe_opcode_t *)pentry +#define SLOT0_OPCODE_SHIFT (37) +#define SLOT1_p1_OPCODE_SHIFT (37 - (64-46)) +#define SLOT2_OPCODE_SHIFT (37) + +#define INDIRECT_CALL_OPCODE (1) +#define IP_RELATIVE_CALL_OPCODE (5) +#define IP_RELATIVE_BRANCH_OPCODE (4) +#define IP_RELATIVE_PREDICT_OPCODE (7) +#define LONG_BRANCH_OPCODE (0xC) +#define LONG_CALL_OPCODE (0xD) + typedef struct kprobe_opcode { bundle_t bundle; } kprobe_opcode_t; @@ -55,8 +66,12 @@ struct fnptr { /* Architecture specific copy of original instruction*/ struct arch_specific_insn { - /* copy of the original instruction */ + /* copy of the instruction to be emulated */ kprobe_opcode_t insn; + #define INST_FLAG_FIX_RELATIVE_IP_ADDR 1 + #define INST_FLAG_FIX_BRANCH_REG 2 + unsigned long inst_flag; + unsigned short target_br_reg; }; /* ia64 does not need this */ From 8bc76772ad653bcaad1b0af72aafb6072ef0fa87 Mon Sep 17 00:00:00 2001 From: Rusty Lynch Date: Thu, 23 Jun 2005 00:09:30 -0700 Subject: [PATCH 126/199] [PATCH] Kprobes ia64 cleanup A cleanup of the ia64 kprobes implementation such that all of the bundle manipulation logic is concentrated in arch_prepare_kprobe(). 
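The rebasing arithmetic that resume_execution() applies above is easy to check in isolation. The following stand-alone sketch (ordinary user-space C, not kernel code; the addresses and the rebase() helper are invented for illustration) shows how a register that points into the single-stepped copy of the bundle is redirected back to the original probed bundle while keeping its offset, and how a branch target that never pointed into the copy is left alone:

    #include <assert.h>
    #include <stdio.h>

    /*
     * Rebase an address that points into the copied bundle (where the
     * CPU actually single-stepped) onto the original probed bundle,
     * mirroring the (reg - bundle_addr) + resume_addr pattern. Only
     * the copy itself (slot 0/1 resume) or the bundle right after it
     * (slot 2 resume, +0x10) is a fixup candidate.
     */
    static unsigned long rebase(unsigned long reg,
                                unsigned long bundle_addr,
                                unsigned long resume_addr)
    {
            if (reg == bundle_addr || reg == bundle_addr + 0x10)
                    return (reg - bundle_addr) + resume_addr;
            return reg;     /* foreign target: leave it untouched */
    }

    int main(void)
    {
            unsigned long bundle_addr = 0x10000;  /* copy we stepped on */
            unsigned long resume_addr = 0x20000;  /* original location */

            /* b0 pointing just past the copy moves just past the original */
            assert(rebase(0x10010, bundle_addr, resume_addr) == 0x20010);
            /* an unrelated branch target passes through unchanged */
            assert(rebase(0x30000, bundle_addr, resume_addr) == 0x30000);
            puts("rebase ok");
            return 0;
    }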
With the current design for kprobes, the arch specific code only has a chance to return failure inside the arch_prepare_kprobe() function. This patch moves all of the work that was happening in arch_copy_kprobe() and most of the work that was happening in arch_arm_kprobe() into arch_prepare_kprobe(). By doing this we can add further robustness checks in arch_arm_kprobe() and refuse to insert kprobes that will cause problems. Signed-off-by: Rusty Lynch Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/kprobes.c | 178 ++++++++++++++++--------------------- include/asm-ia64/kprobes.h | 7 ++ 2 files changed, 84 insertions(+), 101 deletions(-) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 20a250e2e9b2..b7a204137fbb 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -84,121 +84,97 @@ static enum instruction_type bundle_encoding[32][3] = { int arch_prepare_kprobe(struct kprobe *p) { unsigned long addr = (unsigned long) p->addr; - unsigned long bundle_addr = addr & ~0xFULL; + unsigned long *bundle_addr = (unsigned long *)(addr & ~0xFULL); unsigned long slot = addr & 0xf; - bundle_t bundle; unsigned long template; + unsigned long major_opcode = 0; + unsigned long lx_type_inst = 0; + unsigned long kprobe_inst = 0; + bundle_t *bundle = &p->ainsn.insn.bundle; - /* - * TODO: Verify that a probe is not being inserted - * in sensitive regions of code - * TODO: Verify that the memory holding the probe is rwx - * TODO: verify this is a kernel address - */ - memcpy(&bundle, (unsigned long *)bundle_addr, sizeof(bundle_t)); - template = bundle.quad0.template; - if (((bundle_encoding[template][1] == L) && slot > 1) || (slot > 2)) { - printk(KERN_WARNING "Attempting to insert unaligned kprobe at 0x%lx\n", addr); - return -EINVAL; - } - return 0; -} + memcpy(&p->opcode.bundle, bundle_addr, sizeof(bundle_t)); + memcpy(&p->ainsn.insn.bundle, bundle_addr, sizeof(bundle_t)); -void arch_copy_kprobe(struct kprobe *p) -{ - unsigned long addr = (unsigned long)p->addr; - unsigned long bundle_addr = addr & ~0xFULL; + p->ainsn.inst_flag = 0; + p->ainsn.target_br_reg = 0; - memcpy(&p->ainsn.insn.bundle, (unsigned long *)bundle_addr, - sizeof(bundle_t)); - memcpy(&p->opcode.bundle, &p->ainsn.insn.bundle, sizeof(bundle_t)); -} + template = bundle->quad0.template; -void arch_arm_kprobe(struct kprobe *p) -{ - unsigned long addr = (unsigned long)p->addr; - unsigned long arm_addr = addr & ~0xFULL; - unsigned long slot = addr & 0xf; - unsigned long template; - unsigned long major_opcode = 0; - unsigned long lx_type_inst = 0; - unsigned long kprobe_inst = 0; - bundle_t bundle; - - p->ainsn.inst_flag = 0; - p->ainsn.target_br_reg = 0; - - memcpy(&bundle, &p->ainsn.insn.bundle, sizeof(bundle_t)); - template = bundle.quad0.template; - if (slot == 1 && bundle_encoding[template][1] == L) { - lx_type_inst = 1; - slot = 2; - } + if (((bundle_encoding[template][1] == L) && slot > 1) || (slot > 2)) { + printk(KERN_WARNING "Attempting to insert unaligned kprobe at 0x%lx\n", + addr); + return -EINVAL; + } + if (slot == 1 && bundle_encoding[template][1] == L) { + lx_type_inst = 1; + slot = 2; + } switch (slot) { case 0: - major_opcode = (bundle.quad0.slot0 >> SLOT0_OPCODE_SHIFT); - kprobe_inst = bundle.quad0.slot0; - bundle.quad0.slot0 = BREAK_INST; + major_opcode = (bundle->quad0.slot0 >> SLOT0_OPCODE_SHIFT); + kprobe_inst = bundle->quad0.slot0; + bundle->quad0.slot0 = BREAK_INST; break; case 1: - major_opcode = 
(bundle.quad1.slot1_p1 >> SLOT1_p1_OPCODE_SHIFT); - kprobe_inst = (bundle.quad0.slot1_p0 | - (bundle.quad1.slot1_p1 << (64-46))); - bundle.quad0.slot1_p0 = BREAK_INST; - bundle.quad1.slot1_p1 = (BREAK_INST >> (64-46)); + major_opcode = (bundle->quad1.slot1_p1 >> SLOT1_p1_OPCODE_SHIFT); + kprobe_inst = (bundle->quad0.slot1_p0 | + (bundle->quad1.slot1_p1 << (64-46))); + bundle->quad0.slot1_p0 = BREAK_INST; + bundle->quad1.slot1_p1 = (BREAK_INST >> (64-46)); break; case 2: - major_opcode = (bundle.quad1.slot2 >> SLOT2_OPCODE_SHIFT); - kprobe_inst = bundle.quad1.slot2; - bundle.quad1.slot2 = BREAK_INST; + major_opcode = (bundle->quad1.slot2 >> SLOT2_OPCODE_SHIFT); + kprobe_inst = bundle->quad1.slot2; + bundle->quad1.slot2 = BREAK_INST; break; } - /* - * Look for IP relative Branches, IP relative call or - * IP relative predicate instructions - */ - if (bundle_encoding[template][slot] == B) { - switch (major_opcode) { - case INDIRECT_CALL_OPCODE: - p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; - p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); - break; - case IP_RELATIVE_PREDICT_OPCODE: - case IP_RELATIVE_BRANCH_OPCODE: - p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; - break; - case IP_RELATIVE_CALL_OPCODE: - p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; - p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; - p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); - break; - default: - /* Do nothing */ - break; - } - } else if (lx_type_inst) { - switch (major_opcode) { - case LONG_CALL_OPCODE: - p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; - p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); - break; - default: - /* Do nothing */ - break; - } - } - - /* Flush icache for the instruction at the emulated address */ - flush_icache_range((unsigned long)&p->ainsn.insn.bundle, - (unsigned long)&p->ainsn.insn.bundle + - sizeof(bundle_t)); - /* - * Patch the original instruction with the probe instruction - * and flush the instruction cache - */ - memcpy((char *) arm_addr, (char *) &bundle, sizeof(bundle_t)); + + /* + * Look for IP relative Branches, IP relative call or + * IP relative predicate instructions + */ + if (bundle_encoding[template][slot] == B) { + switch (major_opcode) { + case INDIRECT_CALL_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; + case IP_RELATIVE_PREDICT_OPCODE: + case IP_RELATIVE_BRANCH_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; + break; + case IP_RELATIVE_CALL_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; + p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; + default: + /* Do nothing */ + break; + } + } else if (lx_type_inst) { + switch (major_opcode) { + case LONG_CALL_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; + default: + /* Do nothing */ + break; + } + } + + return 0; +} + +void arch_arm_kprobe(struct kprobe *p) +{ + unsigned long addr = (unsigned long)p->addr; + unsigned long arm_addr = addr & ~0xFULL; + + memcpy((char *)arm_addr, &p->ainsn.insn.bundle, sizeof(bundle_t)); flush_icache_range(arm_addr, arm_addr + sizeof(bundle_t)); } @@ -226,7 +202,7 @@ void arch_remove_kprobe(struct kprobe *p) */ static void resume_execution(struct kprobe *p, struct pt_regs *regs) { - unsigned long bundle_addr = ((unsigned long) (&p->ainsn.insn.bundle)) & ~0xFULL; + unsigned long bundle_addr = ((unsigned long) 
(&p->opcode.bundle)) & ~0xFULL; unsigned long resume_addr = (unsigned long)p->addr & ~0xFULL; unsigned long template; int slot = ((unsigned long)p->addr & 0xf); @@ -293,7 +269,7 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs) static void prepare_ss(struct kprobe *p, struct pt_regs *regs) { - unsigned long bundle_addr = (unsigned long) &p->ainsn.insn.bundle; + unsigned long bundle_addr = (unsigned long) &p->opcode.bundle; unsigned long slot = (unsigned long)p->addr & 0xf; /* Update instruction pointer (IIP) and slot number (IPSR.ri) */ diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h index d30af77a0b11..cec4d9958307 100644 --- a/include/asm-ia64/kprobes.h +++ b/include/asm-ia64/kprobes.h @@ -30,6 +30,8 @@ #define BREAK_INST (long)(__IA64_BREAK_KPROBE << 6) +struct kprobe; + typedef struct _bundle { struct { unsigned long long template : 5; @@ -79,6 +81,11 @@ static inline void jprobe_return(void) { } +/* ia64 does not need this */ +static inline void arch_copy_kprobe(struct kprobe *p) +{ +} + #ifdef CONFIG_KPROBES extern int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); From 13608d6433eb34840224ef632cc444f3eb59bc13 Mon Sep 17 00:00:00 2001 From: Rusty Lynch Date: Thu, 23 Jun 2005 00:09:31 -0700 Subject: [PATCH 127/199] [PATCH] Kprobes ia64 qp fix Fix a bug where a kprobe still fires when the instruction is predicated off. So given the p6=0, and we have an instruction like: (p6) move loc1=0 we should not be triggering the kprobe. This is handled by carrying over the qp section of the original instruction into the break instruction. Signed-off-by: Anil S Keshavamurthy Signed-off-by: Rusty Lynch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/kprobes.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index b7a204137fbb..98bef04d9484 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -115,19 +115,19 @@ int arch_prepare_kprobe(struct kprobe *p) case 0: major_opcode = (bundle->quad0.slot0 >> SLOT0_OPCODE_SHIFT); kprobe_inst = bundle->quad0.slot0; - bundle->quad0.slot0 = BREAK_INST; + bundle->quad0.slot0 = BREAK_INST | (0x3f & kprobe_inst); break; case 1: major_opcode = (bundle->quad1.slot1_p1 >> SLOT1_p1_OPCODE_SHIFT); kprobe_inst = (bundle->quad0.slot1_p0 | (bundle->quad1.slot1_p1 << (64-46))); - bundle->quad0.slot1_p0 = BREAK_INST; + bundle->quad0.slot1_p0 = BREAK_INST | (0x3f & kprobe_inst); bundle->quad1.slot1_p1 = (BREAK_INST >> (64-46)); break; case 2: major_opcode = (bundle->quad1.slot2 >> SLOT2_OPCODE_SHIFT); kprobe_inst = bundle->quad1.slot2; - bundle->quad1.slot2 = BREAK_INST; + bundle->quad1.slot2 = BREAK_INST | (0x3f & kprobe_inst); break; } From a5403183d84d419651672f1aee5ff2273d185efa Mon Sep 17 00:00:00 2001 From: Anil S Keshavamurthy Date: Thu, 23 Jun 2005 00:09:32 -0700 Subject: [PATCH 128/199] [PATCH] Kprobes IA64: arch_prepare_kprobes() cleanup arch_prepare_kprobes() was doing lots of functionality in just one single function. 
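Before walking through that cleanup, the predicate-carrying trick from the qp fix above deserves a stand-alone sketch. In the user-space fragment below, the low-6-bit qp mask matches the IA64 instruction format, but FAKE_BREAK_INST is an invented stand-in for the kernel's real break encoding:

    #include <assert.h>
    #include <stdio.h>

    #define QP_MASK         0x3fUL         /* qp lives in bits 0..5 */
    #define FAKE_BREAK_INST (0x1UL << 6)   /* stand-in break encoding */

    /*
     * Build a break instruction that inherits the original slot's
     * qualifying predicate, so the break only faults when the original
     * instruction would actually have executed.
     */
    static unsigned long make_break(unsigned long orig_inst)
    {
            return FAKE_BREAK_INST | (orig_inst & QP_MASK);
    }

    int main(void)
    {
            unsigned long inst = 0xdeadbe06UL;   /* pretend slot, qp = 6 */
            unsigned long brk  = make_break(inst);

            assert((brk & QP_MASK) == (inst & QP_MASK));
            printf("qp %lu carried into break instruction\n", brk & QP_MASK);
            return 0;
    }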
This patch attempts to clean up arch_prepare_kprobes() by moving specific sub task to the following (new)functions 1)valid_kprobe_addr() -->> validate the given kprobe address 2)get_kprobe_inst(slot..)->> Retrives the instruction for a given slot from the bundle 3)prepare_break_inst() -->> Prepares break instruction within the bundle 3a)update_kprobe_inst_flag()-->>Updates the internal flags, required for proper emulation of the instruction at later point in time. Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/kprobes.c | 191 +++++++++++++++++++++++-------------- 1 file changed, 121 insertions(+), 70 deletions(-) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 98bef04d9484..5d5344681555 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -81,90 +81,141 @@ static enum instruction_type bundle_encoding[32][3] = { { u, u, u }, /* 1F */ }; -int arch_prepare_kprobe(struct kprobe *p) +/* + * In this function we check to see if the instruction + * is IP relative instruction and update the kprobe + * inst flag accordingly + */ +static void update_kprobe_inst_flag(uint template, uint slot, uint major_opcode, + unsigned long kprobe_inst, struct kprobe *p) { - unsigned long addr = (unsigned long) p->addr; - unsigned long *bundle_addr = (unsigned long *)(addr & ~0xFULL); - unsigned long slot = addr & 0xf; - unsigned long template; - unsigned long major_opcode = 0; - unsigned long lx_type_inst = 0; - unsigned long kprobe_inst = 0; - bundle_t *bundle = &p->ainsn.insn.bundle; - - memcpy(&p->opcode.bundle, bundle_addr, sizeof(bundle_t)); - memcpy(&p->ainsn.insn.bundle, bundle_addr, sizeof(bundle_t)); - p->ainsn.inst_flag = 0; p->ainsn.target_br_reg = 0; - template = bundle->quad0.template; + if (bundle_encoding[template][slot] == B) { + switch (major_opcode) { + case INDIRECT_CALL_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; + case IP_RELATIVE_PREDICT_OPCODE: + case IP_RELATIVE_BRANCH_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; + break; + case IP_RELATIVE_CALL_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; + p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; + } + } else if (bundle_encoding[template][slot] == X) { + switch (major_opcode) { + case LONG_CALL_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; + } + } + return; +} - if (((bundle_encoding[template][1] == L) && slot > 1) || (slot > 2)) { - printk(KERN_WARNING "Attempting to insert unaligned kprobe at 0x%lx\n", - addr); - return -EINVAL; +/* + * In this function we override the bundle with + * the break instruction at the given slot. 
+ */ +static void prepare_break_inst(uint template, uint slot, uint major_opcode, + unsigned long kprobe_inst, struct kprobe *p) +{ + unsigned long break_inst = BREAK_INST; + bundle_t *bundle = &p->ainsn.insn.bundle; + + /* + * Copy the original kprobe_inst qualifying predicate(qp) + * to the break instruction + */ + break_inst |= (0x3f & kprobe_inst); + + switch (slot) { + case 0: + bundle->quad0.slot0 = break_inst; + break; + case 1: + bundle->quad0.slot1_p0 = break_inst; + bundle->quad1.slot1_p1 = break_inst >> (64-46); + break; + case 2: + bundle->quad1.slot2 = break_inst; + break; } - if (slot == 1 && bundle_encoding[template][1] == L) { - lx_type_inst = 1; - slot = 2; - } + /* + * Update the instruction flag, so that we can + * emulate the instruction properly after we + * single step on original instruction + */ + update_kprobe_inst_flag(template, slot, major_opcode, kprobe_inst, p); +} + +static inline void get_kprobe_inst(bundle_t *bundle, uint slot, + unsigned long *kprobe_inst, uint *major_opcode) +{ + unsigned long kprobe_inst_p0, kprobe_inst_p1; + unsigned int template; + + template = bundle->quad0.template; switch (slot) { - case 0: - major_opcode = (bundle->quad0.slot0 >> SLOT0_OPCODE_SHIFT); - kprobe_inst = bundle->quad0.slot0; - bundle->quad0.slot0 = BREAK_INST | (0x3f & kprobe_inst); + case 0: + *major_opcode = (bundle->quad0.slot0 >> SLOT0_OPCODE_SHIFT); + *kprobe_inst = bundle->quad0.slot0; break; - case 1: - major_opcode = (bundle->quad1.slot1_p1 >> SLOT1_p1_OPCODE_SHIFT); - kprobe_inst = (bundle->quad0.slot1_p0 | - (bundle->quad1.slot1_p1 << (64-46))); - bundle->quad0.slot1_p0 = BREAK_INST | (0x3f & kprobe_inst); - bundle->quad1.slot1_p1 = (BREAK_INST >> (64-46)); + case 1: + *major_opcode = (bundle->quad1.slot1_p1 >> SLOT1_p1_OPCODE_SHIFT); + kprobe_inst_p0 = bundle->quad0.slot1_p0; + kprobe_inst_p1 = bundle->quad1.slot1_p1; + *kprobe_inst = kprobe_inst_p0 | (kprobe_inst_p1 << (64-46)); break; - case 2: - major_opcode = (bundle->quad1.slot2 >> SLOT2_OPCODE_SHIFT); - kprobe_inst = bundle->quad1.slot2; - bundle->quad1.slot2 = BREAK_INST | (0x3f & kprobe_inst); + case 2: + *major_opcode = (bundle->quad1.slot2 >> SLOT2_OPCODE_SHIFT); + *kprobe_inst = bundle->quad1.slot2; break; } +} - /* - * Look for IP relative Branches, IP relative call or - * IP relative predicate instructions - */ - if (bundle_encoding[template][slot] == B) { - switch (major_opcode) { - case INDIRECT_CALL_OPCODE: - p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; - p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); - break; - case IP_RELATIVE_PREDICT_OPCODE: - case IP_RELATIVE_BRANCH_OPCODE: - p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; - break; - case IP_RELATIVE_CALL_OPCODE: - p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; - p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; - p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); - break; - default: - /* Do nothing */ - break; - } - } else if (lx_type_inst) { - switch (major_opcode) { - case LONG_CALL_OPCODE: - p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; - p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); - break; - default: - /* Do nothing */ - break; - } +static int valid_kprobe_addr(int template, int slot, unsigned long addr) +{ + if ((slot > 2) || ((bundle_encoding[template][1] == L) && slot > 1)) { + printk(KERN_WARNING "Attempting to insert unaligned kprobe at 0x%lx\n", + addr); + return -EINVAL; } + return 0; +} + +int arch_prepare_kprobe(struct kprobe *p) +{ + unsigned long addr = (unsigned long) p->addr; + unsigned 
long *kprobe_addr = (unsigned long *)(addr & ~0xFULL); + unsigned long kprobe_inst=0; + unsigned int slot = addr & 0xf, template, major_opcode = 0; + bundle_t *bundle = &p->ainsn.insn.bundle; + + memcpy(&p->opcode.bundle, kprobe_addr, sizeof(bundle_t)); + memcpy(&p->ainsn.insn.bundle, kprobe_addr, sizeof(bundle_t)); + + template = bundle->quad0.template; + + if(valid_kprobe_addr(template, slot, addr)) + return -EINVAL; + + /* Move to slot 2, if bundle is MLX type and kprobe slot is 1 */ + if (slot == 1 && bundle_encoding[template][1] == L) + slot++; + + /* Get kprobe_inst and major_opcode from the bundle */ + get_kprobe_inst(bundle, slot, &kprobe_inst, &major_opcode); + + prepare_break_inst(template, slot, major_opcode, kprobe_inst, p); return 0; } @@ -260,7 +311,7 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs) if (regs->cr_iip == bundle_addr) { regs->cr_iip = resume_addr; } - } + } turn_ss_off: /* Turn off Single Step bit */ From 1674eafcbd3e3c68556cf19fbf4a2c30f7add729 Mon Sep 17 00:00:00 2001 From: Anil S Keshavamurthy Date: Thu, 23 Jun 2005 00:09:33 -0700 Subject: [PATCH 129/199] [PATCH] Kprobes IA64: cmp ctype unc support The current Kprobes when patching the original instruction with the break instruction tries to retain the original qualifying predicate(qp), however for cmp.crel.ctype where ctype == unc, which is a special instruction always needs to be executed irrespective of qp. Hence, if the instruction we are patching is of this type, then we should not copy the original qp to the break instruction, this is because we always want the break fault to happen so that we can emulate the instruction. This patch is based on the feedback given by David Mosberger Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/kprobes.c | 43 ++++++++++++++++++++++++++++++++++++-- include/asm-ia64/kprobes.h | 17 +++++++++++++++ 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 5d5344681555..51bc08bd0466 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -119,6 +119,41 @@ static void update_kprobe_inst_flag(uint template, uint slot, uint major_opcode return; } +/* + * In this function we check to see if the instruction + * (qp) cmpx.crel.ctype p1,p2=r2,r3 + * on which we are inserting kprobe is cmp instruction + * with ctype as unc. + */ +static uint is_cmp_ctype_unc_inst(uint template, uint slot, uint major_opcode, +unsigned long kprobe_inst) +{ + cmp_inst_t cmp_inst; + uint ctype_unc = 0; + + if (!((bundle_encoding[template][slot] == I) || + (bundle_encoding[template][slot] == M))) + goto out; + + if (!((major_opcode == 0xC) || (major_opcode == 0xD) || + (major_opcode == 0xE))) + goto out; + + cmp_inst.l = kprobe_inst; + if ((cmp_inst.f.x2 == 0) || (cmp_inst.f.x2 == 1)) { + /* Integere compare - Register Register (A6 type)*/ + if ((cmp_inst.f.tb == 0) && (cmp_inst.f.ta == 0) + &&(cmp_inst.f.c == 1)) + ctype_unc = 1; + } else if ((cmp_inst.f.x2 == 2)||(cmp_inst.f.x2 == 3)) { + /* Integere compare - Immediate Register (A8 type)*/ + if ((cmp_inst.f.ta == 0) &&(cmp_inst.f.c == 1)) + ctype_unc = 1; + } +out: + return ctype_unc; +} + /* * In this function we override the bundle with * the break instruction at the given slot. 
@@ -131,9 +166,13 @@ static void prepare_break_inst(uint template, uint slot, uint major_opcode, /* * Copy the original kprobe_inst qualifying predicate(qp) - * to the break instruction + * to the break instruction iff !is_cmp_ctype_unc_inst + * because for cmp instruction with ctype equal to unc, + * which is a special instruction always needs to be + * executed regradless of qp */ - break_inst |= (0x3f & kprobe_inst); + if (!is_cmp_ctype_unc_inst(template, slot, major_opcode, kprobe_inst)) + break_inst |= (0x3f & kprobe_inst); switch (slot) { case 0: diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h index cec4d9958307..7b700035e36d 100644 --- a/include/asm-ia64/kprobes.h +++ b/include/asm-ia64/kprobes.h @@ -30,6 +30,23 @@ #define BREAK_INST (long)(__IA64_BREAK_KPROBE << 6) +typedef union cmp_inst { + struct { + unsigned long long qp : 6; + unsigned long long p1 : 6; + unsigned long long c : 1; + unsigned long long r2 : 7; + unsigned long long r3 : 7; + unsigned long long p2 : 6; + unsigned long long ta : 1; + unsigned long long x2 : 2; + unsigned long long tb : 1; + unsigned long long opcode : 4; + unsigned long long reserved : 23; + }f; + unsigned long long l; +} cmp_inst_t; + struct kprobe; typedef struct _bundle { From 708de8f11c2901cc49fd7725baf4a0fbd7264e73 Mon Sep 17 00:00:00 2001 From: Anil S Keshavamurthy Date: Thu, 23 Jun 2005 00:09:34 -0700 Subject: [PATCH 130/199] [PATCH] Kprobes IA64: safe register kprobe The current kprobes does not yet handle register kprobes on some of the following kind of instruction which needs to be emulated in a special way. 1) mov r1=ip 2) chk -- Speculation check instruction This patch attempts to fail register_kprobes() when user tries to insert kprobes on the above kind of instruction. Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/kprobes.c | 45 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 51bc08bd0466..027d656664d2 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -119,6 +119,48 @@ static void update_kprobe_inst_flag(uint template, uint slot, uint major_opcode return; } +/* + * In this function we check to see if the instruction + * on which we are inserting kprobe is supported. 
+ * Returns 0 if supported + * Returns -EINVAL if unsupported + */ +static int unsupported_inst(uint template, uint slot, uint major_opcode, + unsigned long kprobe_inst, struct kprobe *p) +{ + unsigned long addr = (unsigned long)p->addr; + + if (bundle_encoding[template][slot] == I) { + switch (major_opcode) { + case 0x0: //I_UNIT_MISC_OPCODE: + /* + * Check for Integer speculation instruction + * - Bit 33-35 to be equal to 0x1 + */ + if (((kprobe_inst >> 33) & 0x7) == 1) { + printk(KERN_WARNING + "Kprobes on speculation inst at <0x%lx> not supported\n", + addr); + return -EINVAL; + } + + /* + * IP relative mov instruction + * - Bit 27-35 to be equal to 0x30 + */ + if (((kprobe_inst >> 27) & 0x1FF) == 0x30) { + printk(KERN_WARNING + "Kprobes on \"mov r1=ip\" at <0x%lx> not supported\n", + addr); + return -EINVAL; + + } + } + } + return 0; +} + + /* * In this function we check to see if the instruction * (qp) cmpx.crel.ctype p1,p2=r2,r3 @@ -254,6 +296,9 @@ int arch_prepare_kprobe(struct kprobe *p) /* Get kprobe_inst and major_opcode from the bundle */ get_kprobe_inst(bundle, slot, &kprobe_inst, &major_opcode); + if (unsupported_inst(template, slot, major_opcode, kprobe_inst, p)) + return -EINVAL; + prepare_break_inst(template, slot, major_opcode, kprobe_inst, p); return 0; From 89cb14c0dd0e4a7d0315d19f449389c4d49237ee Mon Sep 17 00:00:00 2001 From: Keshavamurthy Anil S Date: Thu, 23 Jun 2005 00:09:35 -0700 Subject: [PATCH 131/199] [PATCH] Kprobes/IA64: check jprobe break before handling Once the jprobe-instrumented function returns, it executes jprobe_break(), a break instruction carrying the __IA64_BREAK_JPROBE value. This patch checks for that break value before assuming the jprobe-instrumented function has just completed; the previous code did not check it, which was a bug.
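A minimal sketch of the resulting dispatch, assuming invented break immediates (the real reserved values are defined in the kernel headers and are not reproduced here):

    #include <stdio.h>

    #define FAKE_BREAK_JPROBE 0x80300UL  /* invented stand-in immediate */

    /*
     * Classify a break fault the way the fixed handler does: a probed
     * address wins outright; otherwise only the jprobe immediate means
     * "the instrumented function returned"; anything else is not ours.
     */
    static const char *classify(int addr_has_probe, unsigned long err)
    {
            if (addr_has_probe)
                    return "kprobe hit: run pre_handler";
            if (err == FAKE_BREAK_JPROBE)
                    return "jprobe returned: run break_handler";
            return "not our break: pass it along";
    }

    int main(void)
    {
            printf("%s\n", classify(1, 0));
            printf("%s\n", classify(0, FAKE_BREAK_JPROBE));
            printf("%s\n", classify(0, 0x12345UL));
            return 0;
    }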
Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/kprobes.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 027d656664d2..41e80b42d3f3 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -419,10 +419,11 @@ static void prepare_ss(struct kprobe *p, struct pt_regs *regs) ia64_psr(regs)->ss = 1; } -static int pre_kprobes_handler(struct pt_regs *regs) +static int pre_kprobes_handler(struct die_args *args) { struct kprobe *p; int ret = 0; + struct pt_regs *regs = args->regs; kprobe_opcode_t *addr = (kprobe_opcode_t *)instruction_pointer(regs); preempt_disable(); @@ -437,7 +438,7 @@ static int pre_kprobes_handler(struct pt_regs *regs) } arch_disarm_kprobe(p); ret = 1; - } else { + } else if (args->err == __IA64_BREAK_JPROBE) { /* * jprobe instrumented function just completed */ @@ -445,6 +446,9 @@ static int pre_kprobes_handler(struct pt_regs *regs) if (p->break_handler && p->break_handler(p, regs)) { goto ss_probe; } + } else { + /* Not our break */ + goto no_kprobe; } } @@ -515,7 +519,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, struct die_args *args = (struct die_args *)data; switch(val) { case DIE_BREAK: - if (pre_kprobes_handler(args->regs)) + if (pre_kprobes_handler(args)) return NOTIFY_STOP; break; case DIE_SS: From ea32c65cc2d2294c04e9f81d0578a6f51febfdbf Mon Sep 17 00:00:00 2001 From: Prasanna S Panchamukhi Date: Thu, 23 Jun 2005 00:09:36 -0700 Subject: [PATCH 132/199] [PATCH] kprobes: Temporary disarming of reentrant probe In situations where a kprobes handler calls a routine that itself has a probe on it, kprobes_handler() disarms the new probe forever. This patch removes that limitation by temporarily disarming the new probe: when another probe hits while the old probe is being handled, kprobes_handler() saves the previous kprobes state and handles the new probe without calling its registered handlers, and kprobe_post_handler() then restores the previous kprobes state so that normal execution continues. However, on the x86_64 architecture re-entrancy is provided only through pre_handler(). If a routine that has a probe on it is referenced through post_handler(), the probes on that routine are disarmed forever, since the exception stack gets changed after the processor single-steps the instruction of the new probe. This patch includes the generic changes to support temporary disarming on reentrancy of probes.
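The save/restore protocol is deliberately only one level deep, since a single reentrant hit is all that is handled. A stand-alone sketch of the bookkeeping (user-space C with invented types; the real per-architecture state also carries saved flags, MSR, or trap-state bits):

    #include <assert.h>

    /* Invented stand-in for the per-CPU probe bookkeeping. */
    struct probe_state { const char *name; unsigned long nmissed; };

    static struct probe_state *current_probe, *probe_prev;

    static void save_previous_probe(void)    { probe_prev = current_probe; }
    static void restore_previous_probe(void) { current_probe = probe_prev; }

    int main(void)
    {
            struct probe_state outer = { "outer", 0 };
            struct probe_state inner = { "inner", 0 };

            current_probe = &outer;     /* first break hit */

            save_previous_probe();      /* reentrant hit: stash outer state */
            current_probe = &inner;
            inner.nmissed++;            /* handlers skipped, miss recorded */

            restore_previous_probe();   /* inner single step completed */
            assert(current_probe == &outer);   /* outer handling resumes */
            return 0;
    }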
Signed-of-by: Prasanna S Panchamukhi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 9 +++++++++ kernel/kprobes.c | 1 + 2 files changed, 10 insertions(+) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 461391decc46..5e1a7b0d7b3f 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -36,6 +36,12 @@ #include +/* kprobe_status settings */ +#define KPROBE_HIT_ACTIVE 0x00000001 +#define KPROBE_HIT_SS 0x00000002 +#define KPROBE_REENTER 0x00000004 +#define KPROBE_HIT_SSDONE 0x00000008 + struct kprobe; struct pt_regs; struct kretprobe; @@ -55,6 +61,9 @@ struct kprobe { /* list of kprobes for multi-handler support */ struct list_head list; + /*count the number of times this probe was temporarily disarmed */ + unsigned long nmissed; + /* location of the probe point */ kprobe_opcode_t *addr; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index dd42e717dd35..456ecedff2d4 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -335,6 +335,7 @@ int register_kprobe(struct kprobe *p) } spin_lock_irqsave(&kprobe_lock, flags); old_p = get_kprobe(p->addr); + p->nmissed = 0; if (old_p) { ret = register_aggr_kprobe(old_p, p); goto out; From 417c8da6511b54843e6b557d94d8112d80e32156 Mon Sep 17 00:00:00 2001 From: Prasanna S Panchamukhi Date: Thu, 23 Jun 2005 00:09:37 -0700 Subject: [PATCH 133/199] [PATCH] kprobes: Temporary disarming of reentrant probe for i386 This patch includes i386 architecture specific changes to support temporary disarming on reentrancy of probes. Signed-of-by: Prasanna S Panchamukhi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/kprobes.c | 62 ++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index b8e2bae0ab4f..3762f6b35ab2 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -37,12 +37,10 @@ #include #include -/* kprobe_status settings */ -#define KPROBE_HIT_ACTIVE 0x00000001 -#define KPROBE_HIT_SS 0x00000002 - static struct kprobe *current_kprobe; static unsigned long kprobe_status, kprobe_old_eflags, kprobe_saved_eflags; +static struct kprobe *kprobe_prev; +static unsigned long kprobe_status_prev, kprobe_old_eflags_prev, kprobe_saved_eflags_prev; static struct pt_regs jprobe_saved_regs; static long *jprobe_saved_esp; /* copy of the kernel stack at the probe fire time */ @@ -93,6 +91,31 @@ void arch_remove_kprobe(struct kprobe *p) { } +static inline void save_previous_kprobe(void) +{ + kprobe_prev = current_kprobe; + kprobe_status_prev = kprobe_status; + kprobe_old_eflags_prev = kprobe_old_eflags; + kprobe_saved_eflags_prev = kprobe_saved_eflags; +} + +static inline void restore_previous_kprobe(void) +{ + current_kprobe = kprobe_prev; + kprobe_status = kprobe_status_prev; + kprobe_old_eflags = kprobe_old_eflags_prev; + kprobe_saved_eflags = kprobe_saved_eflags_prev; +} + +static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs) +{ + current_kprobe = p; + kprobe_saved_eflags = kprobe_old_eflags + = (regs->eflags & (TF_MASK | IF_MASK)); + if (is_IF_modifier(p->opcode)) + kprobe_saved_eflags &= ~IF_MASK; +} + static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) { regs->eflags |= TF_MASK; @@ -184,9 +207,18 @@ static int kprobe_handler(struct pt_regs *regs) unlock_kprobes(); goto no_kprobe; } - arch_disarm_kprobe(p); - regs->eip = (unsigned long)p->addr; - ret = 1; + /* We have reentered the 
kprobe_handler(), since + * another probe was hit while within the handler. + * We here save the original kprobes variables and + * just single step on the instruction of the new probe + * without calling any user handlers. + */ + save_previous_kprobe(); + set_current_kprobe(p, regs); + p->nmissed++; + prepare_singlestep(p, regs); + kprobe_status = KPROBE_REENTER; + return 1; } else { p = current_kprobe; if (p->break_handler && p->break_handler(p, regs)) { @@ -221,11 +253,7 @@ static int kprobe_handler(struct pt_regs *regs) } kprobe_status = KPROBE_HIT_ACTIVE; - current_kprobe = p; - kprobe_saved_eflags = kprobe_old_eflags - = (regs->eflags & (TF_MASK | IF_MASK)); - if (is_IF_modifier(p->opcode)) - kprobe_saved_eflags &= ~IF_MASK; + set_current_kprobe(p, regs); if (p->pre_handler && p->pre_handler(p, regs)) /* handler has already set things up, so skip ss setup */ @@ -370,14 +398,22 @@ static inline int post_kprobe_handler(struct pt_regs *regs) if (!kprobe_running()) return 0; - if (current_kprobe->post_handler) + if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) { + kprobe_status = KPROBE_HIT_SSDONE; current_kprobe->post_handler(current_kprobe, regs, 0); + } if (current_kprobe->post_handler != trampoline_post_handler) resume_execution(current_kprobe, regs); regs->eflags |= kprobe_saved_eflags; + /*Restore back the original saved kprobes variables and continue. */ + if (kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(); + goto out; + } unlock_kprobes(); +out: preempt_enable_no_resched(); /* From aa3d7e3d782bbcf567b1da6101d8fbff7cf7e7a6 Mon Sep 17 00:00:00 2001 From: Prasanna S Panchamukhi Date: Thu, 23 Jun 2005 00:09:37 -0700 Subject: [PATCH 134/199] [PATCH] kprobes: Temporary disarming of reentrant probe for x86_64 This patch includes x86_64 architecture specific changes to support temporary disarming on reentrancy of probes. 
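The flags bookkeeping that this family of patches factors into set_current_kprobe() can also be sketched on its own. The TF/IF bit positions below match x86 EFLAGS bits 8 and 9, but the helper names and the harness are invented for illustration:

    #include <assert.h>

    #define TF_BIT 0x100UL   /* trap flag, EFLAGS bit 8 */
    #define IF_BIT 0x200UL   /* interrupt flag, EFLAGS bit 9 */

    static unsigned long saved_flags;

    /*
     * On a probe hit, remember the original TF/IF bits and arm single
     * stepping. If the probed opcode itself modifies IF, do not
     * restore IF behind its back afterwards.
     */
    static void set_probe(unsigned long *eflags, int is_if_modifier)
    {
            saved_flags = *eflags & (TF_BIT | IF_BIT);
            if (is_if_modifier)
                    saved_flags &= ~IF_BIT;
            *eflags |= TF_BIT;          /* single step the copy */
    }

    int main(void)
    {
            unsigned long eflags = IF_BIT;   /* interrupts on, no TF */

            set_probe(&eflags, 0);
            assert(eflags & TF_BIT);         /* stepping armed */

            /* post-handler path: drop TF, restore the saved bits */
            eflags = (eflags & ~TF_BIT) | saved_flags;
            assert(eflags == IF_BIT);        /* original state back */
            return 0;
    }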
Signed-of-by: Prasanna S Panchamukhi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/kprobes.c | 76 +++++++++++++++++++++++++++++------- 1 file changed, 62 insertions(+), 14 deletions(-) diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index 324bf57925a9..4e680f87a75f 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -45,12 +45,10 @@ static DECLARE_MUTEX(kprobe_mutex); -/* kprobe_status settings */ -#define KPROBE_HIT_ACTIVE 0x00000001 -#define KPROBE_HIT_SS 0x00000002 - static struct kprobe *current_kprobe; static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags; +static struct kprobe *kprobe_prev; +static unsigned long kprobe_status_prev, kprobe_old_rflags_prev, kprobe_saved_rflags_prev; static struct pt_regs jprobe_saved_regs; static long *jprobe_saved_rsp; static kprobe_opcode_t *get_insn_slot(void); @@ -240,6 +238,31 @@ void arch_remove_kprobe(struct kprobe *p) down(&kprobe_mutex); } +static inline void save_previous_kprobe(void) +{ + kprobe_prev = current_kprobe; + kprobe_status_prev = kprobe_status; + kprobe_old_rflags_prev = kprobe_old_rflags; + kprobe_saved_rflags_prev = kprobe_saved_rflags; +} + +static inline void restore_previous_kprobe(void) +{ + current_kprobe = kprobe_prev; + kprobe_status = kprobe_status_prev; + kprobe_old_rflags = kprobe_old_rflags_prev; + kprobe_saved_rflags = kprobe_saved_rflags_prev; +} + +static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs) +{ + current_kprobe = p; + kprobe_saved_rflags = kprobe_old_rflags + = (regs->eflags & (TF_MASK | IF_MASK)); + if (is_IF_modifier(p->ainsn.insn)) + kprobe_saved_rflags &= ~IF_MASK; +} + static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) { regs->eflags |= TF_MASK; @@ -319,10 +342,30 @@ int kprobe_handler(struct pt_regs *regs) regs->eflags |= kprobe_saved_rflags; unlock_kprobes(); goto no_kprobe; + } else if (kprobe_status == KPROBE_HIT_SSDONE) { + /* TODO: Provide re-entrancy from + * post_kprobes_handler() and avoid exception + * stack corruption while single-stepping on + * the instruction of the new probe. + */ + arch_disarm_kprobe(p); + regs->rip = (unsigned long)p->addr; + ret = 1; + } else { + /* We have reentered the kprobe_handler(), since + * another probe was hit while within the + * handler. We here save the original kprobe + * variables and just single step on instruction + * of the new probe without calling any user + * handlers. 
+ */ + save_previous_kprobe(); + set_current_kprobe(p, regs); + p->nmissed++; + prepare_singlestep(p, regs); + kprobe_status = KPROBE_REENTER; + return 1; } - arch_disarm_kprobe(p); - regs->rip = (unsigned long)p->addr; - ret = 1; } else { p = current_kprobe; if (p->break_handler && p->break_handler(p, regs)) { @@ -352,11 +395,7 @@ int kprobe_handler(struct pt_regs *regs) } kprobe_status = KPROBE_HIT_ACTIVE; - current_kprobe = p; - kprobe_saved_rflags = kprobe_old_rflags - = (regs->eflags & (TF_MASK | IF_MASK)); - if (is_IF_modifier(p->ainsn.insn)) - kprobe_saved_rflags &= ~IF_MASK; + set_current_kprobe(p, regs); if (p->pre_handler && p->pre_handler(p, regs)) /* handler has already set things up, so skip ss setup */ @@ -506,14 +545,23 @@ int post_kprobe_handler(struct pt_regs *regs) if (!kprobe_running()) return 0; - if (current_kprobe->post_handler) + if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) { + kprobe_status = KPROBE_HIT_SSDONE; current_kprobe->post_handler(current_kprobe, regs, 0); + } if (current_kprobe->post_handler != trampoline_post_handler) resume_execution(current_kprobe, regs); regs->eflags |= kprobe_saved_rflags; - unlock_kprobes(); + /* Restore the original saved kprobes variables and continue. */ + if (kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(); + goto out; + } else { + unlock_kprobes(); + } +out: preempt_enable_no_resched(); /* From 42cc20600a3450efcf1f724fa3891ffbff4fcb2b Mon Sep 17 00:00:00 2001 From: Prasanna S Panchamukhi Date: Thu, 23 Jun 2005 00:09:38 -0700 Subject: [PATCH 135/199] [PATCH] kprobes: Temporary disarming of reentrant probe for ppc64 This patch includes ppc64 architecture specific changes to support temporary disarming on reentrancy of probes. Signed-of-by: Prasanna S Panchamukhi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc64/kernel/kprobes.c | 46 ++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/arch/ppc64/kernel/kprobes.c b/arch/ppc64/kernel/kprobes.c index 8c0920a6d03e..782ce3efa2c1 100644 --- a/arch/ppc64/kernel/kprobes.c +++ b/arch/ppc64/kernel/kprobes.c @@ -36,12 +36,10 @@ #include #include -/* kprobe_status settings */ -#define KPROBE_HIT_ACTIVE 0x00000001 -#define KPROBE_HIT_SS 0x00000002 - static struct kprobe *current_kprobe; static unsigned long kprobe_status, kprobe_saved_msr; +static struct kprobe *kprobe_prev; +static unsigned long kprobe_status_prev, kprobe_saved_msr_prev; static struct pt_regs jprobe_saved_regs; int arch_prepare_kprobe(struct kprobe *p) @@ -93,6 +91,20 @@ static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->nip = (unsigned long)&p->ainsn.insn; } +static inline void save_previous_kprobe(void) +{ + kprobe_prev = current_kprobe; + kprobe_status_prev = kprobe_status; + kprobe_saved_msr_prev = kprobe_saved_msr; +} + +static inline void restore_previous_kprobe(void) +{ + current_kprobe = kprobe_prev; + kprobe_status = kprobe_status_prev; + kprobe_saved_msr = kprobe_saved_msr_prev; +} + static inline int kprobe_handler(struct pt_regs *regs) { struct kprobe *p; @@ -111,9 +123,19 @@ static inline int kprobe_handler(struct pt_regs *regs) unlock_kprobes(); goto no_kprobe; } - arch_disarm_kprobe(p); - regs->nip = (unsigned long)p->addr; - ret = 1; + /* We have reentered the kprobe_handler(), since + * another probe was hit while within the handler. 
+ * We here save the original kprobes variables and + * just single step on the instruction of the new probe + * without calling any user handlers. + */ + save_previous_kprobe(); + current_kprobe = p; + kprobe_saved_msr = regs->msr; + p->nmissed++; + prepare_singlestep(p, regs); + kprobe_status = KPROBE_REENTER; + return 1; } else { p = current_kprobe; if (p->break_handler && p->break_handler(p, regs)) { @@ -195,13 +217,21 @@ static inline int post_kprobe_handler(struct pt_regs *regs) if (!kprobe_running()) return 0; - if (current_kprobe->post_handler) + if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) { + kprobe_status = KPROBE_HIT_SSDONE; current_kprobe->post_handler(current_kprobe, regs, 0); + } resume_execution(current_kprobe, regs); regs->msr |= kprobe_saved_msr; + /*Restore back the original saved kprobes variables and continue. */ + if (kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(); + goto out; + } unlock_kprobes(); +out: preempt_enable_no_resched(); /* From e539c2331414e73a5a1b79fb57369d79447c1cf8 Mon Sep 17 00:00:00 2001 From: Prasanna S Panchamukhi Date: Thu, 23 Jun 2005 00:09:39 -0700 Subject: [PATCH 136/199] [PATCH] kprobes: Temporary disarming of reentrant probe for sparc64 This patch includes sparc64 architecture specific changes to support temporary disarming on reentrancy of probes. Signed-of-by: Prasanna S Panchamukhi Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc64/kernel/kprobes.c | 62 +++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/arch/sparc64/kernel/kprobes.c b/arch/sparc64/kernel/kprobes.c index d67195ba3fa2..bdac631cf011 100644 --- a/arch/sparc64/kernel/kprobes.c +++ b/arch/sparc64/kernel/kprobes.c @@ -65,19 +65,40 @@ void arch_remove_kprobe(struct kprobe *p) { } -/* kprobe_status settings */ -#define KPROBE_HIT_ACTIVE 0x00000001 -#define KPROBE_HIT_SS 0x00000002 - static struct kprobe *current_kprobe; static unsigned long current_kprobe_orig_tnpc; static unsigned long current_kprobe_orig_tstate_pil; static unsigned int kprobe_status; +static struct kprobe *kprobe_prev; +static unsigned long kprobe_orig_tnpc_prev; +static unsigned long kprobe_orig_tstate_pil_prev; +static unsigned int kprobe_status_prev; -static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) +static inline void save_previous_kprobe(void) +{ + kprobe_status_prev = kprobe_status; + kprobe_orig_tnpc_prev = current_kprobe_orig_tnpc; + kprobe_orig_tstate_pil_prev = current_kprobe_orig_tstate_pil; + kprobe_prev = current_kprobe; +} + +static inline void restore_previous_kprobe(void) +{ + kprobe_status = kprobe_status_prev; + current_kprobe_orig_tnpc = kprobe_orig_tnpc_prev; + current_kprobe_orig_tstate_pil = kprobe_orig_tstate_pil_prev; + current_kprobe = kprobe_prev; +} + +static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs) { current_kprobe_orig_tnpc = regs->tnpc; current_kprobe_orig_tstate_pil = (regs->tstate & TSTATE_PIL); + current_kprobe = p; +} + +static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) +{ regs->tstate |= TSTATE_PIL; /*single step inline, if it a breakpoint instruction*/ @@ -110,12 +131,18 @@ static int kprobe_handler(struct pt_regs *regs) unlock_kprobes(); goto no_kprobe; } - arch_disarm_kprobe(p); - regs->tpc = (unsigned long) p->addr; - regs->tnpc = current_kprobe_orig_tnpc; - regs->tstate = ((regs->tstate & ~TSTATE_PIL) | - current_kprobe_orig_tstate_pil); - ret = 1; + /* 
We have reentered the kprobe_handler(), since + * another probe was hit while within the handler. + * We here save the original kprobes variables and + * just single step on the instruction of the new probe + * without calling any user handlers. + */ + save_previous_kprobe(); + set_current_kprobe(p, regs); + p->nmissed++; + kprobe_status = KPROBE_REENTER; + prepare_singlestep(p, regs); + return 1; } else { p = current_kprobe; if (p->break_handler && p->break_handler(p, regs)) @@ -143,8 +170,8 @@ static int kprobe_handler(struct pt_regs *regs) goto no_kprobe; } + set_current_kprobe(p, regs); kprobe_status = KPROBE_HIT_ACTIVE; - current_kprobe = p; if (p->pre_handler && p->pre_handler(p, regs)) return 1; @@ -250,12 +277,20 @@ static inline int post_kprobe_handler(struct pt_regs *regs) if (!kprobe_running()) return 0; - if (current_kprobe->post_handler) + if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) { + kprobe_status = KPROBE_HIT_SSDONE; current_kprobe->post_handler(current_kprobe, regs, 0); + } resume_execution(current_kprobe, regs); + /*Restore back the original saved kprobes variables and continue. */ + if (kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(); + goto out; + } unlock_kprobes(); +out: preempt_enable_no_resched(); return 1; @@ -397,3 +432,4 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) } return 0; } + From 852caccc89d3883522e87a91bfa89fd9c9cfe15a Mon Sep 17 00:00:00 2001 From: Anil S Keshavamurthy Date: Thu, 23 Jun 2005 00:09:40 -0700 Subject: [PATCH 137/199] [PATCH] Kprobes/ia64: temporary disarming of reentrant probe This patch includes IA64 architecture specific changes(ported form i386) to support temporary disarming on reentrancy of probes. In case of reentrancy we single step without calling user handler. Signed-of-by: Anil S Keshavamurth Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/kprobes.c | 49 +++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 41e80b42d3f3..5978823d5c63 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -41,8 +41,8 @@ extern void jprobe_inst_return(void); #define KPROBE_HIT_ACTIVE 0x00000001 #define KPROBE_HIT_SS 0x00000002 -static struct kprobe *current_kprobe; -static unsigned long kprobe_status; +static struct kprobe *current_kprobe, *kprobe_prev; +static unsigned long kprobe_status, kprobe_status_prev; static struct pt_regs jprobe_saved_regs; enum instruction_type {A, I, M, F, B, L, X, u}; @@ -273,6 +273,23 @@ static int valid_kprobe_addr(int template, int slot, unsigned long addr) return 0; } +static inline void save_previous_kprobe(void) +{ + kprobe_prev = current_kprobe; + kprobe_status_prev = kprobe_status; +} + +static inline void restore_previous_kprobe(void) +{ + current_kprobe = kprobe_prev; + kprobe_status = kprobe_status_prev; +} + +static inline void set_current_kprobe(struct kprobe *p) +{ + current_kprobe = p; +} + int arch_prepare_kprobe(struct kprobe *p) { unsigned long addr = (unsigned long) p->addr; @@ -436,8 +453,18 @@ static int pre_kprobes_handler(struct die_args *args) unlock_kprobes(); goto no_kprobe; } - arch_disarm_kprobe(p); - ret = 1; + /* We have reentered the pre_kprobe_handler(), since + * another probe was hit while within the handler. + * We here save the original kprobes variables and + * just single step on the instruction of the new probe + * without calling any user handlers. 
+ */ + save_previous_kprobe(); + set_current_kprobe(p); + p->nmissed++; + prepare_ss(p, regs); + kprobe_status = KPROBE_REENTER; + return 1; } else if (args->err == __IA64_BREAK_JPROBE) { /* * jprobe instrumented function just completed @@ -460,7 +487,7 @@ static int pre_kprobes_handler(struct die_args *args) } kprobe_status = KPROBE_HIT_ACTIVE; - current_kprobe = p; + set_current_kprobe(p); if (p->pre_handler && p->pre_handler(p, regs)) /* @@ -485,12 +512,22 @@ static int post_kprobes_handler(struct pt_regs *regs) if (!kprobe_running()) return 0; - if (current_kprobe->post_handler) + if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) { + kprobe_status = KPROBE_HIT_SSDONE; current_kprobe->post_handler(current_kprobe, regs, 0); + } resume_execution(current_kprobe, regs); + /*Restore back the original saved kprobes variables and continue. */ + if (kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(); + goto out; + } + unlock_kprobes(); + +out: preempt_enable_no_resched(); return 1; } From 8b0914ea7475615c7c8965c1ac8fe4069270f25c Mon Sep 17 00:00:00 2001 From: Prasanna S Panchamukhi Date: Thu, 23 Jun 2005 00:09:41 -0700 Subject: [PATCH 138/199] [PATCH] jprobes: allow a jprobe to coexist with muliple kprobes Presently either multiple kprobes or only one jprobe could be inserted. This patch removes the above limitation and allows one jprobe and multiple kprobes to coexist at the same address. However multiple jprobes cannot coexist with multiple kprobes. Currently I am working on the prototype to allow multiple jprobes coexist with multiple kprobes. Signed-off-by: Ananth N Mavinakayanhalli Signed-off-by: Prasanna S Panchamukhi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 61 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 10 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 456ecedff2d4..334f37472c56 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -89,9 +89,10 @@ static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) list_for_each_entry(kp, &p->list, list) { if (kp->pre_handler) { curr_kprobe = kp; - kp->pre_handler(kp, regs); - curr_kprobe = NULL; + if (kp->pre_handler(kp, regs)) + return 1; } + curr_kprobe = NULL; } return 0; } @@ -125,6 +126,19 @@ static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, return 0; } +static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe *kp = curr_kprobe; + if (curr_kprobe && kp->break_handler) { + if (kp->break_handler(kp, regs)) { + curr_kprobe = NULL; + return 1; + } + } + curr_kprobe = NULL; + return 0; +} + struct kprobe trampoline_p = { .addr = (kprobe_opcode_t *) &kretprobe_trampoline, .pre_handler = trampoline_probe_handler, @@ -257,19 +271,46 @@ static inline void free_rp_inst(struct kretprobe *rp) } } +/* + * Keep all fields in the kprobe consistent + */ +static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) +{ + memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); + memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); +} + +/* +* Add the new probe to old_p->list. 
Fail if this is the +* second jprobe at the address - two jprobes can't coexist +*/ +static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p) +{ + struct kprobe *kp; + + if (p->break_handler) { + list_for_each_entry(kp, &old_p->list, list) { + if (kp->break_handler) + return -EEXIST; + } + list_add_tail(&p->list, &old_p->list); + } else + list_add(&p->list, &old_p->list); + return 0; +} + /* * Fill in the required fields of the "manager kprobe". Replace the * earlier kprobe in the hlist with the manager kprobe */ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) { + copy_kprobe(p, ap); ap->addr = p->addr; - memcpy(&ap->opcode, &p->opcode, sizeof(kprobe_opcode_t)); - memcpy(&ap->ainsn, &p->ainsn, sizeof(struct arch_specific_insn)); - ap->pre_handler = aggr_pre_handler; ap->post_handler = aggr_post_handler; ap->fault_handler = aggr_fault_handler; + ap->break_handler = aggr_break_handler; INIT_LIST_HEAD(&ap->list); list_add(&p->list, &ap->list); @@ -290,16 +331,16 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) int ret = 0; struct kprobe *ap; - if (old_p->break_handler || p->break_handler) { - ret = -EEXIST; /* kprobe and jprobe can't (yet) coexist */ - } else if (old_p->pre_handler == aggr_pre_handler) { - list_add(&p->list, &old_p->list); + if (old_p->pre_handler == aggr_pre_handler) { + copy_kprobe(old_p, p); + ret = add_new_kprobe(old_p, p); } else { ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC); if (!ap) return -ENOMEM; add_aggr_kprobe(ap, old_p); - list_add(&p->list, &ap->list); + copy_kprobe(ap, p); + ret = add_new_kprobe(ap, p); } return ret; } From d6e711448137ca3301512cec41a2c2ce852b3d0a Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 23 Jun 2005 00:09:43 -0700 Subject: [PATCH 139/199] [PATCH] setuid core dump Add a new `suid_dumpable' sysctl: This value can be used to query and set the core dump mode for setuid or otherwise protected/tainted binaries. The modes are 0 - (default) - traditional behaviour. Any process which has changed privilege levels or is execute only will not be dumped 1 - (debug) - all processes dump core when possible. The core dump is owned by the current user and no security is applied. This is intended for system debugging situations only. Ptrace is unchecked. 2 - (suidsafe) - any binary which normally would not be dumped is dumped readable by root only. This allows the end user to remove such a dump but not access it directly. For security reasons core dumps in this mode will not overwrite one another or other files. This mode is appropriate when adminstrators are attempting to debug problems in a normal environment. (akpm: > > +EXPORT_SYMBOL(suid_dumpable); > > EXPORT_SYMBOL_GPL? No problem to me. > > if (current->euid == current->uid && current->egid == current->gid) > > current->mm->dumpable = 1; > > Should this be SUID_DUMP_USER? Actually the feedback I had from last time was that the SUID_ defines should go because its clearer to follow the numbers. They can go everywhere (and there are lots of places where dumpable is tested/used as a bool in untouched code) > Maybe this should be renamed to `dump_policy' or something. Doing that > would help us catch any code which isn't using the #defines, too. Fair comment. The patch was designed to be easy to maintain for Red Hat rather than for merging. Changing that field would create a gigantic diff because it is used all over the place. 
) Signed-off-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 20 ++++++++++++++++++++ fs/exec.c | 23 +++++++++++++++++++++-- fs/proc/base.c | 6 ++++-- include/linux/binfmts.h | 5 +++++ include/linux/sched.h | 2 +- include/linux/sysctl.h | 1 + kernel/sys.c | 22 +++++++++++----------- kernel/sysctl.c | 9 +++++++++ security/commoncap.c | 2 +- security/dummy.c | 2 +- 10 files changed, 74 insertions(+), 18 deletions(-) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 35159176997b..9f11d36a8c10 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -49,6 +49,7 @@ show up in /proc/sys/kernel: - shmmax [ sysv ipc ] - shmmni - stop-a [ SPARC only ] +- suid_dumpable - sysrq ==> Documentation/sysrq.txt - tainted - threads-max @@ -300,6 +301,25 @@ kernel. This value defaults to SHMMAX. ============================================================== +suid_dumpable: + +This value can be used to query and set the core dump mode for setuid +or otherwise protected/tainted binaries. The modes are + +0 - (default) - traditional behaviour. Any process which has changed + privilege levels or is execute only will not be dumped +1 - (debug) - all processes dump core when possible. The core dump is + owned by the current user and no security is applied. This is + intended for system debugging situations only. Ptrace is unchecked. +2 - (suidsafe) - any binary which normally would not be dumped is dumped + readable by root only. This allows the end user to remove + such a dump but not access it directly. For security reasons + core dumps in this mode will not overwrite one another or + other files. This mode is appropriate when adminstrators are + attempting to debug problems in a normal environment. + +============================================================== + tainted: Non-zero if the kernel has been tainted. Numeric values, which diff --git a/fs/exec.c b/fs/exec.c index 3a4b35a14c0d..48871917d363 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -58,6 +58,9 @@ int core_uses_pid; char core_pattern[65] = "core"; +int suid_dumpable = 0; + +EXPORT_SYMBOL(suid_dumpable); /* The maximal length of core_pattern is also specified in sysctl.c */ static struct linux_binfmt *formats; @@ -864,6 +867,9 @@ int flush_old_exec(struct linux_binprm * bprm) if (current->euid == current->uid && current->egid == current->gid) current->mm->dumpable = 1; + else + current->mm->dumpable = suid_dumpable; + name = bprm->filename; /* Copies the binary name from after last slash */ @@ -884,7 +890,7 @@ int flush_old_exec(struct linux_binprm * bprm) permission(bprm->file->f_dentry->d_inode,MAY_READ, NULL) || (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) { suid_keys(current); - current->mm->dumpable = 0; + current->mm->dumpable = suid_dumpable; } /* An exec changes our domain. We are no longer part of the thread @@ -1432,6 +1438,8 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) struct inode * inode; struct file * file; int retval = 0; + int fsuid = current->fsuid; + int flag = 0; binfmt = current->binfmt; if (!binfmt || !binfmt->core_dump) @@ -1441,6 +1449,16 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) up_write(&mm->mmap_sem); goto fail; } + + /* + * We cannot trust fsuid as being the "true" uid of the + * process nor do we know its entire history. We only know it + * was tainted so we dump it as root in mode 2. 
+ */ + if (mm->dumpable == 2) { /* Setuid core dump mode */ + flag = O_EXCL; /* Stop rewrite attacks */ + current->fsuid = 0; /* Dump root private */ + } mm->dumpable = 0; init_completion(&mm->core_done); spin_lock_irq(¤t->sighand->siglock); @@ -1466,7 +1484,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) lock_kernel(); format_corename(corename, core_pattern, signr); unlock_kernel(); - file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE, 0600); + file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 0600); if (IS_ERR(file)) goto fail_unlock; inode = file->f_dentry->d_inode; @@ -1491,6 +1509,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) close_fail: filp_close(file, NULL); fail_unlock: + current->fsuid = fsuid; complete_all(&mm->core_done); fail: return retval; diff --git a/fs/proc/base.c b/fs/proc/base.c index e31903aadd96..ace151fa4878 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -314,7 +314,7 @@ static int may_ptrace_attach(struct task_struct *task) (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) goto out; rmb(); - if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) + if (task->mm->dumpable != 1 && !capable(CAP_SYS_PTRACE)) goto out; if (security_ptrace(current, task)) goto out; @@ -1113,7 +1113,9 @@ static int task_dumpable(struct task_struct *task) if (mm) dumpable = mm->dumpable; task_unlock(task); - return dumpable; + if(dumpable == 1) + return 1; + return 0; } diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 7e736e201c46..c1e82c514443 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -69,6 +69,11 @@ extern void remove_arg_zero(struct linux_binprm *); extern int search_binary_handler(struct linux_binprm *,struct pt_regs *); extern int flush_old_exec(struct linux_binprm * bprm); +extern int suid_dumpable; +#define SUID_DUMP_DISABLE 0 /* No setuid dumping */ +#define SUID_DUMP_USER 1 /* Dump as user of process */ +#define SUID_DUMP_ROOT 2 /* Dump as root */ + /* Stack area protections */ #define EXSTACK_DEFAULT 0 /* Whatever the arch defaults to */ #define EXSTACK_DISABLE_X 1 /* Disable executable stacks */ diff --git a/include/linux/sched.h b/include/linux/sched.h index b58afd97a180..901742f92389 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -246,7 +246,7 @@ struct mm_struct { unsigned long saved_auxv[42]; /* for /proc/PID/auxv */ - unsigned dumpable:1; + unsigned dumpable:2; cpumask_t cpu_vm_mask; /* Architecture-specific MM context */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index a17745c80a91..614e939c78a4 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -136,6 +136,7 @@ enum KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */ KERN_BOOTLOADER_TYPE=67, /* int: boot loader type */ KERN_RANDOMIZE=68, /* int: randomize virtual address space */ + KERN_SETUID_DUMPABLE=69, /* int: behaviour of dumps for setuid core */ }; diff --git a/kernel/sys.c b/kernel/sys.c index f006632c2ba7..0a2c8cda9638 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -525,7 +525,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) } if (new_egid != old_egid) { - current->mm->dumpable = 0; + current->mm->dumpable = suid_dumpable; smp_wmb(); } if (rgid != (gid_t) -1 || @@ -556,7 +556,7 @@ asmlinkage long sys_setgid(gid_t gid) { if(old_egid != gid) { - current->mm->dumpable=0; + current->mm->dumpable = suid_dumpable; smp_wmb(); } current->gid = current->egid = current->sgid = current->fsgid = gid; @@ -565,7 
+565,7 @@ asmlinkage long sys_setgid(gid_t gid) { if(old_egid != gid) { - current->mm->dumpable=0; + current->mm->dumpable = suid_dumpable; smp_wmb(); } current->egid = current->fsgid = gid; @@ -596,7 +596,7 @@ static int set_user(uid_t new_ruid, int dumpclear) if(dumpclear) { - current->mm->dumpable = 0; + current->mm->dumpable = suid_dumpable; smp_wmb(); } current->uid = new_ruid; @@ -653,7 +653,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) if (new_euid != old_euid) { - current->mm->dumpable=0; + current->mm->dumpable = suid_dumpable; smp_wmb(); } current->fsuid = current->euid = new_euid; @@ -703,7 +703,7 @@ asmlinkage long sys_setuid(uid_t uid) if (old_euid != uid) { - current->mm->dumpable = 0; + current->mm->dumpable = suid_dumpable; smp_wmb(); } current->fsuid = current->euid = uid; @@ -748,7 +748,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) if (euid != (uid_t) -1) { if (euid != current->euid) { - current->mm->dumpable = 0; + current->mm->dumpable = suid_dumpable; smp_wmb(); } current->euid = euid; @@ -798,7 +798,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) if (egid != (gid_t) -1) { if (egid != current->egid) { - current->mm->dumpable = 0; + current->mm->dumpable = suid_dumpable; smp_wmb(); } current->egid = egid; @@ -845,7 +845,7 @@ asmlinkage long sys_setfsuid(uid_t uid) { if (uid != old_fsuid) { - current->mm->dumpable = 0; + current->mm->dumpable = suid_dumpable; smp_wmb(); } current->fsuid = uid; @@ -875,7 +875,7 @@ asmlinkage long sys_setfsgid(gid_t gid) { if (gid != old_fsgid) { - current->mm->dumpable = 0; + current->mm->dumpable = suid_dumpable; smp_wmb(); } current->fsgid = gid; @@ -1652,7 +1652,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = 1; break; case PR_SET_DUMPABLE: - if (arg2 != 0 && arg2 != 1) { + if (arg2 < 0 || arg2 > 2) { error = -EINVAL; break; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 701d12c63068..24a4d12d5aa9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -58,6 +58,7 @@ extern int sysctl_overcommit_ratio; extern int max_threads; extern int sysrq_enabled; extern int core_uses_pid; +extern int suid_dumpable; extern char core_pattern[]; extern int cad_pid; extern int pid_max; @@ -950,6 +951,14 @@ static ctl_table fs_table[] = { .proc_handler = &proc_dointvec, }, #endif + { + .ctl_name = KERN_SETUID_DUMPABLE, + .procname = "suid_dumpable", + .data = &suid_dumpable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0 } }; diff --git a/security/commoncap.c b/security/commoncap.c index 849b8c338ee8..04c12f58d656 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -149,7 +149,7 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) if (bprm->e_uid != current->uid || bprm->e_gid != current->gid || !cap_issubset (new_permitted, current->cap_permitted)) { - current->mm->dumpable = 0; + current->mm->dumpable = suid_dumpable; if (unsafe & ~LSM_UNSAFE_PTRACE_CAP) { if (!capable(CAP_SETUID)) { diff --git a/security/dummy.c b/security/dummy.c index b32eff146547..6ff887586479 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -130,7 +130,7 @@ static void dummy_bprm_free_security (struct linux_binprm *bprm) static void dummy_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) { if (bprm->e_uid != current->uid || bprm->e_gid != current->gid) { - current->mm->dumpable = 0; + current->mm->dumpable = suid_dumpable; if ((unsafe & ~LSM_UNSAFE_PTRACE_CAP) && 
!capable(CAP_SETUID)) { bprm->e_uid = current->uid; From acfa1823d33859b0db77701726c9ca5ccc6e6f25 Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Thu, 23 Jun 2005 00:09:45 -0700 Subject: [PATCH 140/199] [PATCH] Support for dx directories in ext3_get_parent (NFSD) Henrik Grubbstrom noted: The 2.6.10 ext3_get_parent attempts to use ext3_find_entry to look up the entry "..", which fails for dx directories since ".." is not present in the directory hash table. The patch below solves this by looking up the dotdot entry in the dx_root block. Typical symptoms of the above bug are intermittent claims by nfsd that files or directories are missing on exported ext3 filesystems. cf https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=150759 and https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=144556 ext3_get_parent() is IMHO the wrong place to fix this bug as it introduces a lot of internals from htree into that function. Instead, I think this should be fixed in ext3_find_entry() as in the below patch. This has the added advantage that it works for any callers of ext3_find_entry() and not just ext3_get_parent(). Signed-off-by: Andreas Dilger Signed-off-by: Henrik Grubbstrom Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/namei.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 79742d824a0a..60e44e6dd7a6 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -932,8 +932,16 @@ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, struct inode *dir = dentry->d_parent->d_inode; sb = dir->i_sb; - if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) - return NULL; + /* NFS may look up ".." - look at dx_root directory block */ + if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ + if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) + return NULL; + } else { + frame = frames; + frame->bh = NULL; /* for dx_release() */ + frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ + dx_set_block(frame->at, 0); /* dx_root block is 0 */ + } hash = hinfo.hash; do { block = dx_get_block(frame->at); From 70f09f1fdf38cd7fca39913978d18cf998ab2c80 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 23 Jun 2005 00:09:47 -0700 Subject: [PATCH 141/199] [PATCH] Document the fact that linux-arm-kernel is subscribers-only. "Non-members are not allowed to post messages to this list. Blame the original poster for cross-posting to subscriber-only mailing lists. 
" Signed-off-by: Alexey Dobriyan Acked-by: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4d44824884ef..fc4b0e8bca6b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -304,7 +304,7 @@ S: Maintained ARM/PT DIGITAL BOARD PORT P: Stefan Eletzhofer M: stefan.eletzhofer@eletztrick.de -L: linux-arm-kernel@lists.arm.linux.org.uk +L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) W: http://www.arm.linux.org.uk/ S: Maintained @@ -317,21 +317,21 @@ S: Maintained ARM/STRONGARM110 PORT P: Russell King M: rmk@arm.linux.org.uk -L: linux-arm-kernel@lists.arm.linux.org.uk +L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) W: http://www.arm.linux.org.uk/ S: Maintained ARM/S3C2410 ARM ARCHITECTURE P: Ben Dooks M: ben-s3c2410@fluff.org -L: linux-arm-kernel@lists.arm.linux.org.uk +L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) W: http://www.fluff.org/ben/linux/ S: Maintained ARM/S3C2440 ARM ARCHITECTURE P: Ben Dooks M: ben-s3c2440@fluff.org -L: linux-arm-kernel@lists.arm.linux.org.uk +L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) W: http://www.fluff.org/ben/linux/ S: Maintained @@ -1853,7 +1853,7 @@ S: Maintained PXA2xx SUPPORT P: Nicolas Pitre M: nico@cam.org -L: linux-arm-kernel@lists.arm.linux.org.uk +L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) S: Maintained QLOGIC QLA2XXX FC-SCSI DRIVER @@ -2155,7 +2155,7 @@ SHARP LH SUPPORT (LH7952X & LH7A40X) P: Marc Singer M: elf@buici.com W: http://projects.buici.com/arm -L: linux-arm-kernel@lists.arm.linux.org.uk +L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) S: Maintained SPARC (sparc32): From c663e5d80ebec426916ad2aa5400c7ec99aa572e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 Jun 2005 00:09:49 -0700 Subject: [PATCH 142/199] [PATCH] add some comments to lookup_create() In a duplicate of lookup_create in the af_unix code Al commented what's going on nicely, so let's bring that over to lookup_create before the copy is going away (I'll send a patch soon) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index a7f7f44119b3..fa8df81ce8ca 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1577,19 +1577,35 @@ int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) * * Simple function to lookup and return a dentry and create it * if it doesn't exist. Is SMP-safe. + * + * Returns with nd->dentry->d_inode->i_sem locked. */ struct dentry *lookup_create(struct nameidata *nd, int is_dir) { - struct dentry *dentry; + struct dentry *dentry = ERR_PTR(-EEXIST); down(&nd->dentry->d_inode->i_sem); - dentry = ERR_PTR(-EEXIST); + /* + * Yucky last component or no last component at all? + * (foo/., foo/.., /////) + */ if (nd->last_type != LAST_NORM) goto fail; nd->flags &= ~LOOKUP_PARENT; + + /* + * Do the final lookup. + */ dentry = lookup_hash(&nd->last, nd->dentry); if (IS_ERR(dentry)) goto fail; + + /* + * Special case - lookup gave negative, but... we had foo/bar/ + * From the vfs_mknod() POV we just have a negative dentry - + * all is fine. Let's be bastards - you had / on the end, you've + * been asking for (non-existent) directory. -ENOENT for you. 
+ */ if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) goto enoent; return dentry; From af4d2ecbf007b7df3db7a41eedccdc05b8006d0b Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Thu, 23 Jun 2005 00:09:50 -0700 Subject: [PATCH 143/199] [PATCH] Fix of bogus file max limit messages This patch fixes incorrect and bogus kernel messages claiming that the file-max limit has been reached when the allocation fails. Signed-Off-By: Kirill Korotaev Signed-Off-By: Denis Lunev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file_table.c | 57 ++++++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index 03d83cb686b1..fa7849fae134 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -63,42 +63,45 @@ static inline void file_free(struct file *f) */ struct file *get_empty_filp(void) { -static int old_max; + static int old_max; struct file * f; /* * Privileged users can go above max_files */ - if (files_stat.nr_files < files_stat.max_files || - capable(CAP_SYS_ADMIN)) { - f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); - if (f) { - memset(f, 0, sizeof(*f)); - if (security_file_alloc(f)) { - file_free(f); - goto fail; - } - eventpoll_init_file(f); - atomic_set(&f->f_count, 1); - f->f_uid = current->fsuid; - f->f_gid = current->fsgid; - rwlock_init(&f->f_owner.lock); - /* f->f_version: 0 */ - INIT_LIST_HEAD(&f->f_list); - f->f_maxcount = INT_MAX; - return f; - } - } - + if (files_stat.nr_files >= files_stat.max_files && + !capable(CAP_SYS_ADMIN)) + goto over; + + f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); + if (f == NULL) + goto fail; + + memset(f, 0, sizeof(*f)); + if (security_file_alloc(f)) + goto fail_sec; + + eventpoll_init_file(f); + atomic_set(&f->f_count, 1); + f->f_uid = current->fsuid; + f->f_gid = current->fsgid; + rwlock_init(&f->f_owner.lock); + /* f->f_version: 0 */ + INIT_LIST_HEAD(&f->f_list); + f->f_maxcount = INT_MAX; + return f; + +over: /* Ran out of filps - report that */ - if (files_stat.max_files >= old_max) { + if (files_stat.nr_files > old_max) { printk(KERN_INFO "VFS: file-max limit %d reached\n", files_stat.max_files); - old_max = files_stat.max_files; - } else { - /* Big problems... */ - printk(KERN_WARNING "VFS: filp allocation failed\n"); + old_max = files_stat.nr_files; } + goto fail; + +fail_sec: + file_free(f); fail: return NULL; } From 4fea2838aa00b9e59efde974dcdb455608192811 Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Thu, 23 Jun 2005 00:09:51 -0700 Subject: [PATCH 144/199] [PATCH] Software suspend and recalc sigpending bug fix This patch fixes recalc_sigpending() to work correctly with tasks which are being frozen. The problem is that freeze_processes() sets PF_FREEZE and TIF_SIGPENDING flags on tasks, but recalc_sigpending() called from e.g. sys_rt_sigtimedwait or any other kernel place will clear TIF_SIGPENDING because no pending signals are queued, and the tasks won't be frozen until they receive a real signal or freeze_processes() fails due to timeout. 
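To make the race concrete, here is the whole predicate as it reads after the fix, as a simplified sketch. Only the PF_FREEZE condition is added by the one-line hunk below; the else branch that performs the clearing is reconstructed here and should be read as an assumption about the surrounding function, not as part of the patch.

static void recalc_sigpending_tsk_sketch(struct task_struct *t)
{
	/*
	 * The freezer only sets TIF_SIGPENDING; it queues no signal.
	 * Without the PF_FREEZE test, any recalculation that finds no
	 * pending signals falls through to the else branch, clears the
	 * flag again, and the task never enters the refrigerator.
	 */
	if (t->signal->group_stop_count > 0 ||
	    (t->flags & PF_FREEZE) ||
	    PENDING(&t->pending, &t->blocked) ||
	    PENDING(&t->signal->shared_pending, &t->blocked))
		set_tsk_thread_flag(t, TIF_SIGPENDING);
	else
		clear_tsk_thread_flag(t, TIF_SIGPENDING);
}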
Signed-Off-By: Kirill Korotaev Signed-Off-By: Alexey Kuznetsov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/signal.c b/kernel/signal.c index c89821b69ae3..d1258729a5f9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -213,6 +213,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) fastcall void recalc_sigpending_tsk(struct task_struct *t) { if (t->signal->group_stop_count > 0 || + (t->flags & PF_FREEZE) || PENDING(&t->pending, &t->blocked) || PENDING(&t->signal->shared_pending, &t->blocked)) set_tsk_thread_flag(t, TIF_SIGPENDING); From 618f06362ae3f60f95d7b0e666de25ee6ae35679 Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Thu, 23 Jun 2005 00:09:54 -0700 Subject: [PATCH 145/199] [PATCH] O(1) sb list traversing on syncs This patch removes O(n^2) super block loops in sync_inodes(), sync_filesystems() etc. in favour of using __put_super_and_need_restart() which I introduced earlier. We faced noticeably long freezes on sb syncing when there are thousands of super blocks in the system. Signed-Off-By: Kirill Korotaev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 64 +++++++++++++++--------------------- fs/quota.c | 60 ++++++++++++++-------------------- fs/super.c | 83 +++++++++++++++++++++++++---------------------- 3 files changed, 96 insertions(+), 111 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8e050fa58218..e94ab398b717 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -485,32 +485,6 @@ static void set_sb_syncing(int val) spin_unlock(&sb_lock); } -/* - * Find a superblock with inodes that need to be synced - */ -static struct super_block *get_super_to_sync(void) -{ - struct super_block *sb; -restart: - spin_lock(&sb_lock); - sb = sb_entry(super_blocks.prev); - for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) { - if (sb->s_syncing) - continue; - sb->s_syncing = 1; - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (!sb->s_root) { - drop_super(sb); - goto restart; - } - return sb; - } - spin_unlock(&sb_lock); - return NULL; -} - /** * sync_inodes - writes all inodes to disk * @wait: wait for completion @@ -530,23 +504,39 @@ static struct super_block *get_super_to_sync(void) * outstanding dirty inodes, the writeback goes block-at-a-time within the * filesystem's write_inode(). This is extremely slow. 
*/ -void sync_inodes(int wait) +static void __sync_inodes(int wait) { struct super_block *sb; - set_sb_syncing(0); - while ((sb = get_super_to_sync()) != NULL) { - sync_inodes_sb(sb, 0); - sync_blockdev(sb->s_bdev); - drop_super(sb); + spin_lock(&sb_lock); +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + if (sb->s_syncing) + continue; + sb->s_syncing = 1; + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + if (sb->s_root) { + sync_inodes_sb(sb, wait); + sync_blockdev(sb->s_bdev); + } + up_read(&sb->s_umount); + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; } + spin_unlock(&sb_lock); +} + +void sync_inodes(int wait) +{ + set_sb_syncing(0); + __sync_inodes(0); + if (wait) { set_sb_syncing(0); - while ((sb = get_super_to_sync()) != NULL) { - sync_inodes_sb(sb, 1); - sync_blockdev(sb->s_bdev); - drop_super(sb); - } + __sync_inodes(1); } } diff --git a/fs/quota.c b/fs/quota.c index 3f0333a51a23..f5d1cff55196 100644 --- a/fs/quota.c +++ b/fs/quota.c @@ -149,36 +149,6 @@ static int check_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t return error; } -static struct super_block *get_super_to_sync(int type) -{ - struct list_head *head; - int cnt, dirty; - -restart: - spin_lock(&sb_lock); - list_for_each(head, &super_blocks) { - struct super_block *sb = list_entry(head, struct super_block, s_list); - - /* This test just improves performance so it needn't be reliable... */ - for (cnt = 0, dirty = 0; cnt < MAXQUOTAS; cnt++) - if ((type == cnt || type == -1) && sb_has_quota_enabled(sb, cnt) - && info_any_dirty(&sb_dqopt(sb)->info[cnt])) - dirty = 1; - if (!dirty) - continue; - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (!sb->s_root) { - drop_super(sb); - goto restart; - } - return sb; - } - spin_unlock(&sb_lock); - return NULL; -} - static void quota_sync_sb(struct super_block *sb, int type) { int cnt; @@ -219,17 +189,35 @@ static void quota_sync_sb(struct super_block *sb, int type) void sync_dquots(struct super_block *sb, int type) { + int cnt, dirty; + if (sb) { if (sb->s_qcop->quota_sync) quota_sync_sb(sb, type); + return; } - else { - while ((sb = get_super_to_sync(type)) != NULL) { - if (sb->s_qcop->quota_sync) - quota_sync_sb(sb, type); - drop_super(sb); - } + + spin_lock(&sb_lock); +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + /* This test just improves performance so it needn't be reliable... 
*/ + for (cnt = 0, dirty = 0; cnt < MAXQUOTAS; cnt++) + if ((type == cnt || type == -1) && sb_has_quota_enabled(sb, cnt) + && info_any_dirty(&sb_dqopt(sb)->info[cnt])) + dirty = 1; + if (!dirty) + continue; + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + if (sb->s_root && sb->s_qcop->quota_sync) + quota_sync_sb(sb, type); + up_read(&sb->s_umount); + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; } + spin_unlock(&sb_lock); } /* Copy parameters and call proper function */ diff --git a/fs/super.c b/fs/super.c index 573bcc81bb82..25bc1ec6bc5d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -341,20 +341,22 @@ static inline void write_super(struct super_block *sb) */ void sync_supers(void) { - struct super_block * sb; -restart: + struct super_block *sb; + spin_lock(&sb_lock); - sb = sb_entry(super_blocks.next); - while (sb != sb_entry(&super_blocks)) +restart: + list_for_each_entry(sb, &super_blocks, s_list) { if (sb->s_dirt) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); write_super(sb); - drop_super(sb); - goto restart; - } else - sb = sb_entry(sb->s_list.next); + up_read(&sb->s_umount); + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; + } + } spin_unlock(&sb_lock); } @@ -381,20 +383,16 @@ void sync_filesystems(int wait) down(&mutex); /* Could be down_interruptible */ spin_lock(&sb_lock); - for (sb = sb_entry(super_blocks.next); sb != sb_entry(&super_blocks); - sb = sb_entry(sb->s_list.next)) { + list_for_each_entry(sb, &super_blocks, s_list) { if (!sb->s_op->sync_fs) continue; if (sb->s_flags & MS_RDONLY) continue; sb->s_need_sync_fs = 1; } - spin_unlock(&sb_lock); restart: - spin_lock(&sb_lock); - for (sb = sb_entry(super_blocks.next); sb != sb_entry(&super_blocks); - sb = sb_entry(sb->s_list.next)) { + list_for_each_entry(sb, &super_blocks, s_list) { if (!sb->s_need_sync_fs) continue; sb->s_need_sync_fs = 0; @@ -405,8 +403,11 @@ void sync_filesystems(int wait) down_read(&sb->s_umount); if (sb->s_root && (wait || sb->s_dirt)) sb->s_op->sync_fs(sb, wait); - drop_super(sb); - goto restart; + up_read(&sb->s_umount); + /* restart only when sb is no longer on the list */ + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; } spin_unlock(&sb_lock); up(&mutex); @@ -422,21 +423,25 @@ void sync_filesystems(int wait) struct super_block * get_super(struct block_device *bdev) { - struct list_head *p; + struct super_block *sb; + if (!bdev) return NULL; -rescan: + spin_lock(&sb_lock); - list_for_each(p, &super_blocks) { - struct super_block *s = sb_entry(p); - if (s->s_bdev == bdev) { - s->s_count++; +rescan: + list_for_each_entry(sb, &super_blocks, s_list) { + if (sb->s_bdev == bdev) { + sb->s_count++; spin_unlock(&sb_lock); - down_read(&s->s_umount); - if (s->s_root) - return s; - drop_super(s); - goto rescan; + down_read(&sb->s_umount); + if (sb->s_root) + return sb; + up_read(&sb->s_umount); + /* restart only when sb is no longer on the list */ + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto rescan; } } spin_unlock(&sb_lock); @@ -447,20 +452,22 @@ EXPORT_SYMBOL(get_super); struct super_block * user_get_super(dev_t dev) { - struct list_head *p; + struct super_block *sb; -rescan: spin_lock(&sb_lock); - list_for_each(p, &super_blocks) { - struct super_block *s = sb_entry(p); - if (s->s_dev == dev) { - s->s_count++; +rescan: + list_for_each_entry(sb, &super_blocks, s_list) { + if (sb->s_dev == dev) { + sb->s_count++; spin_unlock(&sb_lock); - 
down_read(&s->s_umount); - if (s->s_root) - return s; - drop_super(s); - goto rescan; + down_read(&sb->s_umount); + if (sb->s_root) + return sb; + up_read(&sb->s_umount); + /* restart only when sb is no longer on the list */ + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto rescan; } } spin_unlock(&sb_lock); From 328007b70c8e99c62eef5bc310d8a21d0e937342 Mon Sep 17 00:00:00 2001 From: Pat Gefre Date: Thu, 23 Jun 2005 00:09:54 -0700 Subject: [PATCH 146/199] [PATCH] Altix: shut off xmit intr if done xmitting Small mod to shut off the xmit interrupt if we have nothing to transmit. Signed-off-by: Patrick Gefre Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/serial/sn_console.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/serial/sn_console.c b/drivers/serial/sn_console.c index fee6418e84c4..840815fde49b 100644 --- a/drivers/serial/sn_console.c +++ b/drivers/serial/sn_console.c @@ -572,6 +572,7 @@ static void sn_transmit_chars(struct sn_cons_port *port, int raw) if (uart_circ_empty(xmit) || uart_tx_stopped(&port->sc_port)) { /* Nothing to do. */ + ia64_sn_console_intr_disable(SAL_CONSOLE_INTR_XMIT); return; } From 44e58a6a0bd604f46be9d808408a1cd880cc9b19 Mon Sep 17 00:00:00 2001 From: Martin Schitter Date: Thu, 23 Jun 2005 00:09:55 -0700 Subject: [PATCH 147/199] [PATCH] parport: NetMos nm9855 fix kernel 2.6.12-rc2 adopted some code by Bjorn Helgaas supporting NetMos combo controller cards. this implementation doesn't work for nm9855 based cards! there are two reasons: a) the module 'parport_pc' doesn't want to give the responsibility for the netmos_9855 to 'parport_serial' and cannot handle the serial lines -- trivial to fix... http://lists.infradead.org/pipermail/linux-parport/2005-February/000250.html http://lkml.org/lkml/2005/3/24/199 b) the support for the nm9855 in 'parport_serial' still doesn't work because of wrong assumptions about the relevant BARs port address layout for this chip: 0000:00:09.0 Communication controller: NetMos Technology PCI 9855 Multi-I/O Controller (rev 01) (= 9710:9855) Subsystem: LSI Logic / Symbios Logic 1P4S (= 1000:0014) Flags: medium devsel, IRQ 177 I/O ports at a800 [size=8] (= parport) I/O ports at a400 [size=8] I/O ports at a000 [size=8] (= serial) I/O ports at 9800 [size=8] (= serial) I/O ports at 9400 [size=8] (= serial) I/O ports at 9000 [size=16] (= serial) the following patch will fix the problem. 
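A quick way to check the layout claim on a live system, beyond lspci, is to print the regions from inside the driver. The helper below is a hypothetical debugging aid, not part of the patch; pci_resource_start() and pci_resource_len() are the standard kernel accessors for BAR contents.

#include <linux/kernel.h>
#include <linux/pci.h>

/* Hypothetical helper: dump the six I/O regions of a suspect NetMos
 * card so the parallel/serial split can be compared against lspci. */
static void netmos_dump_bars(struct pci_dev *dev)
{
	int bar;

	for (bar = 0; bar < 6; bar++)
		printk(KERN_DEBUG "netmos BAR%d: start 0x%lx len %lu\n",
		       bar,
		       (unsigned long)pci_resource_start(dev, bar),
		       (unsigned long)pci_resource_len(dev, bar));
}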
Cc: Bjorn Helgaas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/parport/parport_pc.c | 4 ---- drivers/parport/parport_serial.c | 5 ++++- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c index e7f3bcb79000..80edfa3abd29 100644 --- a/drivers/parport/parport_pc.c +++ b/drivers/parport/parport_pc.c @@ -2751,7 +2751,6 @@ enum parport_pc_pci_cards { netmos_9755, netmos_9805, netmos_9815, - netmos_9855, }; @@ -2826,7 +2825,6 @@ static struct parport_pc_pci { /* netmos_9755 */ { 2, { { 0, 1 }, { 2, 3 },} }, /* untested */ /* netmos_9805 */ { 1, { { 0, -1 }, } }, /* untested */ /* netmos_9815 */ { 2, { { 0, -1 }, { 2, -1 }, } }, /* untested */ - /* netmos_9855 */ { 2, { { 0, -1 }, { 2, -1 }, } }, /* untested */ }; static struct pci_device_id parport_pc_pci_tbl[] = { @@ -2907,8 +2905,6 @@ static struct pci_device_id parport_pc_pci_tbl[] = { PCI_ANY_ID, PCI_ANY_ID, 0, 0, netmos_9805 }, { PCI_VENDOR_ID_NETMOS, PCI_DEVICE_ID_NETMOS_9815, PCI_ANY_ID, PCI_ANY_ID, 0, 0, netmos_9815 }, - { PCI_VENDOR_ID_NETMOS, PCI_DEVICE_ID_NETMOS_9855, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, netmos_9855 }, { 0, } /* terminate list */ }; MODULE_DEVICE_TABLE(pci,parport_pc_pci_tbl); diff --git a/drivers/parport/parport_serial.c b/drivers/parport/parport_serial.c index 6715a17b5d0f..00498e2f1205 100644 --- a/drivers/parport/parport_serial.c +++ b/drivers/parport/parport_serial.c @@ -34,6 +34,7 @@ enum parport_pc_pci_cards { titan_110l = 0, titan_210l, netmos_9xx5_combo, + netmos_9855, avlab_1s1p, avlab_1s1p_650, avlab_1s1p_850, @@ -87,6 +88,7 @@ static struct parport_pc_pci cards[] __devinitdata = { /* titan_110l */ { 1, { { 3, -1 }, } }, /* titan_210l */ { 1, { { 3, -1 }, } }, /* netmos_9xx5_combo */ { 1, { { 2, -1 }, }, netmos_parallel_init }, + /* netmos_9855 */ { 1, { { 0, -1 }, }, netmos_parallel_init }, /* avlab_1s1p */ { 1, { { 1, 2}, } }, /* avlab_1s1p_650 */ { 1, { { 1, 2}, } }, /* avlab_1s1p_850 */ { 1, { { 1, 2}, } }, @@ -120,7 +122,7 @@ static struct pci_device_id parport_serial_pci_tbl[] = { { PCI_VENDOR_ID_NETMOS, PCI_DEVICE_ID_NETMOS_9845, PCI_ANY_ID, PCI_ANY_ID, 0, 0, netmos_9xx5_combo }, { PCI_VENDOR_ID_NETMOS, PCI_DEVICE_ID_NETMOS_9855, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, netmos_9xx5_combo }, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, netmos_9855 }, /* PCI_VENDOR_ID_AVLAB/Intek21 has another bunch of cards ...*/ { 0x14db, 0x2110, PCI_ANY_ID, PCI_ANY_ID, 0, 0, avlab_1s1p}, { 0x14db, 0x2111, PCI_ANY_ID, PCI_ANY_ID, 0, 0, avlab_1s1p_650}, @@ -207,6 +209,7 @@ static struct pci_board_no_ids pci_boards[] __devinitdata = { /* titan_110l */ { SPCI_FL_BASE1 | SPCI_FL_BASE_TABLE, 1, 921600 }, /* titan_210l */ { SPCI_FL_BASE1 | SPCI_FL_BASE_TABLE, 2, 921600 }, /* netmos_9xx5_combo */ { SPCI_FL_BASE0 | SPCI_FL_BASE_TABLE, 1, 115200, 0, 0, netmos_serial_init }, +/* netmos_9855 */ { SPCI_FL_BASE2 | SPCI_FL_BASE_TABLE, 1, 115200, 0, 0, netmos_serial_init }, /* avlab_1s1p (n/t) */ { SPCI_FL_BASE0 | SPCI_FL_BASE_TABLE, 1, 115200 }, /* avlab_1s1p_650 (nt)*/{ SPCI_FL_BASE0 | SPCI_FL_BASE_TABLE, 1, 115200 }, /* avlab_1s1p_850 (nt)*/{ SPCI_FL_BASE0 | SPCI_FL_BASE_TABLE, 1, 115200 }, From ef3daeda7b58f046f94b26637d500354038d39f4 Mon Sep 17 00:00:00 2001 From: Yoav Zach Date: Thu, 23 Jun 2005 00:09:58 -0700 Subject: [PATCH 148/199] [PATCH] Don't force O_LARGEFILE for 32 bit processes on ia64 In ia64 kernel, the O_LARGEFILE flag is forced when opening a file. 
This is problematic for execution of 32 bit processes, which are not largefile aware, either by SW emulation or by HW execution. For such processes, the problem is two-fold: 1) When trying to open a file that is larger than 4G the operation should fail, but it does not; 2) Writing to an offset larger than 4G should fail, but it does not. The proposed patch takes advantage of the way 32 bit processes are identified in ia64 systems. Such processes have PER_LINUX32 for their personality. With the patch, the ia64 kernel will not enforce the O_LARGEFILE flag if the current process has PER_LINUX32 set. The behavior for all other architectures remains unchanged. Signed-off-by: Yoav Zach Acked-by: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/open.c | 7 ++++--- include/asm-ia64/fcntl.h | 2 ++ include/linux/fcntl.h | 4 ++++ 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/open.c b/fs/open.c index 963bd81a44c8..2ebb72c1a876 100644 --- a/fs/open.c +++ b/fs/open.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -935,9 +936,9 @@ asmlinkage long sys_open(const char __user * filename, int flags, int mode) char * tmp; int fd, error; -#if BITS_PER_LONG != 32 - flags |= O_LARGEFILE; -#endif + if (force_o_largefile()) + flags |= O_LARGEFILE; + tmp = getname(filename); fd = PTR_ERR(tmp); if (!IS_ERR(tmp)) { diff --git a/include/asm-ia64/fcntl.h b/include/asm-ia64/fcntl.h index d193981bb1d8..c9f8d835d0cc 100644 --- a/include/asm-ia64/fcntl.h +++ b/include/asm-ia64/fcntl.h @@ -81,4 +81,6 @@ struct flock { #define F_LINUX_SPECIFIC_BASE 1024 +#define force_o_largefile() ( ! (current->personality & PER_LINUX32) ) + #endif /* _ASM_IA64_FCNTL_H */ diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h index 704fb76b6334..8a7c82151de9 100644 --- a/include/linux/fcntl.h +++ b/include/linux/fcntl.h @@ -25,6 +25,10 @@ #ifdef __KERNEL__ +#ifndef force_o_largefile +#define force_o_largefile() (BITS_PER_LONG != 32) +#endif + #if BITS_PER_LONG == 32 #define IS_GETLK32(cmd) ((cmd) == F_GETLK) #define IS_SETLK32(cmd) ((cmd) == F_SETLK) From c7ea4b31fd962b4baadb42c0b8d7c6851c584102 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 23 Jun 2005 00:09:59 -0700 Subject: [PATCH 149/199] [PATCH] ide-floppy adjustments Fix a build problem when IDEFLOPPY_DEBUG_BUGS is turned off, and eliminate an access to memory that is no longer allocated (causing systems to fail booting when CONFIG_DEBUG_PAGEALLOC is turned on). 
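The second problem deserves a sketch, since the diff below mostly consists of the check being disabled. All names except the c[0] command byte are hypothetical stand-ins; the point is the lifetime of the previous packet command, not the exact code.

/* Sketch of the lifetime bug: the previous packet command can be a
 * stack object of whichever thread issued it. */
static idefloppy_pc_t *prev_pc;		/* stands in for floppy->pc */

static void issue_pc_sketch(ide_drive_t *drive, idefloppy_pc_t *pc)
{
	/*
	 * BAD: prev_pc may point into a stack frame that has since been
	 * unwound; with CONFIG_DEBUG_PAGEALLOC the page backing it may
	 * even be unmapped, so this read-only peek can oops at boot.
	 */
	if (prev_pc && prev_pc->c[0] == IDEFLOPPY_REQUEST_SENSE_CMD &&
	    pc->c[0] == IDEFLOPPY_REQUEST_SENSE_CMD)
		printk(KERN_ERR "%s: two request sense in serial\n",
		       drive->name);

	prev_pc = pc;	/* pc typically lives on the caller's stack */
}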
Signed-off-by: Jan Beulich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/ide/ide-floppy.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index c949e98df4b6..9eab6426148e 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -661,10 +661,12 @@ static void idefloppy_output_buffers (ide_drive_t *drive, idefloppy_pc_t *pc, un idefloppy_do_end_request(drive, 1, done >> 9); +#if IDEFLOPPY_DEBUG_BUGS if (bcount) { printk(KERN_ERR "%s: leftover data in idefloppy_output_buffers, bcount == %d\n", drive->name, bcount); idefloppy_write_zeros(drive, bcount); } +#endif } static void idefloppy_update_buffers (ide_drive_t *drive, idefloppy_pc_t *pc) @@ -1048,6 +1050,9 @@ static ide_startstop_t idefloppy_issue_pc (ide_drive_t *drive, idefloppy_pc_t *p atapi_bcount_t bcount; ide_handler_t *pkt_xfer_routine; +#if 0 /* Accessing floppy->pc is not valid here, the previous pc may be gone + and have lived on another thread's stack; that stack may have become + unmapped meanwhile (CONFIG_DEBUG_PAGEALLOC). */ #if IDEFLOPPY_DEBUG_BUGS if (floppy->pc->c[0] == IDEFLOPPY_REQUEST_SENSE_CMD && pc->c[0] == IDEFLOPPY_REQUEST_SENSE_CMD) { @@ -1055,6 +1060,7 @@ static ide_startstop_t idefloppy_issue_pc (ide_drive_t *drive, idefloppy_pc_t *p "Two request sense in serial were issued\n"); } #endif /* IDEFLOPPY_DEBUG_BUGS */ +#endif if (floppy->failed_pc == NULL && pc->c[0] != IDEFLOPPY_REQUEST_SENSE_CMD) From 11c80c8367db0a9d342529ed74464670cd86a1f6 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 23 Jun 2005 00:09:59 -0700 Subject: [PATCH 150/199] [PATCH] adjust per_cpu definition in non-SMP case Fix (in the architectures I'm actually building for) the UP definition of per_cpu so that the cpu specified may be any expression, not just an identifier or a suffix expression. Signed-off-by: Jan Beulich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/percpu.h | 2 +- include/asm-ia64/percpu.h | 2 +- include/asm-x86_64/percpu.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 3b709b84934f..9044aeb37828 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -29,7 +29,7 @@ do { \ #define DEFINE_PER_CPU(type, name) \ __typeof__(type) per_cpu__##name -#define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var)) +#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var #endif /* SMP */ diff --git a/include/asm-ia64/percpu.h b/include/asm-ia64/percpu.h index 1e87f19dad56..2b14dee29ce7 100644 --- a/include/asm-ia64/percpu.h +++ b/include/asm-ia64/percpu.h @@ -50,7 +50,7 @@ extern void *per_cpu_init(void); #else /* ! 
SMP */ -#define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var)) +#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var #define per_cpu_init() (__phys_per_cpu_start) diff --git a/include/asm-x86_64/percpu.h b/include/asm-x86_64/percpu.h index 415d73f3c8ef..9c71855736fb 100644 --- a/include/asm-x86_64/percpu.h +++ b/include/asm-x86_64/percpu.h @@ -39,7 +39,7 @@ extern void setup_per_cpu_areas(void); #define DEFINE_PER_CPU(type, name) \ __typeof__(type) per_cpu__##name -#define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var)) +#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var #endif /* SMP */ From 8476994af7bd9352ecdf61ba760f7397f54e30a1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 23 Jun 2005 00:10:00 -0700 Subject: [PATCH 151/199] [PATCH] apply quotation handling to Makefile.build Adding quotation handling to rule_cc_o_c in scripts/Makefile.build as used elsewhere. Signed-off-by: Jan Beulich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/Makefile.build | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 352d531ee3c1..76ba6be3dfc9 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -176,10 +176,10 @@ endif define rule_cc_o_c $(if $($(quiet)cmd_checksrc),echo ' $($(quiet)cmd_checksrc)';) \ $(cmd_checksrc) \ - $(if $($(quiet)cmd_cc_o_c),echo ' $($(quiet)cmd_cc_o_c)';) \ + $(if $($(quiet)cmd_cc_o_c),echo ' $(subst ','\'',$($(quiet)cmd_cc_o_c))';) \ $(cmd_cc_o_c); \ $(cmd_modversions) \ - scripts/basic/fixdep $(depfile) $@ '$(cmd_cc_o_c)' > $(@D)/.$(@F).tmp; \ + scripts/basic/fixdep $(depfile) $@ '$(subst ','\'',$(cmd_cc_o_c))' > $(@D)/.$(@F).tmp; \ rm -f $(depfile); \ mv -f $(@D)/.$(@F).tmp $(@D)/.$(@F).cmd endef From 01890a4c120f68366441bf5e193d1b9dd543d4d0 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Thu, 23 Jun 2005 00:10:01 -0700 Subject: [PATCH 152/199] [PATCH] mempool - only init waitqueue in slow path Here's a small patch to improve the performance of mempool_alloc by only initializing the wait queue when we're about to wait. Signed-off-by: Benjamin LaHaise Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/mempool.c b/mm/mempool.c index 920c8c3ab1b8..9a72f7d918fa 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -204,7 +204,7 @@ void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask) { void *element; unsigned long flags; - DEFINE_WAIT(wait); + wait_queue_t wait; int gfp_temp; might_sleep_if(gfp_mask & __GFP_WAIT); @@ -235,6 +235,7 @@ void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask) /* Now start performing page reclaim */ gfp_temp = gfp_mask; + init_wait(&wait); prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); smp_mb(); if (!pool->curr_nr) From dfb388bf8a328f206bba33933dd97230f412238b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jun 2005 00:10:02 -0700 Subject: [PATCH 153/199] [PATCH] factor out common code in sys_fsync/sys_fdatasync This patch consolidates sys_fsync and sys_fdatasync. 
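Reduced to its shape, the change leaves one parameterized worker and two one-line entry points; the datasync flag is simply forwarded to the filesystem's ->fsync() method (sketch only - the full worker body follows in the diff):

static long do_fsync(unsigned int fd, int datasync);	/* worker, below */

asmlinkage long sys_fsync(unsigned int fd)
{
	return do_fsync(fd, 0);		/* flush data and all metadata */
}

asmlinkage long sys_fdatasync(unsigned int fd)
{
	return do_fsync(fd, 1);		/* may skip non-essential metadata */
}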
Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 45 ++++++++++----------------------------------- 1 file changed, 10 insertions(+), 35 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 0befa724ab98..12bdb2791127 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -331,7 +331,7 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync) return ret; } -asmlinkage long sys_fsync(unsigned int fd) +static long do_fsync(unsigned int fd, int datasync) { struct file * file; struct address_space *mapping; @@ -342,14 +342,14 @@ asmlinkage long sys_fsync(unsigned int fd) if (!file) goto out; - mapping = file->f_mapping; - ret = -EINVAL; if (!file->f_op || !file->f_op->fsync) { /* Why? We can still call filemap_fdatawrite */ goto out_putf; } + mapping = file->f_mapping; + current->flags |= PF_SYNCWRITE; ret = filemap_fdatawrite(mapping); @@ -358,7 +358,7 @@ asmlinkage long sys_fsync(unsigned int fd) * which could cause livelocks in fsync_buffers_list */ down(&mapping->host->i_sem); - err = file->f_op->fsync(file, file->f_dentry, 0); + err = file->f_op->fsync(file, file->f_dentry, datasync); if (!ret) ret = err; up(&mapping->host->i_sem); @@ -373,39 +373,14 @@ asmlinkage long sys_fsync(unsigned int fd) return ret; } -asmlinkage long sys_fdatasync(unsigned int fd) +asmlinkage long sys_fsync(unsigned int fd) { - struct file * file; - struct address_space *mapping; - int ret, err; - - ret = -EBADF; - file = fget(fd); - if (!file) - goto out; - - ret = -EINVAL; - if (!file->f_op || !file->f_op->fsync) - goto out_putf; - - mapping = file->f_mapping; - - current->flags |= PF_SYNCWRITE; - ret = filemap_fdatawrite(mapping); - down(&mapping->host->i_sem); - err = file->f_op->fsync(file, file->f_dentry, 1); - if (!ret) - ret = err; - up(&mapping->host->i_sem); - err = filemap_fdatawait(mapping); - if (!ret) - ret = err; - current->flags &= ~PF_SYNCWRITE; + return do_fsync(fd, 0); +} -out_putf: - fput(file); -out: - return ret; +asmlinkage long sys_fdatasync(unsigned int fd) +{ + return do_fsync(fd, 1); } /* From 46c271bedd2c8444b1d05bc44928beec0c07debc Mon Sep 17 00:00:00 2001 From: Peter Osterlund Date: Thu, 23 Jun 2005 00:10:02 -0700 Subject: [PATCH 154/199] [PATCH] Improve CD/DVD packet driver write performance This patch improves write performance for the CD/DVD packet writing driver. The logic for switching between reading and writing has been changed so that streaming writes are no longer interrupted by read requests. Signed-off-by: Peter Osterlund Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 36 ++++++++++++++++++++---------------- include/linux/pktcdvd.h | 2 +- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index bc56770bcc90..7f3d78de265c 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -467,14 +467,12 @@ static int pkt_set_speed(struct pktcdvd_device *pd, unsigned write_speed, unsign * Queue a bio for processing by the low-level CD device. Must be called * from process context. 
*/ -static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio, int high_prio_read) +static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio) { spin_lock(&pd->iosched.lock); if (bio_data_dir(bio) == READ) { pkt_add_list_last(bio, &pd->iosched.read_queue, &pd->iosched.read_queue_tail); - if (high_prio_read) - pd->iosched.high_prio_read = 1; } else { pkt_add_list_last(bio, &pd->iosched.write_queue, &pd->iosched.write_queue_tail); @@ -490,15 +488,16 @@ static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio, int high_p * requirements for CDRW drives: * - A cache flush command must be inserted before a read request if the * previous request was a write. - * - Switching between reading and writing is slow, so don't it more often + * - Switching between reading and writing is slow, so don't do it more often * than necessary. + * - Optimize for throughput at the expense of latency. This means that streaming + * writes will never be interrupted by a read, but if the drive has to seek + * before the next write, switch to reading instead if there are any pending + * read requests. * - Set the read speed according to current usage pattern. When only reading * from the device, it's best to use the highest possible read speed, but * when switching often between reading and writing, it's better to have the * same read and write speeds. - * - Reads originating from user space should have higher priority than reads - * originating from pkt_gather_data, because some process is usually waiting - * on reads of the first kind. */ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) { @@ -512,21 +511,24 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) for (;;) { struct bio *bio; - int reads_queued, writes_queued, high_prio_read; + int reads_queued, writes_queued; spin_lock(&pd->iosched.lock); reads_queued = (pd->iosched.read_queue != NULL); writes_queued = (pd->iosched.write_queue != NULL); - if (!reads_queued) - pd->iosched.high_prio_read = 0; - high_prio_read = pd->iosched.high_prio_read; spin_unlock(&pd->iosched.lock); if (!reads_queued && !writes_queued) break; if (pd->iosched.writing) { - if (high_prio_read || (!writes_queued && reads_queued)) { + int need_write_seek = 1; + spin_lock(&pd->iosched.lock); + bio = pd->iosched.write_queue; + spin_unlock(&pd->iosched.lock); + if (bio && (bio->bi_sector == pd->iosched.last_write)) + need_write_seek = 0; + if (need_write_seek && reads_queued) { if (atomic_read(&pd->cdrw.pending_bios) > 0) { VPRINTK("pktcdvd: write, waiting\n"); break; @@ -559,8 +561,10 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) if (bio_data_dir(bio) == READ) pd->iosched.successive_reads += bio->bi_size >> 10; - else + else { pd->iosched.successive_reads = 0; + pd->iosched.last_write = bio->bi_sector + bio_sectors(bio); + } if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) { if (pd->read_speed == pd->write_speed) { pd->read_speed = MAX_SPEED; @@ -765,7 +769,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) atomic_inc(&pkt->io_wait); bio->bi_rw = READ; - pkt_queue_bio(pd, bio, 0); + pkt_queue_bio(pd, bio); frames_read++; } @@ -1062,7 +1066,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) atomic_set(&pkt->io_wait, 1); pkt->w_bio->bi_rw = WRITE; - pkt_queue_bio(pd, pkt->w_bio, 0); + pkt_queue_bio(pd, pkt->w_bio); } static void pkt_finish_packet(struct packet_data *pkt, int uptodate) @@ -2120,7 +2124,7 @@ static int 
pkt_make_request(request_queue_t *q, struct bio *bio) cloned_bio->bi_private = psd; cloned_bio->bi_end_io = pkt_end_io_read_cloned; pd->stats.secs_r += bio->bi_size >> 9; - pkt_queue_bio(pd, cloned_bio, 1); + pkt_queue_bio(pd, cloned_bio); return 0; } diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h index 4e2d2a942ecb..4b32bce9a289 100644 --- a/include/linux/pktcdvd.h +++ b/include/linux/pktcdvd.h @@ -159,7 +159,7 @@ struct packet_iosched struct bio *read_queue_tail; struct bio *write_queue; struct bio *write_queue_tail; - int high_prio_read; /* An important read request has been queued */ + sector_t last_write; /* The sector where the last write ended */ int successive_reads; }; From b030a4dd609e167da7f73c2d1fa5af864a0aea17 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 23 Jun 2005 00:10:03 -0700 Subject: [PATCH 155/199] [PATCH] Remove eventpoll macro obfuscation This patch gets rid of some macro obfuscation from fs/eventpoll.c by removing slab allocator wrappers and converting macros to static inline functions. Signed-off-by: Pekka Enberg Acked-by: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 195 ++++++++++++++++++++++++++++--------------------- 1 file changed, 110 insertions(+), 85 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 9900e333655a..6ab1dd0ca904 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -101,57 +101,6 @@ /* Maximum number of poll wake up nests we are allowing */ #define EP_MAX_POLLWAKE_NESTS 4 -/* Macro to allocate a "struct epitem" from the slab cache */ -#define EPI_MEM_ALLOC() (struct epitem *) kmem_cache_alloc(epi_cache, SLAB_KERNEL) - -/* Macro to free a "struct epitem" to the slab cache */ -#define EPI_MEM_FREE(p) kmem_cache_free(epi_cache, p) - -/* Macro to allocate a "struct eppoll_entry" from the slab cache */ -#define PWQ_MEM_ALLOC() (struct eppoll_entry *) kmem_cache_alloc(pwq_cache, SLAB_KERNEL) - -/* Macro to free a "struct eppoll_entry" to the slab cache */ -#define PWQ_MEM_FREE(p) kmem_cache_free(pwq_cache, p) - -/* Fast test to see if the file is an evenpoll file */ -#define IS_FILE_EPOLL(f) ((f)->f_op == &eventpoll_fops) - -/* Setup the structure that is used as key for the rb-tree */ -#define EP_SET_FFD(p, f, d) do { (p)->file = (f); (p)->fd = (d); } while (0) - -/* Compare rb-tree keys */ -#define EP_CMP_FFD(p1, p2) ((p1)->file > (p2)->file ? +1: \ - ((p1)->file < (p2)->file ? -1: (p1)->fd - (p2)->fd)) - -/* Special initialization for the rb-tree node to detect linkage */ -#define EP_RB_INITNODE(n) (n)->rb_parent = (n) - -/* Removes a node from the rb-tree and marks it for a fast is-linked check */ -#define EP_RB_ERASE(n, r) do { rb_erase(n, r); (n)->rb_parent = (n); } while (0) - -/* Fast check to verify that the item is linked to the main rb-tree */ -#define EP_RB_LINKED(n) ((n)->rb_parent != (n)) - -/* - * Remove the item from the list and perform its initialization. - * This is useful for us because we can test if the item is linked - * using "EP_IS_LINKED(p)". 
- */ -#define EP_LIST_DEL(p) do { list_del(p); INIT_LIST_HEAD(p); } while (0) - -/* Tells us if the item is currently linked */ -#define EP_IS_LINKED(p) (!list_empty(p)) - -/* Get the "struct epitem" from a wait queue pointer */ -#define EP_ITEM_FROM_WAIT(p) ((struct epitem *) container_of(p, struct eppoll_entry, wait)->base) - -/* Get the "struct epitem" from an epoll queue wrapper */ -#define EP_ITEM_FROM_EPQUEUE(p) (container_of(p, struct ep_pqueue, pt)->epi) - -/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ -#define EP_OP_HASH_EVENT(op) ((op) != EPOLL_CTL_DEL) - - struct epoll_filefd { struct file *file; int fd; @@ -357,6 +306,82 @@ static struct dentry_operations eventpollfs_dentry_operations = { +/* Fast test to see if the file is an evenpoll file */ +static inline int is_file_epoll(struct file *f) +{ + return f->f_op == &eventpoll_fops; +} + +/* Setup the structure that is used as key for the rb-tree */ +static inline void ep_set_ffd(struct epoll_filefd *ffd, + struct file *file, int fd) +{ + ffd->file = file; + ffd->fd = fd; +} + +/* Compare rb-tree keys */ +static inline int ep_cmp_ffd(struct epoll_filefd *p1, + struct epoll_filefd *p2) +{ + return (p1->file > p2->file ? +1: + (p1->file < p2->file ? -1 : p1->fd - p2->fd)); +} + +/* Special initialization for the rb-tree node to detect linkage */ +static inline void ep_rb_initnode(struct rb_node *n) +{ + n->rb_parent = n; +} + +/* Removes a node from the rb-tree and marks it for a fast is-linked check */ +static inline void ep_rb_erase(struct rb_node *n, struct rb_root *r) +{ + rb_erase(n, r); + n->rb_parent = n; +} + +/* Fast check to verify that the item is linked to the main rb-tree */ +static inline int ep_rb_linked(struct rb_node *n) +{ + return n->rb_parent != n; +} + +/* + * Remove the item from the list and perform its initialization. + * This is useful for us because we can test if the item is linked + * using "ep_is_linked(p)". + */ +static inline void ep_list_del(struct list_head *p) +{ + list_del(p); + INIT_LIST_HEAD(p); +} + +/* Tells us if the item is currently linked */ +static inline int ep_is_linked(struct list_head *p) +{ + return !list_empty(p); +} + +/* Get the "struct epitem" from a wait queue pointer */ +static inline struct epitem * ep_item_from_wait(wait_queue_t *p) +{ + return container_of(p, struct eppoll_entry, wait)->base; +} + +/* Get the "struct epitem" from an epoll queue wrapper */ +static inline struct epitem * ep_item_from_epqueue(poll_table *p) +{ + return container_of(p, struct ep_pqueue, pt)->epi; +} + +/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ +static inline int ep_op_hash_event(int op) +{ + return op != EPOLL_CTL_DEL; +} + /* Initialize the poll safe wake up structure */ static void ep_poll_safewake_init(struct poll_safewake *psw) { @@ -456,7 +481,7 @@ void eventpoll_release_file(struct file *file) epi = list_entry(lsthead->next, struct epitem, fllink); ep = epi->ep; - EP_LIST_DEL(&epi->fllink); + ep_list_del(&epi->fllink); down_write(&ep->sem); ep_remove(ep, epi); up_write(&ep->sem); @@ -534,7 +559,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event) current, epfd, op, fd, event)); error = -EFAULT; - if (EP_OP_HASH_EVENT(op) && + if (ep_op_hash_event(op) && copy_from_user(&epds, event, sizeof(struct epoll_event))) goto eexit_1; @@ -560,7 +585,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event) * adding an epoll file descriptor inside itself. 
*/ error = -EINVAL; - if (file == tfile || !IS_FILE_EPOLL(file)) + if (file == tfile || !is_file_epoll(file)) goto eexit_3; /* @@ -656,7 +681,7 @@ asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events, * the user passed to us _is_ an eventpoll file. */ error = -EINVAL; - if (!IS_FILE_EPOLL(file)) + if (!is_file_epoll(file)) goto eexit_2; /* @@ -831,11 +856,11 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) struct epitem *epi, *epir = NULL; struct epoll_filefd ffd; - EP_SET_FFD(&ffd, file, fd); + ep_set_ffd(&ffd, file, fd); read_lock_irqsave(&ep->lock, flags); for (rbp = ep->rbr.rb_node; rbp; ) { epi = rb_entry(rbp, struct epitem, rbn); - kcmp = EP_CMP_FFD(&ffd, &epi->ffd); + kcmp = ep_cmp_ffd(&ffd, &epi->ffd); if (kcmp > 0) rbp = rbp->rb_right; else if (kcmp < 0) @@ -875,7 +900,7 @@ static void ep_release_epitem(struct epitem *epi) { if (atomic_dec_and_test(&epi->usecnt)) - EPI_MEM_FREE(epi); + kmem_cache_free(epi_cache, epi); } @@ -886,10 +911,10 @@ static void ep_release_epitem(struct epitem *epi) static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt) { - struct epitem *epi = EP_ITEM_FROM_EPQUEUE(pt); + struct epitem *epi = ep_item_from_epqueue(pt); struct eppoll_entry *pwq; - if (epi->nwait >= 0 && (pwq = PWQ_MEM_ALLOC())) { + if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, SLAB_KERNEL))) { init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); pwq->whead = whead; pwq->base = epi; @@ -912,7 +937,7 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) while (*p) { parent = *p; epic = rb_entry(parent, struct epitem, rbn); - kcmp = EP_CMP_FFD(&epi->ffd, &epic->ffd); + kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd); if (kcmp > 0) p = &parent->rb_right; else @@ -932,17 +957,17 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, struct ep_pqueue epq; error = -ENOMEM; - if (!(epi = EPI_MEM_ALLOC())) + if (!(epi = kmem_cache_alloc(epi_cache, SLAB_KERNEL))) goto eexit_1; /* Item initialization follow here ... */ - EP_RB_INITNODE(&epi->rbn); + ep_rb_initnode(&epi->rbn); INIT_LIST_HEAD(&epi->rdllink); INIT_LIST_HEAD(&epi->fllink); INIT_LIST_HEAD(&epi->txlink); INIT_LIST_HEAD(&epi->pwqlist); epi->ep = ep; - EP_SET_FFD(&epi->ffd, tfile, fd); + ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; atomic_set(&epi->usecnt, 1); epi->nwait = 0; @@ -978,7 +1003,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, ep_rbtree_insert(ep, epi); /* If the file is already "ready" we drop it inside the ready list */ - if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) { + if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); /* Notify waiting tasks that events are available */ @@ -1007,11 +1032,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, * allocated wait queue. */ write_lock_irqsave(&ep->lock, flags); - if (EP_IS_LINKED(&epi->rdllink)) - EP_LIST_DEL(&epi->rdllink); + if (ep_is_linked(&epi->rdllink)) + ep_list_del(&epi->rdllink); write_unlock_irqrestore(&ep->lock, flags); - EPI_MEM_FREE(epi); + kmem_cache_free(epi_cache, epi); eexit_1: return error; } @@ -1050,14 +1075,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * If the item is not linked to the hash it means that it's on its * way toward the removal. Do nothing in this case. 
*/ - if (EP_RB_LINKED(&epi->rbn)) { + if (ep_rb_linked(&epi->rbn)) { /* * If the item is "hot" and it is not registered inside the ready * list, push it inside. If the item is not "hot" and it is currently * registered inside the ready list, unlink it. */ if (revents & event->events) { - if (!EP_IS_LINKED(&epi->rdllink)) { + if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); /* Notify waiting tasks that events are available */ @@ -1097,9 +1122,9 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) while (!list_empty(lsthead)) { pwq = list_entry(lsthead->next, struct eppoll_entry, llink); - EP_LIST_DEL(&pwq->llink); + ep_list_del(&pwq->llink); remove_wait_queue(pwq->whead, &pwq->wait); - PWQ_MEM_FREE(pwq); + kmem_cache_free(pwq_cache, pwq); } } } @@ -1118,7 +1143,7 @@ static int ep_unlink(struct eventpoll *ep, struct epitem *epi) * The check protect us from doing a double unlink ( crash ). */ error = -ENOENT; - if (!EP_RB_LINKED(&epi->rbn)) + if (!ep_rb_linked(&epi->rbn)) goto eexit_1; /* @@ -1133,14 +1158,14 @@ static int ep_unlink(struct eventpoll *ep, struct epitem *epi) * This operation togheter with the above check closes the door to * double unlinks. */ - EP_RB_ERASE(&epi->rbn, &ep->rbr); + ep_rb_erase(&epi->rbn, &ep->rbr); /* * If the item we are going to remove is inside the ready file descriptors * we want to remove it from this list to avoid stale events. */ - if (EP_IS_LINKED(&epi->rdllink)) - EP_LIST_DEL(&epi->rdllink); + if (ep_is_linked(&epi->rdllink)) + ep_list_del(&epi->rdllink); error = 0; eexit_1: @@ -1174,8 +1199,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_ep_lock); - if (EP_IS_LINKED(&epi->fllink)) - EP_LIST_DEL(&epi->fllink); + if (ep_is_linked(&epi->fllink)) + ep_list_del(&epi->fllink); spin_unlock(&file->f_ep_lock); /* We need to acquire the write IRQ lock before calling ep_unlink() */ @@ -1210,7 +1235,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k { int pwake = 0; unsigned long flags; - struct epitem *epi = EP_ITEM_FROM_WAIT(wait); + struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n", @@ -1228,7 +1253,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k goto is_disabled; /* If this file is already in the ready list we exit soon */ - if (EP_IS_LINKED(&epi->rdllink)) + if (ep_is_linked(&epi->rdllink)) goto is_linked; list_add_tail(&epi->rdllink, &ep->rdllist); @@ -1307,7 +1332,7 @@ static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist lnk = lnk->next; /* If this file is already in the ready list we exit soon */ - if (!EP_IS_LINKED(&epi->txlink)) { + if (!ep_is_linked(&epi->txlink)) { /* * This is initialized in this way so that the default * behaviour of the reinjecting code will be to push back @@ -1322,7 +1347,7 @@ static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist /* * Unlink the item from the ready list. 
*/ - EP_LIST_DEL(&epi->rdllink); + ep_list_del(&epi->rdllink); } } @@ -1401,7 +1426,7 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist) epi = list_entry(txlist->next, struct epitem, txlink); /* Unlink the current item from the transfer list */ - EP_LIST_DEL(&epi->txlink); + ep_list_del(&epi->txlink); /* * If the item is no more linked to the interest set, we don't @@ -1410,8 +1435,8 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist) * item is set to have an Edge Triggered behaviour, we don't have * to push it back either. */ - if (EP_RB_LINKED(&epi->rbn) && !(epi->event.events & EPOLLET) && - (epi->revents & epi->event.events) && !EP_IS_LINKED(&epi->rdllink)) { + if (ep_rb_linked(&epi->rbn) && !(epi->event.events & EPOLLET) && + (epi->revents & epi->event.events) && !ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); ricnt++; } From 71a2224d7d1cefc23a1ac80bba421cc069cc3257 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 23 Jun 2005 00:10:05 -0700 Subject: [PATCH 156/199] [PATCH] Optimize sys_times for a single thread process Avoid taking the tasklist_lock in sys_times if the process is single threaded. In a NUMA system taking the tasklist_lock may cause a bouncing cacheline if multiple independent processes continually call sys_times to measure their performance. Signed-off-by: Christoph Lameter Signed-off-by: Shai Fultheim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 5 +++ kernel/sys.c | 86 +++++++++++++++++++++++++++++++++++---------------- 2 files changed, 65 insertions(+), 26 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index c2bdf6fb61a5..3ebcd60a19c6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -72,6 +72,11 @@ void release_task(struct task_struct * p) BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); __exit_signal(p); __exit_sighand(p); + /* + * Note that the fastpath in sys_times depends on __exit_signal having + * updated the counters before a task is removed from the tasklist of + * the process by __unhash_process. + */ __unhash_process(p); /* diff --git a/kernel/sys.c b/kernel/sys.c index 0a2c8cda9638..5a9d6b075016 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -894,35 +894,69 @@ asmlinkage long sys_times(struct tms __user * tbuf) */ if (tbuf) { struct tms tmp; - struct task_struct *tsk = current; - struct task_struct *t; cputime_t utime, stime, cutime, cstime; - read_lock(&tasklist_lock); - utime = tsk->signal->utime; - stime = tsk->signal->stime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - t = next_thread(t); - } while (t != tsk); - - /* - * While we have tasklist_lock read-locked, no dying thread - * can be updating current->signal->[us]time. Instead, - * we got their counts included in the live thread loop. - * However, another thread can come in right now and - * do a wait call that updates current->signal->c[us]time. - * To make sure we always see that pair updated atomically, - * we take the siglock around fetching them. - */ - spin_lock_irq(&tsk->sighand->siglock); - cutime = tsk->signal->cutime; - cstime = tsk->signal->cstime; - spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); +#ifdef CONFIG_SMP + if (thread_group_empty(current)) { + /* + * Single thread case without the use of any locks. + * + * We may race with release_task if two threads are + * executing. 
However, release_task first adds up the + * counters (__exit_signal) before removing the task + * from the process tasklist (__unhash_process). + * __exit_signal also acquires and releases the + * siglock which results in the proper memory ordering + * so that the list modifications are always visible + * after the counters have been updated. + * + * If the counters have been updated by the second thread + * but the thread has not yet been removed from the list + * then the other branch will be executing which will + * block on tasklist_lock until the exit handling of the + * other task is finished. + * + * This also implies that the sighand->siglock cannot + * be held by another processor. So we can also + * skip acquiring that lock. + */ + utime = cputime_add(current->signal->utime, current->utime); + stime = cputime_add(current->signal->stime, current->stime); + cutime = current->signal->cutime; + cstime = current->signal->cstime; + } else +#endif + { + + /* Process with multiple threads */ + struct task_struct *tsk = current; + struct task_struct *t; + read_lock(&tasklist_lock); + utime = tsk->signal->utime; + stime = tsk->signal->stime; + t = tsk; + do { + utime = cputime_add(utime, t->utime); + stime = cputime_add(stime, t->stime); + t = next_thread(t); + } while (t != tsk); + + /* + * While we have tasklist_lock read-locked, no dying thread + * can be updating current->signal->[us]time. Instead, + * we got their counts included in the live thread loop. + * However, another thread can come in right now and + * do a wait call that updates current->signal->c[us]time. + * To make sure we always see that pair updated atomically, + * we take the siglock around fetching them. + */ + spin_lock_irq(&tsk->sighand->siglock); + cutime = tsk->signal->cutime; + cstime = tsk->signal->cstime; + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + } tmp.tms_utime = cputime_to_clock_t(utime); tmp.tms_stime = cputime_to_clock_t(stime); tmp.tms_cutime = cputime_to_clock_t(cutime); From 0030cbf06c669b65e124414af51b5010fc53b760 Mon Sep 17 00:00:00 2001 From: Matthias Urlichs Date: Thu, 23 Jun 2005 00:10:05 -0700 Subject: [PATCH 157/199] [PATCH] Turn off sibling call optimization w/ frame pointers Frame pointers are supposed to enable debuggers to reliably tell where a call comes from. That is defeated by GCC's sibling call optimization (aka tail recursion elimination). This patch turns this optimization off when compiling with frame pointers. Signed-off-by: Matthias Urlichs Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0d1e74d50067..fad349724e99 100644 --- a/Makefile +++ b/Makefile @@ -518,7 +518,7 @@ CFLAGS += $(call add-align,CONFIG_CC_ALIGN_LOOPS,-loops) CFLAGS += $(call add-align,CONFIG_CC_ALIGN_JUMPS,-jumps) ifdef CONFIG_FRAME_POINTER -CFLAGS += -fno-omit-frame-pointer +CFLAGS += -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,) else CFLAGS += -fomit-frame-pointer endif From b78755abcdf1d7667c51580a3783e16e981ed926 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Thu, 23 Jun 2005 00:10:06 -0700 Subject: [PATCH 158/199] [PATCH] ipcsem: remove superfluous decrease variable from sys_semtimedop Patrick noticed that the initial scan of the semaphore operations logs decrease and increase operations separately, but then both cases are or'ed together and decrease is never used.
The attached patch removes the decrease parameter - it shrinks sys_semtimedop() by 56 bytes. Signed-off-by: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ipc/sem.c b/ipc/sem.c index 5ad7ac0ed60d..7e8a25c82ef3 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1054,7 +1054,7 @@ asmlinkage long sys_semtimedop(int semid, struct sembuf __user *tsops, struct sembuf fast_sops[SEMOPM_FAST]; struct sembuf* sops = fast_sops, *sop; struct sem_undo *un; - int undos = 0, decrease = 0, alter = 0, max; + int undos = 0, alter = 0, max; struct sem_queue queue; unsigned long jiffies_left = 0; @@ -1089,13 +1089,10 @@ asmlinkage long sys_semtimedop(int semid, struct sembuf __user *tsops, if (sop->sem_num >= max) max = sop->sem_num; if (sop->sem_flg & SEM_UNDO) - undos++; - if (sop->sem_op < 0) - decrease = 1; - if (sop->sem_op > 0) + undos = 1; + if (sop->sem_op != 0) alter = 1; } - alter |= decrease; retry_undos: if (undos) { From 17abee3d50685b8ef4e38d37abacf8b650a3a387 Mon Sep 17 00:00:00 2001 From: Thierry Vignaud Date: Thu, 23 Jun 2005 00:10:07 -0700 Subject: [PATCH 159/199] [PATCH] gconfig: only show scrollbars if needed gconfig: only show scrollbars if needed (which is more user friendly): Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/kconfig/gconf.glade | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/kconfig/gconf.glade b/scripts/kconfig/gconf.glade index 1e1736d81ee9..ace4706ab251 100644 --- a/scripts/kconfig/gconf.glade +++ b/scripts/kconfig/gconf.glade @@ -432,8 +432,8 @@ True - GTK_POLICY_ALWAYS - GTK_POLICY_ALWAYS + GTK_POLICY_AUTOMATIC + GTK_POLICY_AUTOMATIC GTK_SHADOW_IN GTK_CORNER_TOP_LEFT @@ -466,8 +466,8 @@ True - GTK_POLICY_ALWAYS - GTK_POLICY_ALWAYS + GTK_POLICY_AUTOMATIC + GTK_POLICY_AUTOMATIC GTK_SHADOW_IN GTK_CORNER_TOP_LEFT @@ -496,7 +496,7 @@ True GTK_POLICY_NEVER - GTK_POLICY_ALWAYS + GTK_POLICY_AUTOMATIC GTK_SHADOW_IN GTK_CORNER_TOP_LEFT From d9ad7ef1979d65a4200847ec91be5b9ad961eba8 Mon Sep 17 00:00:00 2001 From: TINNES Julien RD-MAPS-ISS Date: Thu, 23 Jun 2005 00:10:08 -0700 Subject: [PATCH 160/199] [PATCH] Potential null pointer dereference in amiga serial driver A pointer is dereferenced before it is null-checked.
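To illustrate the bug class being fixed, a minimal hypothetical sketch (the function names are invented for illustration and are not the actual amiserial code):

	/* Buggy: 'tty' is dereferenced in the initializer, before the
	 * NULL check runs, so a NULL tty crashes on the first line. */
	static void buggy_put_char(struct tty_struct *tty, unsigned char ch)
	{
		struct async_struct *info = (struct async_struct *)tty->driver_data;

		if (!tty || !info->xmit.buf)	/* too late: tty already used */
			return;
		/* ... */
	}

	/* Fixed: check the pointer before the first dereference. */
	static void fixed_put_char(struct tty_struct *tty, unsigned char ch)
	{
		struct async_struct *info;

		if (!tty)
			return;
		info = tty->driver_data;
		if (!info->xmit.buf)
			return;
		/* ... */
	}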
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/amiserial.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/char/amiserial.c b/drivers/char/amiserial.c index 1dc4259213a6..777bc499bbbd 100644 --- a/drivers/char/amiserial.c +++ b/drivers/char/amiserial.c @@ -861,13 +861,18 @@ static void change_speed(struct async_struct *info, static void rs_put_char(struct tty_struct *tty, unsigned char ch) { - struct async_struct *info = (struct async_struct *)tty->driver_data; + struct async_struct *info; unsigned long flags; + if (!tty) + return; + + info = tty->driver_data; + if (serial_paranoia_check(info, tty->name, "rs_put_char")) return; - if (!tty || !info->xmit.buf) + if (!info->xmit.buf) return; local_irq_save(flags); @@ -910,13 +915,18 @@ static void rs_flush_chars(struct tty_struct *tty) static int rs_write(struct tty_struct * tty, const unsigned char *buf, int count) { int c, ret = 0; - struct async_struct *info = (struct async_struct *)tty->driver_data; + struct async_struct *info; unsigned long flags; + if (!tty) + return 0; + + info = tty->driver_data; + if (serial_paranoia_check(info, tty->name, "rs_write")) return 0; - if (!tty || !info->xmit.buf || !tmp_buf) + if (!info->xmit.buf || !tmp_buf) return 0; local_save_flags(flags); From 7007cab53176dc906e67b4efa62f20ff1432805a Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Thu, 23 Jun 2005 00:10:09 -0700 Subject: [PATCH 161/199] [PATCH] Add offset.h to dontdiff include/asm/offset.h is a generated file on x86_64 and mips. Let's add it to Documentation/dontdiff. Signed-off-by: Michal Schmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dontdiff | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/dontdiff b/Documentation/dontdiff index 9a33bb94f74f..d4fda25db868 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -111,6 +111,7 @@ mkdep mktables modpost modversions.h* +offset.h offsets.h oui.c* parse.c* From fa912bcb06d5dc9525d8912a145db2bf4b7668c5 Mon Sep 17 00:00:00 2001 From: Daniel Ritz Date: Thu, 23 Jun 2005 00:10:12 -0700 Subject: [PATCH 162/199] [PATCH] yenta TI: turn off interrupts during card power-on #2 - make boot-up card recognition more reliable (ie. redo interrogation always if there is no valid 'card inserted' state) (and yes, i saw it happening on an o2micro controller that both CB_CBCARD and CB_16BITCARD bits were set at the same time) - also redo interrogation before probing the ISA interrupts. it's safer to do the probing with the socket in a clean state. - make card insert detect more reliable. yenta_get_status() now returns SS_PENDING as long as the card is not completely inserted and one of the voltage bits is set. also !CB_CBCARD doesn't mean CB_16BITCARD. there is CB_NOTACARD as well, so make an explicit check for CB_16BITCARD. - for TI bridges: disable IRQs during power-on. in all-serial and tied interrupt mode the interrupts are always disabled for single-slot controllers. for two-slot controllers the disabling is only done when the other slot is empty. to force disabling there is a new module parameter now: pwr_irqs_off=Y (which is a regression for working setups. that's why it's an option, only use when required) - modparm to disable ISA interrupt probing (isa_probe, defaults to on) - remove unneeded code/cleanups (ie.
merge yenta_events() into yenta_interrupt()) Signed-off-by: Daniel Ritz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/pcmcia/cs.c | 11 ++- drivers/pcmcia/ti113x.h | 167 ++++++++++++++++++++++++++++++++++ drivers/pcmcia/yenta_socket.c | 60 +++++++----- include/pcmcia/ss.h | 8 ++ 4 files changed, 223 insertions(+), 23 deletions(-) diff --git a/drivers/pcmcia/cs.c b/drivers/pcmcia/cs.c index 03fc885db1c5..d136b3c8fac9 100644 --- a/drivers/pcmcia/cs.c +++ b/drivers/pcmcia/cs.c @@ -508,6 +508,10 @@ static int socket_setup(struct pcmcia_socket *skt, int initial_delay) cs_err(skt, "unsupported voltage key.\n"); return CS_BAD_TYPE; } + + if (skt->power_hook) + skt->power_hook(skt, HOOK_POWER_PRE); + skt->socket.flags = 0; skt->ops->set_socket(skt, &skt->socket); @@ -522,7 +526,12 @@ static int socket_setup(struct pcmcia_socket *skt, int initial_delay) return CS_BAD_TYPE; } - return socket_reset(skt); + status = socket_reset(skt); + + if (skt->power_hook) + skt->power_hook(skt, HOOK_POWER_POST); + + return status; } /* diff --git a/drivers/pcmcia/ti113x.h b/drivers/pcmcia/ti113x.h index a8a1d104524a..c7ba99871aca 100644 --- a/drivers/pcmcia/ti113x.h +++ b/drivers/pcmcia/ti113x.h @@ -611,6 +611,170 @@ static void ti12xx_irqroute_func1(struct yenta_socket *socket) } } + +/* Returns true value if the second slot of a two-slot controller is empty */ +static int ti12xx_2nd_slot_empty(struct yenta_socket *socket) +{ + struct pci_dev *func; + struct yenta_socket *slot2; + int devfn; + unsigned int state; + int ret = 1; + + /* catch the two-slot controllers */ + switch (socket->dev->device) { + case PCI_DEVICE_ID_TI_1220: + case PCI_DEVICE_ID_TI_1221: + case PCI_DEVICE_ID_TI_1225: + case PCI_DEVICE_ID_TI_1251A: + case PCI_DEVICE_ID_TI_1251B: + case PCI_DEVICE_ID_TI_1420: + case PCI_DEVICE_ID_TI_1450: + case PCI_DEVICE_ID_TI_1451A: + case PCI_DEVICE_ID_TI_1520: + case PCI_DEVICE_ID_TI_1620: + case PCI_DEVICE_ID_TI_4520: + case PCI_DEVICE_ID_TI_4450: + case PCI_DEVICE_ID_TI_4451: + /* + * there are way more, but they need to be added in yenta_socket.c + * and pci_ids.h first anyway. + */ + break; + + /* single-slot controllers have the 2nd slot empty always :) */ + default: + return 1; + } + + /* get other slot */ + devfn = socket->dev->devfn & ~0x07; + func = pci_get_slot(socket->dev->bus, + (socket->dev->devfn & 0x07) ? devfn : devfn | 0x01); + if (!func) + return 1; + + slot2 = pci_get_drvdata(func); + if (!slot2) + goto out; + + /* check state */ + yenta_get_status(&slot2->socket, &state); + if (state & SS_DETECT) { + ret = 0; + goto out; + } + +out: + pci_dev_put(func); + return ret; +} + +/* + * TI specific parts for the power hook. + * + * some TI's with some CB's produce an interrupt storm on power on. it has been + * seen with atheros wlan cards on TI1225 and TI1410. solution is simply to + * disable any CB interrupts during this time. + */ +static int ti12xx_power_hook(struct pcmcia_socket *sock, int operation) +{ + struct yenta_socket *socket = container_of(sock, struct yenta_socket, socket); + u32 mfunc, devctl, sysctl; + u8 gpio3; + + /* only POWER_PRE and POWER_POST are interesting */ + if ((operation != HOOK_POWER_PRE) && (operation != HOOK_POWER_POST)) + return 0; + + devctl = config_readb(socket, TI113X_DEVICE_CONTROL); + sysctl = config_readl(socket, TI113X_SYSTEM_CONTROL); + mfunc = config_readl(socket, TI122X_MFUNC); + + /* + * all serial/tied: only disable when modparm set.
always doing it + * would mean a regression for working setups 'cos it disables the + * interrupts for both slots on 2-slot controllers + * (and users of single slot controllers where it's safe have to + * live with setting the modparm, most don't have to anyway) + */ + if (((devctl & TI113X_DCR_IMODE_MASK) == TI12XX_DCR_IMODE_ALL_SERIAL) && + (pwr_irqs_off || ti12xx_2nd_slot_empty(socket))) { + switch (socket->dev->device) { + case PCI_DEVICE_ID_TI_1250: + case PCI_DEVICE_ID_TI_1251A: + case PCI_DEVICE_ID_TI_1251B: + case PCI_DEVICE_ID_TI_1450: + case PCI_DEVICE_ID_TI_1451A: + case PCI_DEVICE_ID_TI_4450: + case PCI_DEVICE_ID_TI_4451: + /* these chips have no IRQSER setting in MFUNC3 */ + break; + + default: + if (operation == HOOK_POWER_PRE) + mfunc = (mfunc & ~TI122X_MFUNC3_MASK); + else + mfunc = (mfunc & ~TI122X_MFUNC3_MASK) | TI122X_MFUNC3_IRQSER; + config_writel(socket, TI122X_MFUNC, mfunc); + } + + return 0; + } + + /* do the job differently for func0/1 */ + if ((PCI_FUNC(socket->dev->devfn) == 0) || + ((sysctl & TI122X_SCR_INTRTIE) && + (pwr_irqs_off || ti12xx_2nd_slot_empty(socket)))) { + /* some bridges are different */ + switch (socket->dev->device) { + case PCI_DEVICE_ID_TI_1250: + case PCI_DEVICE_ID_TI_1251A: + case PCI_DEVICE_ID_TI_1251B: + case PCI_DEVICE_ID_TI_1450: + /* those oldies use gpio3 for INTA */ + gpio3 = config_readb(socket, TI1250_GPIO3_CONTROL); + if (operation == HOOK_POWER_PRE) + gpio3 = (gpio3 & ~TI1250_GPIO_MODE_MASK) | 0x40; + else + gpio3 &= ~TI1250_GPIO_MODE_MASK; + config_writeb(socket, TI1250_GPIO3_CONTROL, gpio3); + break; + + default: + /* all new bridges are the same */ + if (operation == HOOK_POWER_PRE) + mfunc &= ~TI122X_MFUNC0_MASK; + else + mfunc |= TI122X_MFUNC0_INTA; + config_writel(socket, TI122X_MFUNC, mfunc); + } + } else { + switch (socket->dev->device) { + case PCI_DEVICE_ID_TI_1251A: + case PCI_DEVICE_ID_TI_1251B: + case PCI_DEVICE_ID_TI_1450: + /* those have INTA elsewhere and INTB in MFUNC0 */ + if (operation == HOOK_POWER_PRE) + mfunc &= ~TI122X_MFUNC0_MASK; + else + mfunc |= TI125X_MFUNC0_INTB; + config_writel(socket, TI122X_MFUNC, mfunc); + + break; + + default: + /* all new bridges are the same */ + if (operation == HOOK_POWER_PRE) + mfunc &= ~TI122X_MFUNC1_MASK; + else + mfunc |= TI122X_MFUNC1_INTB; + config_writel(socket, TI122X_MFUNC, mfunc); + } + } + + return 0; +} + static int ti12xx_override(struct yenta_socket *socket) { u32 val, val_orig; @@ -654,6 +818,9 @@ static int ti12xx_override(struct yenta_socket *socket) else ti12xx_irqroute_func1(socket); + /* install power hook */ + socket->socket.power_hook = ti12xx_power_hook; + return ti_override(socket); } diff --git a/drivers/pcmcia/yenta_socket.c b/drivers/pcmcia/yenta_socket.c index 6404d97a12eb..bee05362fd24 100644 --- a/drivers/pcmcia/yenta_socket.c +++ b/drivers/pcmcia/yenta_socket.c @@ -32,6 +32,14 @@ static int disable_clkrun; module_param(disable_clkrun, bool, 0444); MODULE_PARM_DESC(disable_clkrun, "If PC card doesn't function properly, please try this option"); +static int isa_probe = 1; +module_param(isa_probe, bool, 0444); +MODULE_PARM_DESC(isa_probe, "If set ISA interrupts are probed (default). Set to N to disable probing"); + +static int pwr_irqs_off; +module_param(pwr_irqs_off, bool, 0644); +MODULE_PARM_DESC(pwr_irqs_off, "Force IRQs off during power-on of slot. Use only when seeing IRQ storms!"); + #if 0 #define debug(x,args...)
printk(KERN_DEBUG "%s: " x, __func__ , ##args) #else @@ -150,15 +158,16 @@ static int yenta_get_status(struct pcmcia_socket *sock, unsigned int *value) val = (state & CB_3VCARD) ? SS_3VCARD : 0; val |= (state & CB_XVCARD) ? SS_XVCARD : 0; - val |= (state & (CB_CDETECT1 | CB_CDETECT2 | CB_5VCARD | CB_3VCARD - | CB_XVCARD | CB_YVCARD)) ? 0 : SS_PENDING; + val |= (state & (CB_5VCARD | CB_3VCARD | CB_XVCARD | CB_YVCARD)) ? 0 : SS_PENDING; + val |= (state & (CB_CDETECT1 | CB_CDETECT2)) ? SS_PENDING : 0; + if (state & CB_CBCARD) { val |= SS_CARDBUS; val |= (state & CB_CARDSTS) ? SS_STSCHG : 0; val |= (state & (CB_CDETECT1 | CB_CDETECT2)) ? 0 : SS_DETECT; val |= (state & CB_PWRCYCLE) ? SS_POWERON | SS_READY : 0; - } else { + } else if (state & CB_16BITCARD) { u8 status = exca_readb(socket, I365_STATUS); val |= ((status & I365_CS_DETECT) == I365_CS_DETECT) ? SS_DETECT : 0; if (exca_readb(socket, I365_INTCTL) & I365_PC_IOCARD) { @@ -405,11 +414,13 @@ static int yenta_set_mem_map(struct pcmcia_socket *sock, struct pccard_mem_map * } -static unsigned int yenta_events(struct yenta_socket *socket) + +static irqreturn_t yenta_interrupt(int irq, void *dev_id, struct pt_regs *regs) { + unsigned int events; + struct yenta_socket *socket = (struct yenta_socket *) dev_id; u8 csc; u32 cb_event; - unsigned int events; /* Clear interrupt status for the event */ cb_event = cb_readl(socket, CB_SOCKET_EVENT); @@ -426,20 +437,13 @@ static unsigned int yenta_events(struct yenta_socket *socket) events |= (csc & I365_CSC_BVD2) ? SS_BATWARN : 0; events |= (csc & I365_CSC_READY) ? SS_READY : 0; } - return events; -} - - -static irqreturn_t yenta_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - unsigned int events; - struct yenta_socket *socket = (struct yenta_socket *) dev_id; - events = yenta_events(socket); - if (events) { + if (events) pcmcia_parse_events(&socket->socket, events); + + if (cb_event || csc) return IRQ_HANDLED; - } + return IRQ_NONE; } @@ -470,11 +474,22 @@ static void yenta_clear_maps(struct yenta_socket *socket) } } +/* redoes voltage interrogation if required */ +static void yenta_interrogate(struct yenta_socket *socket) +{ + u32 state; + + state = cb_readl(socket, CB_SOCKET_STATE); + if (!(state & (CB_5VCARD | CB_3VCARD | CB_XVCARD | CB_YVCARD)) || + (state & (CB_CDETECT1 | CB_CDETECT2 | CB_NOTACARD | CB_BADVCCREQ)) || + ((state & (CB_16BITCARD | CB_CBCARD)) == (CB_16BITCARD | CB_CBCARD))) + cb_writel(socket, CB_SOCKET_FORCE, CB_CVSTEST); +} + /* Called at resume and initialization events */ static int yenta_sock_init(struct pcmcia_socket *sock) { struct yenta_socket *socket = container_of(sock, struct yenta_socket, socket); - u32 state; u16 bridge; bridge = config_readw(socket, CB_BRIDGE_CONTROL) & ~CB_BRIDGE_INTR; @@ -486,10 +501,7 @@ static int yenta_sock_init(struct pcmcia_socket *sock) exca_writeb(socket, I365_GENCTL, 0x00); /* Redo card voltage interrogation */ - state = cb_readl(socket, CB_SOCKET_STATE); - if (!(state & (CB_CDETECT1 | CB_CDETECT2 | CB_5VCARD | - CB_3VCARD | CB_XVCARD | CB_YVCARD))) - cb_writel(socket, CB_SOCKET_FORCE, CB_CVSTEST); + yenta_interrogate(socket); yenta_clear_maps(socket); @@ -856,7 +868,10 @@ static void yenta_get_socket_capabilities(struct yenta_socket *socket, u32 isa_i socket->socket.features |= SS_CAP_PAGE_REGS | SS_CAP_PCCARD | SS_CAP_CARDBUS; socket->socket.map_size = 0x1000; socket->socket.pci_irq = socket->cb_irq; - socket->socket.irq_mask = yenta_probe_irq(socket, isa_irq_mask); + if (isa_probe) + socket->socket.irq_mask = 
yenta_probe_irq(socket, isa_irq_mask); + else + socket->socket.irq_mask = 0; socket->socket.cb_dev = socket->dev; printk(KERN_INFO "Yenta: ISA IRQ mask 0x%04x, PCI irq %d\n", @@ -996,6 +1011,7 @@ static int __devinit yenta_probe (struct pci_dev *dev, const struct pci_device_i } /* Figure out what the dang thing can do for the PCMCIA layer... */ + yenta_interrogate(socket); yenta_get_socket_capabilities(socket, isa_interrupts); printk(KERN_INFO "Socket status: %08x\n", cb_readl(socket, CB_SOCKET_STATE)); diff --git a/include/pcmcia/ss.h b/include/pcmcia/ss.h index 6d3413a56708..67b867f31fe4 100644 --- a/include/pcmcia/ss.h +++ b/include/pcmcia/ss.h @@ -77,6 +77,11 @@ extern socket_state_t dead_socket; /* Use this just for bridge windows */ #define MAP_IOSPACE 0x20 +/* power hook operations */ +#define HOOK_POWER_PRE 0x01 +#define HOOK_POWER_POST 0x02 + + typedef struct pccard_io_map { u_char map; u_char flags; @@ -222,6 +227,9 @@ struct pcmcia_socket { /* Zoom video behaviour is so chip specific its not worth adding this to _ops */ void (*zoom_video)(struct pcmcia_socket *, int); + + /* so is power hook */ + int (*power_hook)(struct pcmcia_socket *sock, int operation); /* state thread */ struct semaphore skt_sem; /* protects socket h/w state */ From 0d77e5a2c23da734f5a7925f64afa1c2ed92e0f9 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 23 Jun 2005 00:10:14 -0700 Subject: [PATCH 163/199] [PATCH] compat: introduce compat_time_t This patch is based on work by Carlos O'Donell and Matthew Wilcox. It introduces/updates the compat_time_t type and uses it for compat siginfo structures. I have built this on ppc64 and x86_64. Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/ia32/ia32priv.h | 2 +- arch/s390/kernel/compat_linux.h | 2 +- arch/sparc64/kernel/signal32.c | 2 +- include/asm-ia64/compat.h | 1 + include/asm-mips/compat.h | 1 + include/asm-parisc/compat.h | 2 +- include/asm-ppc64/compat.h | 1 + include/asm-ppc64/ppc32.h | 2 +- include/asm-sparc64/compat.h | 1 + include/asm-x86_64/ia32.h | 2 +- 10 files changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/ia64/ia32/ia32priv.h b/arch/ia64/ia32/ia32priv.h index b2de948bdaea..e3e9290e3ff2 100644 --- a/arch/ia64/ia32/ia32priv.h +++ b/arch/ia64/ia32/ia32priv.h @@ -241,7 +241,7 @@ typedef struct compat_siginfo { /* POSIX.1b timers */ struct { - timer_t _tid; /* timer id */ + compat_timer_t _tid; /* timer id */ int _overrun; /* overrun count */ char _pad[sizeof(unsigned int) - sizeof(int)]; compat_sigval_t _sigval; /* same as below */ diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h index bf33dcfec7db..3898f66d0b2f 100644 --- a/arch/s390/kernel/compat_linux.h +++ b/arch/s390/kernel/compat_linux.h @@ -45,7 +45,7 @@ typedef struct compat_siginfo { /* POSIX.1b timers */ struct { - timer_t _tid; /* timer id */ + compat_timer_t _tid; /* timer id */ int _overrun; /* overrun count */ compat_sigval_t _sigval; /* same as below */ int _sys_private; /* not to be passed to user */ diff --git a/arch/sparc64/kernel/signal32.c b/arch/sparc64/kernel/signal32.c index 9a375e975cff..f28428f4170e 100644 --- a/arch/sparc64/kernel/signal32.c +++ b/arch/sparc64/kernel/signal32.c @@ -102,7 +102,7 @@ typedef struct compat_siginfo{ /* POSIX.1b timers */ struct { - timer_t _tid; /* timer id */ + compat_timer_t _tid; /* timer id */ int _overrun; /* overrun count */ compat_sigval_t _sigval; /* same as below */ int _sys_private; /* not to be passed to user */ diff --git 
a/include/asm-ia64/compat.h b/include/asm-ia64/compat.h index cc0ff0a4bdd0..0c05e5bad8a0 100644 --- a/include/asm-ia64/compat.h +++ b/include/asm-ia64/compat.h @@ -27,6 +27,7 @@ typedef u16 compat_ipc_pid_t; typedef s32 compat_daddr_t; typedef u32 compat_caddr_t; typedef __kernel_fsid_t compat_fsid_t; +typedef s32 compat_timer_t; typedef s32 compat_int_t; typedef s32 compat_long_t; diff --git a/include/asm-mips/compat.h b/include/asm-mips/compat.h index dce92079e7fc..d78002afb1e1 100644 --- a/include/asm-mips/compat.h +++ b/include/asm-mips/compat.h @@ -29,6 +29,7 @@ typedef s32 compat_caddr_t; typedef struct { s32 val[2]; } compat_fsid_t; +typedef s32 compat_timer_t; typedef s32 compat_int_t; typedef s32 compat_long_t; diff --git a/include/asm-parisc/compat.h b/include/asm-parisc/compat.h index ca0eac647a05..7630d1ad2391 100644 --- a/include/asm-parisc/compat.h +++ b/include/asm-parisc/compat.h @@ -24,7 +24,7 @@ typedef u16 compat_nlink_t; typedef u16 compat_ipc_pid_t; typedef s32 compat_daddr_t; typedef u32 compat_caddr_t; -typedef u32 compat_timer_t; +typedef s32 compat_timer_t; typedef s32 compat_int_t; typedef s32 compat_long_t; diff --git a/include/asm-ppc64/compat.h b/include/asm-ppc64/compat.h index 09c28d28ce6c..12414f5fc666 100644 --- a/include/asm-ppc64/compat.h +++ b/include/asm-ppc64/compat.h @@ -26,6 +26,7 @@ typedef s32 compat_daddr_t; typedef u32 compat_caddr_t; typedef __kernel_fsid_t compat_fsid_t; typedef s32 compat_key_t; +typedef s32 compat_timer_t; typedef s32 compat_int_t; typedef s32 compat_long_t; diff --git a/include/asm-ppc64/ppc32.h b/include/asm-ppc64/ppc32.h index 1d0404897550..6b44a8caf395 100644 --- a/include/asm-ppc64/ppc32.h +++ b/include/asm-ppc64/ppc32.h @@ -32,7 +32,7 @@ typedef struct compat_siginfo { /* POSIX.1b timers */ struct { - timer_t _tid; /* timer id */ + compat_timer_t _tid; /* timer id */ int _overrun; /* overrun count */ compat_sigval_t _sigval; /* same as below */ int _sys_private; /* not to be passed to user */ diff --git a/include/asm-sparc64/compat.h b/include/asm-sparc64/compat.h index 22f58055b8ab..b59122dd176d 100644 --- a/include/asm-sparc64/compat.h +++ b/include/asm-sparc64/compat.h @@ -25,6 +25,7 @@ typedef s32 compat_daddr_t; typedef u32 compat_caddr_t; typedef __kernel_fsid_t compat_fsid_t; typedef s32 compat_key_t; +typedef s32 compat_timer_t; typedef s32 compat_int_t; typedef s32 compat_long_t; diff --git a/include/asm-x86_64/ia32.h b/include/asm-x86_64/ia32.h index c0a7717923ed..6efa00fe4e7b 100644 --- a/include/asm-x86_64/ia32.h +++ b/include/asm-x86_64/ia32.h @@ -94,7 +94,7 @@ typedef struct compat_siginfo{ /* POSIX.1b timers */ struct { - int _tid; /* timer id */ + compat_timer_t _tid; /* timer id */ int _overrun; /* overrun count */ compat_sigval_t _sigval; /* same as below */ int _sys_private; /* not to be passed to user */ From bb93e3a52f8db7210258a1a2134cced0b78a46e1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Jun 2005 00:10:15 -0700 Subject: [PATCH 164/199] [PATCH] block: add unlocked_ioctl support for block devices This patch allows block device drivers to convert their ioctl functions to unlocked_ioctl() like character devices and other subsystems. All functions that were called with the BKL held before are still used that way, but I would not be surprised if it could be removed from the ioctl functions in drivers/block/ioctl.c themselves. As a side note, I found that compat_blkdev_ioctl() acquires the BKL as well, which looks like a bug. 
I have checked that every user of disk->fops->compat_ioctl() in the current git tree gets the BKL itself, so it could easily be removed from compat_blkdev_ioctl(). Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/ioctl.c | 74 +++++++++++++++++++++++++++++++------------ fs/block_dev.c | 5 ++- include/linux/fs.h | 1 + 3 files changed, 57 insertions(+), 23 deletions(-) diff --git a/drivers/block/ioctl.c b/drivers/block/ioctl.c index 6d7bcc9da9e7..6e278474f9a8 100644 --- a/drivers/block/ioctl.c +++ b/drivers/block/ioctl.c @@ -133,11 +133,9 @@ static int put_u64(unsigned long arg, u64 val) return put_user(val, (u64 __user *)arg); } -int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd, - unsigned long arg) +static int blkdev_locked_ioctl(struct file *file, struct block_device *bdev, + unsigned cmd, unsigned long arg) { - struct block_device *bdev = inode->i_bdev; - struct gendisk *disk = bdev->bd_disk; struct backing_dev_info *bdi; int ret, n; @@ -190,36 +188,72 @@ int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd, return put_ulong(arg, bdev->bd_inode->i_size >> 9); case BLKGETSIZE64: return put_u64(arg, bdev->bd_inode->i_size); + } + return -ENOIOCTLCMD; +} + +static int blkdev_driver_ioctl(struct inode *inode, struct file *file, + struct gendisk *disk, unsigned cmd, unsigned long arg) +{ + int ret; + if (disk->fops->unlocked_ioctl) + return disk->fops->unlocked_ioctl(file, cmd, arg); + + if (disk->fops->ioctl) { + lock_kernel(); + ret = disk->fops->ioctl(inode, file, cmd, arg); + unlock_kernel(); + return ret; + } + + return -ENOTTY; +} + +int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd, + unsigned long arg) +{ + struct block_device *bdev = inode->i_bdev; + struct gendisk *disk = bdev->bd_disk; + int ret, n; + + switch(cmd) { case BLKFLSBUF: if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (disk->fops->ioctl) { - ret = disk->fops->ioctl(inode, file, cmd, arg); - /* -EINVAL to handle old uncorrected drivers */ - if (ret != -EINVAL && ret != -ENOTTY) - return ret; - } + + ret = blkdev_driver_ioctl(inode, file, disk, cmd, arg); + /* -EINVAL to handle old uncorrected drivers */ + if (ret != -EINVAL && ret != -ENOTTY) + return ret; + + lock_kernel(); fsync_bdev(bdev); invalidate_bdev(bdev, 0); + unlock_kernel(); return 0; + case BLKROSET: - if (disk->fops->ioctl) { - ret = disk->fops->ioctl(inode, file, cmd, arg); - /* -EINVAL to handle old uncorrected drivers */ - if (ret != -EINVAL && ret != -ENOTTY) - return ret; - } + ret = blkdev_driver_ioctl(inode, file, disk, cmd, arg); + /* -EINVAL to handle old uncorrected drivers */ + if (ret != -EINVAL && ret != -ENOTTY) + return ret; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (get_user(n, (int __user *)(arg))) return -EFAULT; + lock_kernel(); set_device_ro(bdev, n); + unlock_kernel(); return 0; - default: - if (disk->fops->ioctl) - return disk->fops->ioctl(inode, file, cmd, arg); } - return -ENOTTY; + + lock_kernel(); + ret = blkdev_locked_ioctl(file, bdev, cmd, arg); + unlock_kernel(); + if (ret != -ENOIOCTLCMD) + return ret; + + return blkdev_driver_ioctl(inode, file, disk, cmd, arg); } /* Most of the generic ioctls are handled in the normal fallback path. 
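With the new hook in place, converting a driver looks roughly like the following minimal sketch (the driver and the MYDRV_GET_FOO command are invented for illustration; only the .unlocked_ioctl member itself comes from this patch):

	/* Hypothetical block driver handling its ioctls without the BKL. */
	static long mydrv_unlocked_ioctl(struct file *file, unsigned cmd,
					 unsigned long arg)
	{
		switch (cmd) {
		case MYDRV_GET_FOO:			/* invented command */
			return put_user(42, (int __user *)arg);
		default:
			return -ENOTTY;		/* generic code takes over */
		}
	}

	static struct block_device_operations mydrv_fops = {
		.owner		= THIS_MODULE,
		.unlocked_ioctl	= mydrv_unlocked_ioctl,	/* preferred over .ioctl */
	};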
diff --git a/fs/block_dev.c b/fs/block_dev.c index c0cbd1bc1a02..e0df94c37b7e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -777,8 +777,7 @@ static ssize_t blkdev_file_aio_write(struct kiocb *iocb, const char __user *buf, return generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); } -static int block_ioctl(struct inode *inode, struct file *file, unsigned cmd, - unsigned long arg) +static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) { return blkdev_ioctl(file->f_mapping->host, file, cmd, arg); } @@ -803,7 +802,7 @@ struct file_operations def_blk_fops = { .aio_write = blkdev_file_aio_write, .mmap = generic_file_mmap, .fsync = block_fsync, - .ioctl = block_ioctl, + .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_blkdev_ioctl, #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 3622e952e98c..9b1278e21279 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -884,6 +884,7 @@ struct block_device_operations { int (*open) (struct inode *, struct file *); int (*release) (struct inode *, struct file *); int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long); + long (*unlocked_ioctl) (struct file *, unsigned, unsigned long); long (*compat_ioctl) (struct file *, unsigned, unsigned long); int (*media_changed) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); From 280dedb8d64ccfe1166ae03d3b254fc3b65de6a5 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 23 Jun 2005 00:10:16 -0700 Subject: [PATCH 165/199] [PATCH] PCDP: handle tables that don't supply baud rate The HCDP specs (i.e., PCDP revision < 3) allow zero as a default value for baud rate and data bits. So if firmware doesn't supply them, let early_serial_console_init() probe for them rather than telling it the baud rate is zero. Also, update the URL for the PCDP spec. Signed-off-by: Bjorn Helgaas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firmware/pcdp.c | 11 +++++++---- drivers/firmware/pcdp.h | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/firmware/pcdp.c b/drivers/firmware/pcdp.c index df1b721154d2..839b44a7e08b 100644 --- a/drivers/firmware/pcdp.c +++ b/drivers/firmware/pcdp.c @@ -23,12 +23,15 @@ setup_serial_console(struct pcdp_uart *uart) { #ifdef CONFIG_SERIAL_8250_CONSOLE int mmio; - static char options[64]; + static char options[64], *p = options; mmio = (uart->addr.address_space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY); - snprintf(options, sizeof(options), "console=uart,%s,0x%lx,%lun%d", - mmio ? "mmio" : "io", uart->addr.address, uart->baud, - uart->bits ? uart->bits : 8); + p += sprintf(p, "console=uart,%s,0x%lx", + mmio ? "mmio" : "io", uart->addr.address); + if (uart->baud) + p += sprintf(p, ",%lu", uart->baud); + if (uart->bits) + p += sprintf(p, "n%d", uart->bits); return early_serial_console_init(options); #else diff --git a/drivers/firmware/pcdp.h b/drivers/firmware/pcdp.h index 863bb6f768c3..1dc7c88b7b4d 100644 --- a/drivers/firmware/pcdp.h +++ b/drivers/firmware/pcdp.h @@ -2,7 +2,7 @@ * Definitions for PCDP-defined console devices * * v1.0a: http://www.dig64.org/specifications/DIG64_HCDPv10a_01.pdf - * v2.0: http://www.dig64.org/specifications/DIG64_HCDPv20_042804.pdf + * v2.0: http://www.dig64.org/specifications/DIG64_PCDPv20.pdf * * (c) Copyright 2002, 2004 Hewlett-Packard Development Company, L.P. 
* Khalid Aziz From 45778ca819accab1a4a3378b3566cab0f189164f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 23 Jun 2005 00:10:17 -0700 Subject: [PATCH 166/199] [PATCH] Remove f_error field from struct file The following patch removes the f_error field and all checks of f_error. Trond said: f_error was introduced for NFS, and made sense when we were guaranteed always to have a file pointer around when write errors occurred. Since then, we have (for various reasons) had to introduce the nfs_open_context in order to track the file read/write state, and it made sense to move our f_error tracking there too. Signed-off-by: Christoph Lameter Acked-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfs/direct.c | 5 ----- fs/open.c | 16 ++++------------ include/linux/fs.h | 1 - mm/filemap.c | 6 ------ 4 files changed, 4 insertions(+), 24 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index d6a30c844de3..6537f2c4ae44 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -751,11 +751,6 @@ nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, retval = -EFAULT; if (!access_ok(VERIFY_READ, iov.iov_base, iov.iov_len)) goto out; - if (file->f_error) { - retval = file->f_error; - file->f_error = 0; - goto out; - } retval = -EFBIG; if (limit != RLIM_INFINITY) { if (pos >= limit) { diff --git a/fs/open.c b/fs/open.c index 2ebb72c1a876..5dd411b084bf 100644 --- a/fs/open.c +++ b/fs/open.c @@ -981,23 +981,15 @@ asmlinkage long sys_creat(const char __user * pathname, int mode) */ int filp_close(struct file *filp, fl_owner_t id) { - int retval; - - /* Report and clear outstanding errors */ - retval = filp->f_error; - if (retval) - filp->f_error = 0; + int retval = 0; if (!file_count(filp)) { printk(KERN_ERR "VFS: Close: file count is 0\n"); - return retval; + return 0; } - if (filp->f_op && filp->f_op->flush) { - int err = filp->f_op->flush(filp); - if (!retval) - retval = err; - } + if (filp->f_op && filp->f_op->flush) + retval = filp->f_op->flush(filp); dnotify_flush(filp, id); locks_remove_posix(filp, id); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9b1278e21279..517bf4966bf5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -581,7 +581,6 @@ struct file { atomic_t f_count; unsigned int f_flags; mode_t f_mode; - int f_error; loff_t f_pos; struct fown_struct f_owner; unsigned int f_uid, f_gid; diff --git a/mm/filemap.c b/mm/filemap.c index 4a2fee2cb62b..a3598b542a31 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1827,12 +1827,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i if (unlikely(*pos < 0)) return -EINVAL; - if (unlikely(file->f_error)) { - int err = file->f_error; - file->f_error = 0; - return err; - } - if (!isblk) { /* FIXME: this is for backwards compatibility with 2.4 */ if (file->f_flags & O_APPEND) From 30aaa80885df5d1518d498a450e73282cea8d8b1 Mon Sep 17 00:00:00 2001 From: William Lee Irwin III Date: Thu, 23 Jun 2005 00:10:18 -0700 Subject: [PATCH 167/199] [PATCH] use drivers/Kconfig for sparc32 Kconfig is spitting out massive numbers of errors and so on. This patch switches arch/sparc/Kconfig to use drivers/Kconfig so those stop. 
Signed-off-by: William Irwin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/Kconfig | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 310d3f0c445e..262e13d086fe 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -264,7 +264,11 @@ config SUNOS_EMUL want to run SunOS binaries on an Ultra you must also say Y to "Kernel support for 32-bit a.out binaries" above. -source "drivers/parport/Kconfig" +source "mm/Kconfig" + +endmenu + +source "drivers/Kconfig" config PRINTER tristate "Parallel printer support" @@ -374,18 +378,8 @@ config UNIX98_PTY_COUNT endmenu -source "drivers/input/Kconfig" - source "fs/Kconfig" -source "sound/Kconfig" - -source "drivers/usb/Kconfig" - -source "drivers/infiniband/Kconfig" - -source "drivers/char/watchdog/Kconfig" - source "arch/sparc/Kconfig.debug" source "security/Kconfig" From f9fd27a253d5e0b23531d12ce7ad15b6535d4486 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 Jun 2005 00:10:19 -0700 Subject: [PATCH 168/199] [PATCH] acl endianness annotations Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/posix_acl_xattr.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index 5efd0a6dad94..fe271c1947b2 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -23,13 +23,13 @@ #define ACL_UNDEFINED_ID (-1) typedef struct { - __u16 e_tag; - __u16 e_perm; - __u32 e_id; + __le16 e_tag; + __le16 e_perm; + __le32 e_id; } posix_acl_xattr_entry; typedef struct { - __u32 a_version; + __le32 a_version; posix_acl_xattr_entry a_entries[0]; } posix_acl_xattr_header; From 9a59f452abe11f569e13ec16c51e6d61c54b9838 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 Jun 2005 00:10:19 -0700 Subject: [PATCH 169/199] [PATCH] remove <linux/xattr_acl.h> This file duplicates <linux/posix_acl_xattr.h>, using slightly different names.
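A brief aside on the __le16/__le32 annotations in the endianness patch above: they mark the xattr fields as little-endian on disk, so consumers must convert explicitly. A minimal hypothetical sketch (the helper is invented, not part of either patch):

	#include <linux/posix_acl_xattr.h>
	#include <asm/byteorder.h>

	/* Invented helper: decode one entry portably. le16_to_cpu() and
	 * le32_to_cpu() byte-swap on big-endian machines and compile away
	 * on little-endian ones; a bare assignment from e_tag to a u16
	 * would now draw a sparse warning thanks to the annotations. */
	static void read_acl_entry(const posix_acl_xattr_entry *e,
				   u16 *tag, u32 *id)
	{
		*tag = le16_to_cpu(e->e_tag);
		*id  = le32_to_cpu(e->e_id);
	}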
Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc/boot/simple/misc.c | 2 +- arch/ppc/boot/simple/mpc10x_memory.c | 2 +- fs/ext2/acl.c | 12 ++++++------ fs/ext2/acl.h | 2 +- fs/ext3/acl.c | 12 ++++++------ fs/ext3/acl.h | 2 +- fs/jfs/acl.c | 11 ++++++----- fs/jfs/jfs_acl.h | 2 -- fs/jfs/super.c | 1 + fs/jfs/xattr.c | 7 ++++--- fs/nfsd/vfs.c | 9 ++++----- fs/reiserfs/xattr_acl.c | 26 +++++++++++++------------- include/linux/posix_acl_xattr.h | 3 +++ include/linux/reiserfs_acl.h | 1 - 14 files changed, 47 insertions(+), 45 deletions(-) diff --git a/arch/ppc/boot/simple/misc.c b/arch/ppc/boot/simple/misc.c index ab0f9902cb67..e02de5b467a4 100644 --- a/arch/ppc/boot/simple/misc.c +++ b/arch/ppc/boot/simple/misc.c @@ -222,7 +222,7 @@ decompress_kernel(unsigned long load_addr, int num_words, unsigned long cksum) puts("\n"); puts("Uncompressing Linux..."); - gunzip(0x0, 0x400000, zimage_start, &zimage_size); + gunzip(NULL, 0x400000, zimage_start, &zimage_size); puts("done.\n"); /* get the bi_rec address */ diff --git a/arch/ppc/boot/simple/mpc10x_memory.c b/arch/ppc/boot/simple/mpc10x_memory.c index 977daedc14c0..20d92a34ceb8 100644 --- a/arch/ppc/boot/simple/mpc10x_memory.c +++ b/arch/ppc/boot/simple/mpc10x_memory.c @@ -33,7 +33,7 @@ #define MPC10X_PCI_OP(rw, size, type, op, mask) \ static void \ -mpc10x_##rw##_config_##size(unsigned int *cfg_addr, \ +mpc10x_##rw##_config_##size(unsigned int __iomem *cfg_addr, \ unsigned int *cfg_data, int devfn, int offset, \ type val) \ { \ diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 25f4a64fd6bc..213148c36ebe 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -396,12 +396,12 @@ static size_t ext2_xattr_list_acl_access(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t size = sizeof(XATTR_NAME_ACL_ACCESS); + const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); if (!test_opt(inode->i_sb, POSIX_ACL)) return 0; if (list && size <= list_size) - memcpy(list, XATTR_NAME_ACL_ACCESS, size); + memcpy(list, POSIX_ACL_XATTR_ACCESS, size); return size; } @@ -409,12 +409,12 @@ static size_t ext2_xattr_list_acl_default(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t size = sizeof(XATTR_NAME_ACL_DEFAULT); + const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); if (!test_opt(inode->i_sb, POSIX_ACL)) return 0; if (list && size <= list_size) - memcpy(list, XATTR_NAME_ACL_DEFAULT, size); + memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); return size; } @@ -506,14 +506,14 @@ ext2_xattr_set_acl_default(struct inode *inode, const char *name, } struct xattr_handler ext2_xattr_acl_access_handler = { - .prefix = XATTR_NAME_ACL_ACCESS, + .prefix = POSIX_ACL_XATTR_ACCESS, .list = ext2_xattr_list_acl_access, .get = ext2_xattr_get_acl_access, .set = ext2_xattr_set_acl_access, }; struct xattr_handler ext2_xattr_acl_default_handler = { - .prefix = XATTR_NAME_ACL_DEFAULT, + .prefix = POSIX_ACL_XATTR_DEFAULT, .list = ext2_xattr_list_acl_default, .get = ext2_xattr_get_acl_default, .set = ext2_xattr_set_acl_default, diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index fed96ae81a7d..0bde85bafe38 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -4,7 +4,7 @@ (C) 2001 Andreas Gruenbacher, */ -#include +#include #define EXT2_ACL_VERSION 0x0001 diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 638c13a26c03..133f5aa581bb 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -417,12 +417,12 @@ static size_t ext3_xattr_list_acl_access(struct inode *inode, 
char *list, size_t list_len, const char *name, size_t name_len) { - const size_t size = sizeof(XATTR_NAME_ACL_ACCESS); + const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); if (!test_opt(inode->i_sb, POSIX_ACL)) return 0; if (list && size <= list_len) - memcpy(list, XATTR_NAME_ACL_ACCESS, size); + memcpy(list, POSIX_ACL_XATTR_ACCESS, size); return size; } @@ -430,12 +430,12 @@ static size_t ext3_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, const char *name, size_t name_len) { - const size_t size = sizeof(XATTR_NAME_ACL_DEFAULT); + const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); if (!test_opt(inode->i_sb, POSIX_ACL)) return 0; if (list && size <= list_len) - memcpy(list, XATTR_NAME_ACL_DEFAULT, size); + memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); return size; } @@ -535,14 +535,14 @@ ext3_xattr_set_acl_default(struct inode *inode, const char *name, } struct xattr_handler ext3_xattr_acl_access_handler = { - .prefix = XATTR_NAME_ACL_ACCESS, + .prefix = POSIX_ACL_XATTR_ACCESS, .list = ext3_xattr_list_acl_access, .get = ext3_xattr_get_acl_access, .set = ext3_xattr_set_acl_access, }; struct xattr_handler ext3_xattr_acl_default_handler = { - .prefix = XATTR_NAME_ACL_DEFAULT, + .prefix = POSIX_ACL_XATTR_DEFAULT, .list = ext3_xattr_list_acl_default, .get = ext3_xattr_get_acl_default, .set = ext3_xattr_set_acl_default, diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 98af0c0d0ba9..92d50b53a933 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -4,7 +4,7 @@ (C) 2001 Andreas Gruenbacher, */ -#include +#include #define EXT3_ACL_VERSION 0x0001 diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 30a2bf9eeda5..e892dab40c26 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "jfs_incore.h" #include "jfs_xattr.h" #include "jfs_acl.h" @@ -36,11 +37,11 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type) switch(type) { case ACL_TYPE_ACCESS: - ea_name = XATTR_NAME_ACL_ACCESS; + ea_name = POSIX_ACL_XATTR_ACCESS; p_acl = &ji->i_acl; break; case ACL_TYPE_DEFAULT: - ea_name = XATTR_NAME_ACL_DEFAULT; + ea_name = POSIX_ACL_XATTR_DEFAULT; p_acl = &ji->i_default_acl; break; default: @@ -88,11 +89,11 @@ static int jfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) switch(type) { case ACL_TYPE_ACCESS: - ea_name = XATTR_NAME_ACL_ACCESS; + ea_name = POSIX_ACL_XATTR_ACCESS; p_acl = &ji->i_acl; break; case ACL_TYPE_DEFAULT: - ea_name = XATTR_NAME_ACL_DEFAULT; + ea_name = POSIX_ACL_XATTR_DEFAULT; p_acl = &ji->i_default_acl; if (!S_ISDIR(inode->i_mode)) return acl ? 
-EACCES : 0; @@ -101,7 +102,7 @@ static int jfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) return -EINVAL; } if (acl) { - size = xattr_acl_size(acl->a_count); + size = posix_acl_xattr_size(acl->a_count); value = kmalloc(size, GFP_KERNEL); if (!value) return -ENOMEM; diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index d2ae430adecf..a3acd3eec059 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -20,8 +20,6 @@ #ifdef CONFIG_JFS_POSIX_ACL -#include - int jfs_permission(struct inode *, int, struct nameidata *); int jfs_init_acl(struct inode *, struct inode *); int jfs_setattr(struct dentry *, struct iattr *); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 810a3653d8b3..ee32211288ce 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "jfs_incore.h" diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 6016373701a3..ee438d429d45 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -19,6 +19,7 @@ #include #include +#include #include #include "jfs_incore.h" #include "jfs_superblock.h" @@ -718,9 +719,9 @@ static int can_set_system_xattr(struct inode *inode, const char *name, return -EPERM; /* - * XATTR_NAME_ACL_ACCESS is tied to i_mode + * POSIX_ACL_XATTR_ACCESS is tied to i_mode */ - if (strcmp(name, XATTR_NAME_ACL_ACCESS) == 0) { + if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) { acl = posix_acl_from_xattr(value, value_len); if (IS_ERR(acl)) { rc = PTR_ERR(acl); @@ -750,7 +751,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name, JFS_IP(inode)->i_acl = JFS_ACL_NOT_CACHED; return 0; - } else if (strcmp(name, XATTR_NAME_ACL_DEFAULT) == 0) { + } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) { acl = posix_acl_from_xattr(value, value_len); if (IS_ERR(acl)) { rc = PTR_ERR(acl); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ae3940dc85cc..de340ffd33c3 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -50,7 +50,6 @@ #include #ifdef CONFIG_NFSD_V4 #include -#include #include #include #include @@ -425,13 +424,13 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out_nfserr; if (pacl) { - error = set_nfsv4_acl_one(dentry, pacl, XATTR_NAME_ACL_ACCESS); + error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS); if (error < 0) goto out_nfserr; } if (dpacl) { - error = set_nfsv4_acl_one(dentry, dpacl, XATTR_NAME_ACL_DEFAULT); + error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT); if (error < 0) goto out_nfserr; } @@ -498,7 +497,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac struct posix_acl *pacl = NULL, *dpacl = NULL; unsigned int flags = 0; - pacl = _get_posix_acl(dentry, XATTR_NAME_ACL_ACCESS); + pacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_ACCESS); if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA) pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); if (IS_ERR(pacl)) { @@ -508,7 +507,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac } if (S_ISDIR(inode->i_mode)) { - dpacl = _get_posix_acl(dentry, XATTR_NAME_ACL_DEFAULT); + dpacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_DEFAULT); if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA) dpacl = NULL; else if (IS_ERR(dpacl)) { diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index e302071903a1..c312881c5f53 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include @@ -192,11 +192,11 @@ reiserfs_get_acl(struct inode 
*inode, int type) switch (type) { case ACL_TYPE_ACCESS: - name = XATTR_NAME_ACL_ACCESS; + name = POSIX_ACL_XATTR_ACCESS; p_acl = &reiserfs_i->i_acl_access; break; case ACL_TYPE_DEFAULT: - name = XATTR_NAME_ACL_DEFAULT; + name = POSIX_ACL_XATTR_DEFAULT; p_acl = &reiserfs_i->i_acl_default; break; default: @@ -260,7 +260,7 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) switch (type) { case ACL_TYPE_ACCESS: - name = XATTR_NAME_ACL_ACCESS; + name = POSIX_ACL_XATTR_ACCESS; p_acl = &reiserfs_i->i_acl_access; if (acl) { mode_t mode = inode->i_mode; @@ -275,7 +275,7 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) } break; case ACL_TYPE_DEFAULT: - name = XATTR_NAME_ACL_DEFAULT; + name = POSIX_ACL_XATTR_DEFAULT; p_acl = &reiserfs_i->i_acl_default; if (!S_ISDIR (inode->i_mode)) return acl ? -EACCES : 0; @@ -468,7 +468,7 @@ static int posix_acl_access_get(struct inode *inode, const char *name, void *buffer, size_t size) { - if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1) + if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS)-1) return -EINVAL; return xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); } @@ -477,7 +477,7 @@ static int posix_acl_access_set(struct inode *inode, const char *name, const void *value, size_t size, int flags) { - if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1) + if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS)-1) return -EINVAL; return xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); } @@ -487,7 +487,7 @@ posix_acl_access_del (struct inode *inode, const char *name) { struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); struct posix_acl **acl = &reiserfs_i->i_acl_access; - if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1) + if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS)-1) return -EINVAL; if (!IS_ERR (*acl) && *acl) { posix_acl_release (*acl); @@ -510,7 +510,7 @@ posix_acl_access_list (struct inode *inode, const char *name, int namelen, char } struct reiserfs_xattr_handler posix_acl_access_handler = { - .prefix = XATTR_NAME_ACL_ACCESS, + .prefix = POSIX_ACL_XATTR_ACCESS, .get = posix_acl_access_get, .set = posix_acl_access_set, .del = posix_acl_access_del, @@ -521,7 +521,7 @@ static int posix_acl_default_get (struct inode *inode, const char *name, void *buffer, size_t size) { - if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1) + if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT)-1) return -EINVAL; return xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); } @@ -530,7 +530,7 @@ static int posix_acl_default_set(struct inode *inode, const char *name, const void *value, size_t size, int flags) { - if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1) + if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT)-1) return -EINVAL; return xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); } @@ -540,7 +540,7 @@ posix_acl_default_del (struct inode *inode, const char *name) { struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); struct posix_acl **acl = &reiserfs_i->i_acl_default; - if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1) + if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT)-1) return -EINVAL; if (!IS_ERR (*acl) && *acl) { posix_acl_release (*acl); @@ -563,7 +563,7 @@ posix_acl_default_list (struct inode *inode, const char *name, int namelen, char } struct reiserfs_xattr_handler posix_acl_default_handler = { - .prefix = XATTR_NAME_ACL_DEFAULT, + .prefix = POSIX_ACL_XATTR_DEFAULT, .get = posix_acl_default_get, .set = posix_acl_default_set, .del = posix_acl_default_del, diff --git 
a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index fe271c1947b2..6e53c34035cd 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -52,4 +52,7 @@ posix_acl_xattr_count(size_t size) return size / sizeof(posix_acl_xattr_entry); } +struct posix_acl *posix_acl_from_xattr(const void *value, size_t size); +int posix_acl_to_xattr(const struct posix_acl *acl, void *buffer, size_t size); + #endif /* _POSIX_ACL_XATTR_H */ diff --git a/include/linux/reiserfs_acl.h b/include/linux/reiserfs_acl.h index 2aef9c3f5ce8..0760507a545b 100644 --- a/include/linux/reiserfs_acl.h +++ b/include/linux/reiserfs_acl.h @@ -1,6 +1,5 @@ #include #include -#include #define REISERFS_ACL_VERSION 0x0001 From 152becd26e0563aefdbc4fd1fe491928efe92d1f Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Thu, 23 Jun 2005 00:10:21 -0700 Subject: [PATCH 170/199] [PATCH] Bug in error recovery in fs/buffer.c::__block_prepare_write() fs/buffer.c::__block_prepare_write() has broken error recovery. It calls the get_block() callback with "create = 1" and if that succeeds it immediately clears buffer_new on the just allocated buffer (which has buffer_new set). The bug is that if an error occurs and get_block() returns != 0, we break from this loop and go into recovery code. This code has this comment: /* Error case: */ /* * Zero out any newly allocated blocks to avoid exposing stale * data. If BH_New is set, we know that the block was newly * allocated in the above loop. */ So the intent is obviously good in that it wants to clear just allocated and hence not zeroed buffers. However the code recognises allocated buffers by checking for buffer_new being set. Unfortunately __block_prepare_write() as discussed above already cleared buffer_new on all allocated buffers thus no buffers will be cleared during error recovery and old data will be leaked. The simplest way I can see to fix this is to make the current recovery code work by _not_ clearing buffer_new after calling get_block() in __block_prepare_write(). We cannot safely allow buffer_new buffers to "leak out" of __block_prepare_write(), thus we simply do a quick loop over the buffers clearing buffer_new on each of them if it is set just before returning "success" from __block_prepare_write(). Signed-off-by: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 12bdb2791127..13e5938a64f6 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1926,7 +1926,6 @@ static int __block_prepare_write(struct inode *inode, struct page *page, if (err) break; if (buffer_new(bh)) { - clear_buffer_new(bh); unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); if (PageUptodate(page)) { @@ -1968,9 +1967,14 @@ static int __block_prepare_write(struct inode *inode, struct page *page, if (!buffer_uptodate(*wait_bh)) err = -EIO; } - if (!err) - return err; - + if (!err) { + bh = head; + do { + if (buffer_new(bh)) + clear_buffer_new(bh); + } while ((bh = bh->b_this_page) != head); + return 0; + } /* Error case: */ /* * Zero out any newly allocated blocks to avoid exposing stale From 4452ea509e29df2f019bed2f7a1e0f5eea092b26 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 Jun 2005 00:10:26 -0700 Subject: [PATCH 171/199] [PATCH] dpt_i2o: fix waitqueue abuse The driver plays with waitqueue internals and fails to compile after Ben's "aio: make wait_queue ->task ->private" patch. 
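The fix below replaces the open-coded waitqueue manipulation with the public helpers. For reference, the portable sleep pattern is, in a minimal sketch (the waitqueue head and the *done flag are placeholders):

	/* Sleep until *done is set or the timeout expires, using only the
	 * public waitqueue API; nothing here touches wait.task/->private
	 * or the queue's internal spinlock. */
	static int wait_for_done(wait_queue_head_t *wq, int *done, long timeout)
	{
		DECLARE_WAITQUEUE(wait, current);

		add_wait_queue(wq, &wait);
		set_current_state(TASK_INTERRUPTIBLE);
		if (!*done)
			timeout = schedule_timeout(timeout);
		set_current_state(TASK_RUNNING);
		remove_wait_queue(wq, &wait);
		return *done ? 0 : -ETIMEDOUT;
	}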
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/scsi/dpt_i2o.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/drivers/scsi/dpt_i2o.c b/drivers/scsi/dpt_i2o.c index 9cc0015b717d..a699c30b2662 100644 --- a/drivers/scsi/dpt_i2o.c +++ b/drivers/scsi/dpt_i2o.c @@ -1126,11 +1126,11 @@ static int adpt_i2o_post_wait(adpt_hba* pHba, u32* msg, int len, int timeout) struct adpt_i2o_post_wait_data *p1, *p2; struct adpt_i2o_post_wait_data *wait_data = kmalloc(sizeof(struct adpt_i2o_post_wait_data),GFP_KERNEL); - adpt_wait_queue_t wait; + DECLARE_WAITQUEUE(wait, current); - if(!wait_data){ + if (!wait_data) return -ENOMEM; - } + /* * The spin locking is needed to keep anyone from playing * with the queue pointers and id while we do the same @@ -1148,12 +1148,7 @@ static int adpt_i2o_post_wait(adpt_hba* pHba, u32* msg, int len, int timeout) wait_data->wq = &adpt_wq_i2o_post; wait_data->status = -ETIMEDOUT; - // this code is taken from kernel/sched.c:interruptible_sleep_on_timeout - wait.task = current; - init_waitqueue_entry(&wait, current); - spin_lock_irqsave(&adpt_wq_i2o_post.lock, flags); - __add_wait_queue(&adpt_wq_i2o_post, &wait); - spin_unlock(&adpt_wq_i2o_post.lock); + add_wait_queue(&adpt_wq_i2o_post, &wait); msg[2] |= 0x80000000 | ((u32)wait_data->id); timeout *= HZ; @@ -1175,9 +1170,7 @@ static int adpt_i2o_post_wait(adpt_hba* pHba, u32* msg, int len, int timeout) if(pHba->host) spin_lock_irq(pHba->host->host_lock); } - spin_lock_irq(&adpt_wq_i2o_post.lock); - __remove_wait_queue(&adpt_wq_i2o_post, &wait); - spin_unlock_irqrestore(&adpt_wq_i2o_post.lock, flags); + remove_wait_queue(&adpt_wq_i2o_post, &wait); if(status == -ETIMEDOUT){ printk(KERN_INFO"dpti%d: POST WAIT TIMEOUT\n",pHba->unit); From 63e6880918e75dcb92d60aff218a76e063a471ef Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Thu, 23 Jun 2005 00:10:27 -0700 Subject: [PATCH 172/199] [PATCH] aio: fix do_sync_(read|write) to properly handle aio retries When do_sync_(read|write) encounters an aio method that makes use of the retry mechanism, they fail to correctly retry the operation. This fixes that by adding the appropriate sleep and retry mechanism. 
Signed-off-by: Benjamin LaHaise Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/read_write.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index c4c2bee373ed..9292f5fa4d62 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -203,6 +203,16 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count return -EINVAL; } +static void wait_on_retry_sync_kiocb(struct kiocb *iocb) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + if (!kiocbIsKicked(iocb)) + schedule(); + else + kiocbClearKicked(iocb); + __set_current_state(TASK_RUNNING); +} + ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; @@ -210,7 +220,10 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - ret = filp->f_op->aio_read(&kiocb, buf, len, kiocb.ki_pos); + while (-EIOCBRETRY == + (ret = filp->f_op->aio_read(&kiocb, buf, len, kiocb.ki_pos))) + wait_on_retry_sync_kiocb(&kiocb); + if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; @@ -258,7 +271,10 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - ret = filp->f_op->aio_write(&kiocb, buf, len, kiocb.ki_pos); + while (-EIOCBRETRY == + (ret = filp->f_op->aio_write(&kiocb, buf, len, kiocb.ki_pos))) + wait_on_retry_sync_kiocb(&kiocb); + if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; From c43dc2fd885b5658cfd7cedb7bcca20910c517a4 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Thu, 23 Jun 2005 00:10:27 -0700 Subject: [PATCH 173/199] [PATCH] aio: make wait_queue ->task ->private In the upcoming aio_down patch, it is useful to store a private data pointer in the kiocb's wait_queue. Since we provide our own wake up function and do not require the task_struct pointer, it makes sense to convert the task pointer into a generic private pointer. Signed-off-by: Benjamin LaHaise Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 16 ++++++++-------- kernel/sched.c | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index c9486c3efb4a..d38c9fecdc36 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -33,7 +33,7 @@ int default_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key struct __wait_queue { unsigned int flags; #define WQ_FLAG_EXCLUSIVE 0x01 - struct task_struct * task; + void *private; wait_queue_func_t func; struct list_head task_list; }; @@ -60,7 +60,7 @@ typedef struct __wait_queue_head wait_queue_head_t; */ #define __WAITQUEUE_INITIALIZER(name, tsk) { \ - .task = tsk, \ + .private = tsk, \ .func = default_wake_function, \ .task_list = { NULL, NULL } } @@ -86,7 +86,7 @@ static inline void init_waitqueue_head(wait_queue_head_t *q) static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p) { q->flags = 0; - q->task = p; + q->private = p; q->func = default_wake_function; } @@ -94,7 +94,7 @@ static inline void init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func) { q->flags = 0; - q->task = NULL; + q->private = NULL; q->func = func; } @@ -110,7 +110,7 @@ static inline int waitqueue_active(wait_queue_head_t *q) * aio specifies a wait queue entry with an async notification * callback routine, not associated with any task. 
*/ -#define is_sync_wait(wait) (!(wait) || ((wait)->task)) +#define is_sync_wait(wait) (!(wait) || ((wait)->private)) extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); @@ -384,7 +384,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); #define DEFINE_WAIT(name) \ wait_queue_t name = { \ - .task = current, \ + .private = current, \ .func = autoremove_wake_function, \ .task_list = LIST_HEAD_INIT((name).task_list), \ } @@ -393,7 +393,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); struct wait_bit_queue name = { \ .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ .wait = { \ - .task = current, \ + .private = current, \ .func = wake_bit_function, \ .task_list = \ LIST_HEAD_INIT((name).wait.task_list), \ @@ -402,7 +402,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); #define init_wait(wait) \ do { \ - (wait)->task = current; \ + (wait)->private = current; \ (wait)->func = autoremove_wake_function; \ INIT_LIST_HEAD(&(wait)->task_list); \ } while (0) diff --git a/kernel/sched.c b/kernel/sched.c index 6ee4515d5a20..76080d142e3d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2869,7 +2869,7 @@ asmlinkage void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) { - task_t *p = curr->task; + task_t *p = curr->private; return try_to_wake_up(p, mode, sync); } From 451512f3aed64573e912e68c94f240fec0e44438 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 23 Jun 2005 00:10:28 -0700 Subject: [PATCH 174/199] [PATCH] add note about verify_area removal to feature-removal-schedule.txt Add note about the soon-to-come removal of verify_area() to Documentation/feature-removal-schedule.txt. Signed-off-by: Jesper Juhl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/feature-removal-schedule.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 26414bc87c65..e67c90d4ee17 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -66,6 +66,14 @@ Who: Paul E. McKenney --------------------------- +What: remove verify_area() +When: July 2006 +Files: Various uaccess.h headers. +Why: Deprecated and redundant. access_ok() should be used instead. +Who: Jesper Juhl + +--------------------------- + What: IEEE1394 Audio and Music Data Transmission Protocol driver, Connection Management Procedures driver When: November 2005 From 9235e68be8bf8974b65a9bf733c9d12a52307839 Mon Sep 17 00:00:00 2001 From: Eric Piel Date: Thu, 23 Jun 2005 00:10:29 -0700 Subject: [PATCH 175/199] [PATCH] IDE CD reports current speed The current ide-cd driver reports the CDROM speed (as found in /proc/sys/dev/cdrom/info) as the current speed when loading the driver. Changing the speed of the cdrom drive (by "eject -x" for instance) doesn't update the speed reported by the kernel. Updating the info could be valuable for the user as it's the only way to know if the drive accepted the request or discarded it. It could even be used to list all the available speeds of the drive. The attached patch modifies the ide-cd driver so that after every speed change request the new speed is updated. 
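The arithmetic involved is worth spelling out: ATAPI capability pages report speeds in big-endian kB/s, and one CD "x" is 75 raw 2352-byte sectors per second, i.e. about 176 kB/s, so the driver computes (kB/s + 176/2) / 176 to round to the nearest Nx. A standalone sketch of that conversion (ide_cdrom_update_speed in the diff below applies the same formula, plus a byte-swap quirk for one known-broken drive):

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    /* Round a big-endian kB/s figure to the nearest "Nx" CD speed. */
    static unsigned int atapi_kbps_to_x(uint16_t be_kbps)
    {
        return (ntohs(be_kbps) + 176 / 2) / 176;
    }

    int main(void)
    {
        uint16_t reported = htons(7056);    /* a 40x drive: 40 * 176.4 kB/s */

        printf("%ux\n", atapi_kbps_to_x(reported));    /* prints 40x */
        return 0;
    }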
Please note that the actual modification is very small, but I had to touch quite a few lines in order to avoid pre-declaring the sub-functions. Signed-off-by: Eric Piel Acked-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/ide/ide-cd.c | 89 ++++++++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 40 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 39f3e9101ed4..0a31cfda08a0 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -2656,17 +2656,64 @@ int ide_cdrom_lock_door (struct cdrom_device_info *cdi, int lock) return cdrom_lockdoor(drive, lock, NULL); } +static +int ide_cdrom_get_capabilities(ide_drive_t *drive, struct atapi_capabilities_page *cap) +{ + struct cdrom_info *info = drive->driver_data; + struct cdrom_device_info *cdi = &info->devinfo; + struct packet_command cgc; + int stat, attempts = 3, size = sizeof(*cap); + + /* + * ACER50 (and others?) require the full spec length mode sense + * page capabilities size, but older drives break. + */ + if (!(!strcmp(drive->id->model, "ATAPI CD ROM DRIVE 50X MAX") || + !strcmp(drive->id->model, "WPI CDS-32X"))) + size -= sizeof(cap->pad); + + init_cdrom_command(&cgc, cap, size, CGC_DATA_UNKNOWN); + do { /* we seem to get stat=0x01,err=0x00 the first time (??) */ + stat = cdrom_mode_sense(cdi, &cgc, GPMODE_CAPABILITIES_PAGE, 0); + if (!stat) + break; + } while (--attempts); + return stat; +} + +static +void ide_cdrom_update_speed (ide_drive_t *drive, struct atapi_capabilities_page *cap) +{ + /* The ACER/AOpen 24X cdrom has the speed fields byte-swapped */ + if (!drive->id->model[0] && + !strncmp(drive->id->fw_rev, "241N", 4)) { + CDROM_STATE_FLAGS(drive)->current_speed = + (((unsigned int)cap->curspeed) + (176/2)) / 176; + CDROM_CONFIG_FLAGS(drive)->max_speed = + (((unsigned int)cap->maxspeed) + (176/2)) / 176; + } else { + CDROM_STATE_FLAGS(drive)->current_speed = + (ntohs(cap->curspeed) + (176/2)) / 176; + CDROM_CONFIG_FLAGS(drive)->max_speed = + (ntohs(cap->maxspeed) + (176/2)) / 176; + } +} + static int ide_cdrom_select_speed (struct cdrom_device_info *cdi, int speed) { ide_drive_t *drive = (ide_drive_t*) cdi->handle; struct request_sense sense; + struct atapi_capabilities_page cap; int stat; if ((stat = cdrom_select_speed(drive, speed, &sense)) < 0) return stat; - cdi->speed = CDROM_STATE_FLAGS(drive)->current_speed; + if (!ide_cdrom_get_capabilities(drive, &cap)) { + ide_cdrom_update_speed(drive, &cap); + cdi->speed = CDROM_STATE_FLAGS(drive)->current_speed; + } return 0; } @@ -2868,31 +2915,6 @@ static int ide_cdrom_register (ide_drive_t *drive, int nslots) return register_cdrom(devinfo); } -static -int ide_cdrom_get_capabilities(ide_drive_t *drive, struct atapi_capabilities_page *cap) -{ - struct cdrom_info *info = drive->driver_data; - struct cdrom_device_info *cdi = &info->devinfo; - struct packet_command cgc; - int stat, attempts = 3, size = sizeof(*cap); - - /* - * ACER50 (and others?) require the full spec length mode sense - * page capabilities size, but older drives break. - */ - if (!(!strcmp(drive->id->model, "ATAPI CD ROM DRIVE 50X MAX") || - !strcmp(drive->id->model, "WPI CDS-32X"))) - size -= sizeof(cap->pad); - - init_cdrom_command(&cgc, cap, size, CGC_DATA_UNKNOWN); - do { /* we seem to get stat=0x01,err=0x00 the first time (??)
*/ - stat = cdrom_mode_sense(cdi, &cgc, GPMODE_CAPABILITIES_PAGE, 0); - if (!stat) - break; - } while (--attempts); - return stat; -} - static int ide_cdrom_probe_capabilities (ide_drive_t *drive) { @@ -2978,20 +3000,7 @@ int ide_cdrom_probe_capabilities (ide_drive_t *drive) } } - /* The ACER/AOpen 24X cdrom has the speed fields byte-swapped */ - if (!drive->id->model[0] && - !strncmp(drive->id->fw_rev, "241N", 4)) { - CDROM_STATE_FLAGS(drive)->current_speed = - (((unsigned int)cap.curspeed) + (176/2)) / 176; - CDROM_CONFIG_FLAGS(drive)->max_speed = - (((unsigned int)cap.maxspeed) + (176/2)) / 176; - } else { - CDROM_STATE_FLAGS(drive)->current_speed = - (ntohs(cap.curspeed) + (176/2)) / 176; - CDROM_CONFIG_FLAGS(drive)->max_speed = - (ntohs(cap.maxspeed) + (176/2)) / 176; - } - + ide_cdrom_update_speed(drive, &cap); /* don't print speed if the drive reported 0. */ printk(KERN_INFO "%s: ATAPI", drive->name); From 790a19cd5711133f40daad7c55bf148de2b1d12c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 Jun 2005 00:10:31 -0700 Subject: [PATCH 176/199] [PATCH] pwc-uncompress warning fix drivers/usb/media/pwc/pwc-uncompress.c: In function `pwc_decompress': drivers/usb/media/pwc/pwc-uncompress.c:140: warning: unreachable code at beginning of switch statement Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/usb/media/pwc/pwc-uncompress.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/usb/media/pwc/pwc-uncompress.c b/drivers/usb/media/pwc/pwc-uncompress.c index bc3b1635eab0..ef4204eab6c4 100644 --- a/drivers/usb/media/pwc/pwc-uncompress.c +++ b/drivers/usb/media/pwc/pwc-uncompress.c @@ -118,9 +118,9 @@ int pwc_decompress(struct pwc_device *pdev) return -ENXIO; /* No such device or address: missing decompressor */ } +#if 0 switch (pdev->type) { -#if 0 case 675: case 680: case 690: @@ -128,18 +128,17 @@ int pwc_decompress(struct pwc_device *pdev) case 730: case 740: case 750: - pwc_dec23_decompress(&pdev->image, &pdev->view, &pdev->offset, - yuv, image, - flags, + pwc_dec23_decompress(&pdev->image, &pdev->view, + &pdev->offset, yuv, image, flags, pdev->decompress_data, pdev->vbandlength); break; case 645: case 646: /* TODO & FIXME */ -#endif - return -ENXIO; /* No such device or address: missing decompressor */ + return -ENXIO; /* Missing decompressor */ break; } +#endif } return 0; } From bfb07599da289881d3bcbb601a110e997fc7444b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 23 Jun 2005 00:10:32 -0700 Subject: [PATCH 177/199] [PATCH] Introduce tty_unregister_ldisc() It's a bit strange to see tty_register_ldisc call in modules' exit functions. Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/tty.txt | 2 +- drivers/char/tty_io.c | 37 ++++++++++++++++++++++++------------- include/linux/tty.h | 1 + 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/Documentation/tty.txt b/Documentation/tty.txt index 3958cf746dde..8ff7bc2a0811 100644 --- a/Documentation/tty.txt +++ b/Documentation/tty.txt @@ -22,7 +22,7 @@ copy of the structure. You must not re-register over the top of the line discipline even with the same data or your computer again will be eaten by demons. -In order to remove a line discipline call tty_register_ldisc passing NULL. +In order to remove a line discipline call tty_unregister_ldisc(). In ancient times this always worked. In modern times the function will return -EBUSY if the ldisc is currently in use. 
Since the ldisc referencing code manages the module counts this should not usually be a concern. diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 31831030f73f..cc4b43bad703 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -251,7 +251,7 @@ static void tty_set_termios_ldisc(struct tty_struct *tty, int num) static DEFINE_SPINLOCK(tty_ldisc_lock); static DECLARE_WAIT_QUEUE_HEAD(tty_ldisc_wait); -static struct tty_ldisc tty_ldiscs[NR_LDISCS]; /* line disc dispatch table */ +static struct tty_ldisc tty_ldiscs[NR_LDISCS]; /* line disc dispatch table */ int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc) { @@ -262,24 +262,35 @@ int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc) return -EINVAL; spin_lock_irqsave(&tty_ldisc_lock, flags); - if (new_ldisc) { - tty_ldiscs[disc] = *new_ldisc; - tty_ldiscs[disc].num = disc; - tty_ldiscs[disc].flags |= LDISC_FLAG_DEFINED; - tty_ldiscs[disc].refcount = 0; - } else { - if(tty_ldiscs[disc].refcount) - ret = -EBUSY; - else - tty_ldiscs[disc].flags &= ~LDISC_FLAG_DEFINED; - } + tty_ldiscs[disc] = *new_ldisc; + tty_ldiscs[disc].num = disc; + tty_ldiscs[disc].flags |= LDISC_FLAG_DEFINED; + tty_ldiscs[disc].refcount = 0; spin_unlock_irqrestore(&tty_ldisc_lock, flags); return ret; } - EXPORT_SYMBOL(tty_register_ldisc); +int tty_unregister_ldisc(int disc) +{ + unsigned long flags; + int ret = 0; + + if (disc < N_TTY || disc >= NR_LDISCS) + return -EINVAL; + + spin_lock_irqsave(&tty_ldisc_lock, flags); + if (tty_ldiscs[disc].refcount) + ret = -EBUSY; + else + tty_ldiscs[disc].flags &= ~LDISC_FLAG_DEFINED; + spin_unlock_irqrestore(&tty_ldisc_lock, flags); + + return ret; +} +EXPORT_SYMBOL(tty_unregister_ldisc); + struct tty_ldisc *tty_ldisc_get(int disc) { unsigned long flags; diff --git a/include/linux/tty.h b/include/linux/tty.h index 1b76106272d3..59ff42c629ec 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -345,6 +345,7 @@ extern int tty_check_change(struct tty_struct * tty); extern void stop_tty(struct tty_struct * tty); extern void start_tty(struct tty_struct * tty); extern int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc); +extern int tty_unregister_ldisc(int disc); extern int tty_register_driver(struct tty_driver *driver); extern int tty_unregister_driver(struct tty_driver *driver); extern void tty_register_device(struct tty_driver *driver, unsigned index, struct device *dev); From 64ccd715d3cf498318b14b646ce5f97e7ab15bb5 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 23 Jun 2005 00:10:33 -0700 Subject: [PATCH 178/199] [PATCH] Convert users to tty_unregister_ldisc() tty_register_ldisc(N_FOO, NULL) => tty_unregister_ldisc(N_FOO) Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/bluetooth/hci_ldisc.c | 2 +- drivers/char/n_hdlc.c | 2 +- drivers/char/n_r3964.c | 2 +- drivers/input/serio/serport.c | 2 +- drivers/net/hamradio/6pack.c | 2 +- drivers/net/hamradio/mkiss.c | 2 +- drivers/net/irda/irtty-sir.c | 2 +- drivers/net/ppp_async.c | 2 +- drivers/net/ppp_synctty.c | 2 +- drivers/net/slip.c | 2 +- drivers/net/wan/x25_asy.c | 2 +- drivers/net/wireless/strip.c | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c index 9075bbb56ad4..f766bc22c6bb 100644 --- a/drivers/bluetooth/hci_ldisc.c +++ b/drivers/bluetooth/hci_ldisc.c @@ -576,7 +576,7 @@ static void __exit hci_uart_exit(void) #endif /* Release tty registration of line discipline */ - if 
((err = tty_register_ldisc(N_HCI, NULL))) + if ((err = tty_unregister_ldisc(N_HCI))) BT_ERR("Can't unregister HCI line discipline (%d)", err); } diff --git a/drivers/char/n_hdlc.c b/drivers/char/n_hdlc.c index b3dbff1cf967..5079beda69b5 100644 --- a/drivers/char/n_hdlc.c +++ b/drivers/char/n_hdlc.c @@ -960,7 +960,7 @@ static char hdlc_unregister_fail[] __exitdata = static void __exit n_hdlc_exit(void) { /* Release tty registration of line discipline */ - int status = tty_register_ldisc(N_HDLC, NULL); + int status = tty_unregister_ldisc(N_HDLC); if (status) printk(hdlc_unregister_fail, status); diff --git a/drivers/char/n_r3964.c b/drivers/char/n_r3964.c index 3883073ab48f..2291a87e8ada 100644 --- a/drivers/char/n_r3964.c +++ b/drivers/char/n_r3964.c @@ -200,7 +200,7 @@ static void __exit r3964_exit(void) TRACE_M ("cleanup_module()"); - status=tty_register_ldisc(N_R3964, NULL); + status=tty_unregister_ldisc(N_R3964); if(status!=0) { diff --git a/drivers/input/serio/serport.c b/drivers/input/serio/serport.c index f6b85222ba3d..79ca38469159 100644 --- a/drivers/input/serio/serport.c +++ b/drivers/input/serio/serport.c @@ -257,7 +257,7 @@ static int __init serport_init(void) static void __exit serport_exit(void) { - tty_register_ldisc(N_MOUSE, NULL); + tty_unregister_ldisc(N_MOUSE); } module_init(serport_init); diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 89454915b857..e44f8e9055ef 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -848,7 +848,7 @@ static void __exit sixpack_exit_driver(void) { int ret; - if ((ret = tty_register_ldisc(N_6PACK, NULL))) + if ((ret = tty_unregister_ldisc(N_6PACK))) printk(msg_unregfail, ret); } diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c index 62790511098f..3035422f5ad8 100644 --- a/drivers/net/hamradio/mkiss.c +++ b/drivers/net/hamradio/mkiss.c @@ -934,7 +934,7 @@ static void __exit mkiss_exit_driver(void) kfree(ax25_ctrls); ax25_ctrls = NULL; - if ((i = tty_register_ldisc(N_AX25, NULL))) + if ((i = tty_unregister_ldisc(N_AX25))) printk(KERN_ERR "mkiss: can't unregister line discipline (err = %d)\n", i); } diff --git a/drivers/net/irda/irtty-sir.c b/drivers/net/irda/irtty-sir.c index 7d23aa375908..b8d112348ba4 100644 --- a/drivers/net/irda/irtty-sir.c +++ b/drivers/net/irda/irtty-sir.c @@ -626,7 +626,7 @@ static void __exit irtty_sir_cleanup(void) { int err; - if ((err = tty_register_ldisc(N_IRDA, NULL))) { + if ((err = tty_unregister_ldisc(N_IRDA))) { IRDA_ERROR("%s(), can't unregister line discipline (err = %d)\n", __FUNCTION__, err); } diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c index 33b9d79b1aad..5e48b9ab3045 100644 --- a/drivers/net/ppp_async.c +++ b/drivers/net/ppp_async.c @@ -1025,7 +1025,7 @@ static void async_lcp_peek(struct asyncppp *ap, unsigned char *data, static void __exit ppp_async_cleanup(void) { - if (tty_register_ldisc(N_PPP, NULL) != 0) + if (tty_unregister_ldisc(N_PPP) != 0) printk(KERN_ERR "failed to unregister PPP line discipline\n"); } diff --git a/drivers/net/ppp_synctty.c b/drivers/net/ppp_synctty.c index 7d0150b4c629..fd9f50180355 100644 --- a/drivers/net/ppp_synctty.c +++ b/drivers/net/ppp_synctty.c @@ -793,7 +793,7 @@ ppp_sync_input(struct syncppp *ap, const unsigned char *buf, static void __exit ppp_sync_cleanup(void) { - if (tty_register_ldisc(N_SYNC_PPP, NULL) != 0) + if (tty_unregister_ldisc(N_SYNC_PPP) != 0) printk(KERN_ERR "failed to unregister Sync PPP line discipline\n"); } diff --git a/drivers/net/slip.c 
b/drivers/net/slip.c index 8f7841c0374d..19112712daf0 100644 --- a/drivers/net/slip.c +++ b/drivers/net/slip.c @@ -1430,7 +1430,7 @@ static void __exit slip_exit(void) kfree(slip_devs); slip_devs = NULL; - if ((i = tty_register_ldisc(N_SLIP, NULL))) + if ((i = tty_unregister_ldisc(N_SLIP))) { printk(KERN_ERR "SLIP: can't unregister line discipline (err = %d)\n", i); } diff --git a/drivers/net/wan/x25_asy.c b/drivers/net/wan/x25_asy.c index 1c540d825551..bdf672c48182 100644 --- a/drivers/net/wan/x25_asy.c +++ b/drivers/net/wan/x25_asy.c @@ -829,7 +829,7 @@ static void __exit exit_x25_asy(void) } kfree(x25_asy_devs); - tty_register_ldisc(N_X25, NULL); + tty_unregister_ldisc(N_X25); } module_init(init_x25_asy); diff --git a/drivers/net/wireless/strip.c b/drivers/net/wireless/strip.c index ec8cf29ffced..6c42b573a95a 100644 --- a/drivers/net/wireless/strip.c +++ b/drivers/net/wireless/strip.c @@ -2828,7 +2828,7 @@ static void __exit strip_exit_driver(void) /* Unregister with the /proc/net file here. */ proc_net_remove("strip"); - if ((i = tty_register_ldisc(N_STRIP, NULL))) + if ((i = tty_unregister_ldisc(N_STRIP))) printk(KERN_ERR "STRIP: can't unregister line discipline (err = %d)\n", i); printk(signoff); From fed2fc18a4567d613cd35115322257c6c6c710e9 Mon Sep 17 00:00:00 2001 From: Telemaque Ndizihiwe Date: Thu, 23 Jun 2005 00:10:33 -0700 Subject: [PATCH 179/199] [PATCH] sys_open() cleanup Clean up tortured logic in sys_open(). Signed-off-by: Telemaque Ndizihiwe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/open.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/fs/open.c b/fs/open.c index 5dd411b084bf..8ec63f735918 100644 --- a/fs/open.c +++ b/fs/open.c @@ -934,7 +934,7 @@ EXPORT_SYMBOL(fd_install); asmlinkage long sys_open(const char __user * filename, int flags, int mode) { char * tmp; - int fd, error; + int fd; if (force_o_largefile()) flags |= O_LARGEFILE; @@ -945,20 +945,16 @@ asmlinkage long sys_open(const char __user * filename, int flags, int mode) fd = get_unused_fd(); if (fd >= 0) { struct file *f = filp_open(tmp, flags, mode); - error = PTR_ERR(f); - if (IS_ERR(f)) - goto out_error; - fd_install(fd, f); + if (IS_ERR(f)) { + put_unused_fd(fd); + fd = PTR_ERR(f); + } else { + fd_install(fd, f); + } } -out: putname(tmp); } return fd; - -out_error: - put_unused_fd(fd); - fd = error; - goto out; } EXPORT_SYMBOL_GPL(sys_open); From 4749f32da939d4e4160541b2cadc22492bb507ec Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 23 Jun 2005 11:36:56 +0200 Subject: [PATCH 180/199] [PATCH] better USB_MON dependencies This makes the USB_MON less confusing. 
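The cleanup leans on the Kconfig convention that a tristate option set to y defines CONFIG_FOO while m defines CONFIG_FOO_MODULE; once USB_MON becomes bool, only CONFIG_USB_MON can ever be defined, and the double-barrelled preprocessor guards collapse to a single test. A toy program showing the two guard styles (build with e.g. cc -DCONFIG_USB_MON demo.c):

    #include <stdio.h>

    int main(void)
    {
    /* Old-style guard: needed while USB_MON was tristate. */
    #if defined(CONFIG_USB_MON) || defined(CONFIG_USB_MON_MODULE)
        puts("monitor hooks built (tristate-era guard)");
    #endif

    /* Guard after this patch: sufficient for a bool option. */
    #if defined(CONFIG_USB_MON)
        puts("monitor hooks built (bool-era guard)");
    #endif
        return 0;
    }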
Signed-off-by: Adrian Bunk Signed-off-by: Linus Torvalds --- drivers/usb/core/hcd.c | 2 +- drivers/usb/core/hcd.h | 2 +- drivers/usb/mon/Kconfig | 13 ++++--------- drivers/usb/mon/Makefile | 2 +- include/linux/usb.h | 2 +- 5 files changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index d041782e0c8b..0da23732e807 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1794,7 +1794,7 @@ EXPORT_SYMBOL (usb_remove_hcd); /*-------------------------------------------------------------------------*/ -#if defined(CONFIG_USB_MON) || defined(CONFIG_USB_MON_MODULE) +#if defined(CONFIG_USB_MON) struct usb_mon_operations *mon_ops; diff --git a/drivers/usb/core/hcd.h b/drivers/usb/core/hcd.h index f67cf1e634fc..325a51656c3f 100644 --- a/drivers/usb/core/hcd.h +++ b/drivers/usb/core/hcd.h @@ -399,7 +399,7 @@ static inline void usbfs_cleanup(void) { } /*-------------------------------------------------------------------------*/ -#if defined(CONFIG_USB_MON) || defined(CONFIG_USB_MON_MODULE) +#if defined(CONFIG_USB_MON) struct usb_mon_operations { void (*urb_submit)(struct usb_bus *bus, struct urb *urb); diff --git a/drivers/usb/mon/Kconfig b/drivers/usb/mon/Kconfig index 4e6152aa5f19..777642e26b9a 100644 --- a/drivers/usb/mon/Kconfig +++ b/drivers/usb/mon/Kconfig @@ -2,13 +2,9 @@ # USB Monitor configuration # -# In normal life, it makes little sense to have usbmon as a module, and in fact -# it is harmful, because there is no way to autoload the module. -# The 'm' option is allowed for hackers who debug the usbmon itself, -# and for those who have usbcore as a module. config USB_MON - tristate "USB Monitor" - depends on USB + bool "USB Monitor" + depends on USB!=n default y help If you say Y here, a component which captures the USB traffic @@ -17,6 +13,5 @@ config USB_MON Harding's USBMon. This is somewhat experimental at this time, but it should be safe, - as long as you aren't building this as a module and then removing it. - - If unsure, say Y. Do not say M. + as long as you aren't using modular USB and try to remove this + module. 
diff --git a/drivers/usb/mon/Makefile b/drivers/usb/mon/Makefile index 3cff8d444bb1..f18d10ce91f9 100644 --- a/drivers/usb/mon/Makefile +++ b/drivers/usb/mon/Makefile @@ -4,4 +4,4 @@ usbmon-objs := mon_main.o mon_stat.o mon_text.o -obj-$(CONFIG_USB_MON) += usbmon.o +obj-$(CONFIG_USB) += usbmon.o diff --git a/include/linux/usb.h b/include/linux/usb.h index 3d508bf08402..eb282b581546 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -290,7 +290,7 @@ struct usb_bus { struct class_device *class_dev; /* class device for this bus */ struct kref kref; /* handles reference counting this bus */ void (*release)(struct usb_bus *bus); /* function to destroy this bus's memory */ -#if defined(CONFIG_USB_MON) || defined(CONFIG_USB_MON_MODULE) +#if defined(CONFIG_USB_MON) struct mon_bus *mon_bus; /* non-null when associated */ int monitored; /* non-zero when monitored */ #endif From a8ad86f2dc46356f87be1327dabc18bdbda32f50 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 23 Jun 2005 14:23:35 +0100 Subject: [PATCH 181/199] [PATCH] Add removal schedule of register_serial/unregister_serial to appropriate file --- Documentation/feature-removal-schedule.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index e67c90d4ee17..77511af45362 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -94,6 +94,16 @@ Who: Jody McIntyre --------------------------- +What: register_serial/unregister_serial +When: December 2005 +Why: This interface does not allow serial ports to be registered against + a struct device, and as such does not allow correct power management + of such ports. 8250-based ports should use serial8250_register_port + and serial8250_unregister_port instead. +Who: Russell King + +--------------------------- + What: i2c sysfs name change: in1_ref, vid deprecated in favour of cpu0_vid When: November 2005 Files: drivers/i2c/chips/adm1025.c, drivers/i2c/chips/adm1026.c From 317a76f9a44b437d6301718f4e5d08bd93f98da7 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:19:55 -0700 Subject: [PATCH 182/199] [TCP]: Add pluggable congestion control algorithm infrastructure. Allow TCP to have multiple pluggable congestion control algorithms. Algorithms are defined by a set of operations and can be built in or modules. The legacy "new RENO" algorithm is used as a starting point and fallback. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- include/linux/sysctl.h | 9 +- include/linux/tcp.h | 49 +-- include/net/tcp.h | 237 ++++-------- net/ipv4/Makefile | 3 +- net/ipv4/sysctl_net_ipv4.c | 114 +++--- net/ipv4/tcp.c | 2 + net/ipv4/tcp_cong.c | 195 ++++++++++ net/ipv4/tcp_diag.c | 20 +- net/ipv4/tcp_input.c | 737 +++---------------------------------- net/ipv4/tcp_ipv4.c | 3 + net/ipv4/tcp_minisocks.c | 4 +- net/ipv4/tcp_output.c | 23 +- net/ipv6/tcp_ipv6.c | 2 +- 13 files changed, 399 insertions(+), 999 deletions(-) create mode 100644 net/ipv4/tcp_cong.c diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 614e939c78a4..72965bfe6cfb 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -333,21 +333,14 @@ enum NET_TCP_FRTO=92, NET_TCP_LOW_LATENCY=93, NET_IPV4_IPFRAG_SECRET_INTERVAL=94, - NET_TCP_WESTWOOD=95, NET_IPV4_IGMP_MAX_MSF=96, NET_TCP_NO_METRICS_SAVE=97, - NET_TCP_VEGAS=98, - NET_TCP_VEGAS_ALPHA=99, - NET_TCP_VEGAS_BETA=100, - NET_TCP_VEGAS_GAMMA=101, - NET_TCP_BIC=102, - NET_TCP_BIC_FAST_CONVERGENCE=103, - NET_TCP_BIC_LOW_WINDOW=104, NET_TCP_DEFAULT_WIN_SCALE=105, NET_TCP_MODERATE_RCVBUF=106, NET_TCP_TSO_WIN_DIVISOR=107, NET_TCP_BIC_BETA=108, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, + NET_TCP_CONG_CONTROL=110, }; enum { diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 97a7c9e03df5..3ea75dd6640a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -203,13 +203,6 @@ struct tcp_sack_block { __u32 end_seq; }; -enum tcp_congestion_algo { - TCP_RENO=0, - TCP_VEGAS, - TCP_WESTWOOD, - TCP_BIC, -}; - struct tcp_options_received { /* PAWS/RTTM data */ long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ @@ -305,7 +298,7 @@ struct tcp_sock { __u8 reordering; /* Packet reordering metric. */ __u8 frto_counter; /* Number of new acks after RTO */ - __u8 adv_cong; /* Using Vegas, Westwood, or BIC */ + __u8 unused; __u8 defer_accept; /* User waits for some data after accept() */ /* RTT measurement */ @@ -401,37 +394,10 @@ struct tcp_sock { __u32 time; } rcvq_space; -/* TCP Westwood structure */ - struct { - __u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ - __u32 bw_est; /* bandwidth estimate */ - __u32 rtt_win_sx; /* here starts a new evaluation... 
*/ - __u32 bk; - __u32 snd_una; /* used for evaluating the number of acked bytes */ - __u32 cumul_ack; - __u32 accounted; - __u32 rtt; - __u32 rtt_min; /* minimum observed RTT */ - } westwood; - -/* Vegas variables */ - struct { - __u32 beg_snd_nxt; /* right edge during last RTT */ - __u32 beg_snd_una; /* left edge during last RTT */ - __u32 beg_snd_cwnd; /* saves the size of the cwnd */ - __u8 doing_vegas_now;/* if true, do vegas for this RTT */ - __u16 cntRTT; /* # of RTTs measured within last RTT */ - __u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ - __u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ - } vegas; - - /* BI TCP Parameters */ - struct { - __u32 cnt; /* increase cwnd by 1 after this number of ACKs */ - __u32 last_max_cwnd; /* last maximium snd_cwnd */ - __u32 last_cwnd; /* the last snd_cwnd */ - __u32 last_stamp; /* time when updated last_cwnd */ - } bictcp; + /* Pluggable TCP congestion control hook */ + struct tcp_congestion_ops *ca_ops; + u32 ca_priv[16]; +#define TCP_CA_PRIV_SIZE (16*sizeof(u32)) }; static inline struct tcp_sock *tcp_sk(const struct sock *sk) @@ -439,6 +405,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk) return (struct tcp_sock *)sk; } +static inline void *tcp_ca(const struct tcp_sock *tp) +{ + return (void *) tp->ca_priv; +} + #endif #endif /* _LINUX_TCP_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index f730935b824a..e427cf35915c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -505,25 +505,6 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk) #else # define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG) #endif - -#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation - * max_cwnd = snd_cwnd * beta - */ -#define BICTCP_MAX_INCREMENT 32 /* - * Limit on the amount of - * increment allowed during - * binary search. 
- */ -#define BICTCP_FUNC_OF_MIN_INCR 11 /* - * log(B/Smin)/log(B/(B-1))+1, - * Smin:min increment - * B:log factor - */ -#define BICTCP_B 4 /* - * In binary search, - * go to point (max+min)/N - */ - /* * TCP option */ @@ -596,16 +577,7 @@ extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_low_latency; -extern int sysctl_tcp_westwood; -extern int sysctl_tcp_vegas_cong_avoid; -extern int sysctl_tcp_vegas_alpha; -extern int sysctl_tcp_vegas_beta; -extern int sysctl_tcp_vegas_gamma; extern int sysctl_tcp_nometrics_save; -extern int sysctl_tcp_bic; -extern int sysctl_tcp_bic_fast_convergence; -extern int sysctl_tcp_bic_low_window; -extern int sysctl_tcp_bic_beta; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; @@ -1136,6 +1108,80 @@ static inline void tcp_packets_out_dec(struct tcp_sock *tp, tp->packets_out -= tcp_skb_pcount(skb); } +/* Events passed to congestion control interface */ +enum tcp_ca_event { + CA_EVENT_TX_START, /* first transmit when no packets in flight */ + CA_EVENT_CWND_RESTART, /* congestion window restart */ + CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ + CA_EVENT_FRTO, /* fast recovery timeout */ + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_FAST_ACK, /* in sequence ack */ + CA_EVENT_SLOW_ACK, /* other ack */ +}; + +/* + * Interface for adding new TCP congestion control handlers + */ +#define TCP_CA_NAME_MAX 16 +struct tcp_congestion_ops { + struct list_head list; + + /* initialize private data (optional) */ + void (*init)(struct tcp_sock *tp); + /* cleanup private data (optional) */ + void (*release)(struct tcp_sock *tp); + + /* return slow start threshold (required) */ + u32 (*ssthresh)(struct tcp_sock *tp); + /* lower bound for congestion window (optional) */ + u32 (*min_cwnd)(struct tcp_sock *tp); + /* do new cwnd calculation (required) */ + void (*cong_avoid)(struct tcp_sock *tp, u32 ack, + u32 rtt, u32 in_flight, int good_ack); + /* round trip time sample per acked packet (optional) */ + void (*rtt_sample)(struct tcp_sock *tp, u32 usrtt); + /* call before changing ca_state (optional) */ + void (*set_state)(struct tcp_sock *tp, u8 new_state); + /* call when cwnd event occurs (optional) */ + void (*cwnd_event)(struct tcp_sock *tp, enum tcp_ca_event ev); + /* new value of cwnd after loss (optional) */ + u32 (*undo_cwnd)(struct tcp_sock *tp); + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct tcp_sock *tp, u32 num_acked); + /* get info for tcp_diag (optional) */ + void (*get_info)(struct tcp_sock *tp, u32 ext, struct sk_buff *skb); + + char name[TCP_CA_NAME_MAX]; + struct module *owner; +}; + +extern int tcp_register_congestion_control(struct tcp_congestion_ops *type); +extern void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); + +extern void tcp_init_congestion_control(struct tcp_sock *tp); +extern void tcp_cleanup_congestion_control(struct tcp_sock *tp); +extern int tcp_set_default_congestion_control(const char *name); +extern void tcp_get_default_congestion_control(char *name); + +extern struct tcp_congestion_ops tcp_reno; +extern u32 tcp_reno_ssthresh(struct tcp_sock *tp); +extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, + u32 rtt, u32 in_flight, int flag); +extern u32 tcp_reno_min_cwnd(struct tcp_sock *tp); + +static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state) +{ + if (tp->ca_ops->set_state) + tp->ca_ops->set_state(tp, ca_state); + tp->ca_state = ca_state; +} + +static inline 
void tcp_ca_event(struct tcp_sock *tp, enum tcp_ca_event event) +{ + if (tp->ca_ops->cwnd_event) + tp->ca_ops->cwnd_event(tp, event); +} + /* This determines how many packets are "in the network" to the best * of our knowledge. In many cases it is conservative, but where * detailed information is available from the receiver (via SACK @@ -1155,91 +1201,6 @@ static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) return (tp->packets_out - tp->left_out + tp->retrans_out); } -/* - * Which congestion algorithim is in use on the connection. - */ -#define tcp_is_vegas(__tp) ((__tp)->adv_cong == TCP_VEGAS) -#define tcp_is_westwood(__tp) ((__tp)->adv_cong == TCP_WESTWOOD) -#define tcp_is_bic(__tp) ((__tp)->adv_cong == TCP_BIC) - -/* Recalculate snd_ssthresh, we want to set it to: - * - * Reno: - * one half the current congestion window, but no - * less than two segments - * - * BIC: - * behave like Reno until low_window is reached, - * then increase congestion window slowly - */ -static inline __u32 tcp_recalc_ssthresh(struct tcp_sock *tp) -{ - if (tcp_is_bic(tp)) { - if (sysctl_tcp_bic_fast_convergence && - tp->snd_cwnd < tp->bictcp.last_max_cwnd) - tp->bictcp.last_max_cwnd = (tp->snd_cwnd * - (BICTCP_BETA_SCALE - + sysctl_tcp_bic_beta)) - / (2 * BICTCP_BETA_SCALE); - else - tp->bictcp.last_max_cwnd = tp->snd_cwnd; - - if (tp->snd_cwnd > sysctl_tcp_bic_low_window) - return max((tp->snd_cwnd * sysctl_tcp_bic_beta) - / BICTCP_BETA_SCALE, 2U); - } - - return max(tp->snd_cwnd >> 1U, 2U); -} - -/* Stop taking Vegas samples for now. */ -#define tcp_vegas_disable(__tp) ((__tp)->vegas.doing_vegas_now = 0) - -static inline void tcp_vegas_enable(struct tcp_sock *tp) -{ - /* There are several situations when we must "re-start" Vegas: - * - * o when a connection is established - * o after an RTO - * o after fast recovery - * o when we send a packet and there is no outstanding - * unacknowledged data (restarting an idle connection) - * - * In these circumstances we cannot do a Vegas calculation at the - * end of the first RTT, because any calculation we do is using - * stale info -- both the saved cwnd and congestion feedback are - * stale. - * - * Instead we must wait until the completion of an RTT during - * which we actually receive ACKs. - */ - - /* Begin taking Vegas samples next time we send something. */ - tp->vegas.doing_vegas_now = 1; - - /* Set the beginning of the next send window. */ - tp->vegas.beg_snd_nxt = tp->snd_nxt; - - tp->vegas.cntRTT = 0; - tp->vegas.minRTT = 0x7fffffff; -} - -/* Should we be taking Vegas samples right now? */ -#define tcp_vegas_enabled(__tp) ((__tp)->vegas.doing_vegas_now) - -extern void tcp_ca_init(struct tcp_sock *tp); - -static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state) -{ - if (tcp_is_vegas(tp)) { - if (ca_state == TCP_CA_Open) - tcp_vegas_enable(tp); - else - tcp_vegas_disable(tp); - } - tp->ca_state = ca_state; -} - /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd. * The exception is rate halving phase, when cwnd is decreasing towards * ssthresh. 
@@ -1288,7 +1249,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) static inline void __tcp_enter_cwr(struct tcp_sock *tp) { tp->undo_marker = 0; - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1U); tp->snd_cwnd_cnt = 0; @@ -1876,52 +1837,4 @@ struct tcp_iter_state { extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo); extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo); -/* TCP Westwood functions and constants */ - -#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ -#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ - -static inline void tcp_westwood_update_rtt(struct tcp_sock *tp, __u32 rtt_seq) -{ - if (tcp_is_westwood(tp)) - tp->westwood.rtt = rtt_seq; -} - -static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_sock *tp) -{ - return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) / - (__u32) (tp->mss_cache_std), - 2U); -} - -static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_sock *tp) -{ - return tcp_is_westwood(tp) ? __tcp_westwood_bw_rttmin(tp) : 0; -} - -static inline int tcp_westwood_ssthresh(struct tcp_sock *tp) -{ - __u32 ssthresh = 0; - - if (tcp_is_westwood(tp)) { - ssthresh = __tcp_westwood_bw_rttmin(tp); - if (ssthresh) - tp->snd_ssthresh = ssthresh; - } - - return (ssthresh != 0); -} - -static inline int tcp_westwood_cwnd(struct tcp_sock *tp) -{ - __u32 cwnd = 0; - - if (tcp_is_westwood(tp)) { - cwnd = __tcp_westwood_bw_rttmin(tp); - if (cwnd) - tp->snd_cwnd = cwnd; - } - - return (cwnd != 0); -} #endif /* _TCP_H */ diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 65d57d8e1add..89c0b4cb470e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -5,7 +5,8 @@ obj-y := utils.o route.o inetpeer.o protocol.o \ ip_input.o ip_fragment.o ip_forward.o ip_options.o \ ip_output.o ip_sockglue.o \ - tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ + tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ + tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ sysctl_net_ipv4.o fib_frontend.o fib_semantics.o diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 23068bddbf0b..e32894532416 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -118,6 +118,45 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table, return 1; } +static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char val[TCP_CA_NAME_MAX]; + ctl_table tbl = { + .data = val, + .maxlen = TCP_CA_NAME_MAX, + }; + int ret; + + tcp_get_default_congestion_control(val); + + ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); + if (write && ret == 0) + ret = tcp_set_default_congestion_control(val); + return ret; +} + +int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) +{ + char val[TCP_CA_NAME_MAX]; + ctl_table tbl = { + .data = val, + .maxlen = TCP_CA_NAME_MAX, + }; + int ret; + + tcp_get_default_congestion_control(val); + ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen, + context); + if (ret == 0 && newval && newlen) + ret = tcp_set_default_congestion_control(val); + return ret; +} + + ctl_table ipv4_table[] = { { .ctl_name = NET_IPV4_TCP_TIMESTAMPS, @@ -611,70 +650,6 @@ ctl_table 
ipv4_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, - { - .ctl_name = NET_TCP_WESTWOOD, - .procname = "tcp_westwood", - .data = &sysctl_tcp_westwood, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS, - .procname = "tcp_vegas_cong_avoid", - .data = &sysctl_tcp_vegas_cong_avoid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS_ALPHA, - .procname = "tcp_vegas_alpha", - .data = &sysctl_tcp_vegas_alpha, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS_BETA, - .procname = "tcp_vegas_beta", - .data = &sysctl_tcp_vegas_beta, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS_GAMMA, - .procname = "tcp_vegas_gamma", - .data = &sysctl_tcp_vegas_gamma, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_BIC, - .procname = "tcp_bic", - .data = &sysctl_tcp_bic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE, - .procname = "tcp_bic_fast_convergence", - .data = &sysctl_tcp_bic_fast_convergence, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_BIC_LOW_WINDOW, - .procname = "tcp_bic_low_window", - .data = &sysctl_tcp_bic_low_window, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, { .ctl_name = NET_TCP_MODERATE_RCVBUF, .procname = "tcp_moderate_rcvbuf", @@ -692,13 +667,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec, }, { - .ctl_name = NET_TCP_BIC_BETA, - .procname = "tcp_bic_beta", - .data = &sysctl_tcp_bic_beta, - .maxlen = sizeof(int), + .ctl_name = NET_TCP_CONG_CONTROL, + .procname = "tcp_congestion_control", .mode = 0644, - .proc_handler = &proc_dointvec, + .maxlen = TCP_CA_NAME_MAX, + .proc_handler = &proc_tcp_congestion_control, + .strategy = &sysctl_tcp_congestion_control, }, + { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 674bbd8cfd36..f3dbc8dc1263 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2333,6 +2333,8 @@ void __init tcp_init(void) printk(KERN_INFO "TCP: Hash tables configured " "(established %d bind %d)\n", tcp_ehash_size << 1, tcp_bhash_size); + + tcp_register_congestion_control(&tcp_reno); } EXPORT_SYMBOL(tcp_accept); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c new file mode 100644 index 000000000000..665394a63ae4 --- /dev/null +++ b/net/ipv4/tcp_cong.c @@ -0,0 +1,195 @@ +/* + * Plugable TCP congestion control support and newReno + * congestion control. + * Based on ideas from I/O scheduler suport and Web100. + * + * Copyright (C) 2005 Stephen Hemminger + */ + +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(tcp_cong_list_lock); +static LIST_HEAD(tcp_cong_list); + +/* Simple linear search, don't expect many entries! */ +static struct tcp_congestion_ops *tcp_ca_find(const char *name) +{ + struct tcp_congestion_ops *e; + + list_for_each_entry(e, &tcp_cong_list, list) { + if (strcmp(e->name, name) == 0) + return e; + } + + return NULL; +} + +/* + * Attach new congestion control algorthim to the list + * of available options. 
+ */ +int tcp_register_congestion_control(struct tcp_congestion_ops *ca) +{ + int ret = 0; + + /* all algorithms must implement ssthresh and cong_avoid ops */ + if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) { + printk(KERN_ERR "TCP %s does not implement required ops\n", + ca->name); + return -EINVAL; + } + + spin_lock(&tcp_cong_list_lock); + if (tcp_ca_find(ca->name)) { + printk(KERN_NOTICE "TCP %s already registered\n", ca->name); + ret = -EEXIST; + } else { + list_add_rcu(&ca->list, &tcp_cong_list); + printk(KERN_INFO "TCP %s registered\n", ca->name); + } + spin_unlock(&tcp_cong_list_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tcp_register_congestion_control); + +/* + * Remove congestion control algorithm, called from + * the module's remove function. Module ref counts are used + * to ensure that this can't be done till all sockets using + * that method are closed. + */ +void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) +{ + spin_lock(&tcp_cong_list_lock); + list_del_rcu(&ca->list); + spin_unlock(&tcp_cong_list_lock); +} +EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); + +/* Assign choice of congestion control. */ +void tcp_init_congestion_control(struct tcp_sock *tp) +{ + struct tcp_congestion_ops *ca; + + rcu_read_lock(); + list_for_each_entry_rcu(ca, &tcp_cong_list, list) { + if (try_module_get(ca->owner)) { + tp->ca_ops = ca; + break; + } + + } + rcu_read_unlock(); + + if (tp->ca_ops->init) + tp->ca_ops->init(tp); +} + +/* Manage refcounts on socket close. */ +void tcp_cleanup_congestion_control(struct tcp_sock *tp) +{ + if (tp->ca_ops->release) + tp->ca_ops->release(tp); + module_put(tp->ca_ops->owner); +} + +/* Used by sysctl to change default congestion control */ +int tcp_set_default_congestion_control(const char *name) +{ + struct tcp_congestion_ops *ca; + int ret = -ENOENT; + + spin_lock(&tcp_cong_list_lock); + ca = tcp_ca_find(name); +#ifdef CONFIG_KMOD + if (!ca) { + spin_unlock(&tcp_cong_list_lock); + + request_module("tcp_%s", name); + spin_lock(&tcp_cong_list_lock); + ca = tcp_ca_find(name); + } +#endif + + if (ca) { + list_move(&ca->list, &tcp_cong_list); + ret = 0; + } + spin_unlock(&tcp_cong_list_lock); + + return ret; +} + +/* Get current default congestion control */ +void tcp_get_default_congestion_control(char *name) +{ + struct tcp_congestion_ops *ca; + /* We will always have reno... */ + BUG_ON(list_empty(&tcp_cong_list)); + + rcu_read_lock(); + ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); + strncpy(name, ca->name, TCP_CA_NAME_MAX); + rcu_read_unlock(); +} + +/* + * TCP Reno congestion control + * This is special case used for fallback as well. + */ +/* This is Jacobson's slow start and congestion avoidance. + * SIGCOMM '88, p. 328. + */ +void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, + int flag) +{ + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + /* In dangerous area, increase slowly. 
+ * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } +} +EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); + +/* Slow start threshold is half the congestion window (min 2) */ +u32 tcp_reno_ssthresh(struct tcp_sock *tp) +{ + return max(tp->snd_cwnd >> 1U, 2U); +} +EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); + +/* Lower bound on congestion window. */ +u32 tcp_reno_min_cwnd(struct tcp_sock *tp) +{ + return tp->snd_ssthresh/2; +} +EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); + +struct tcp_congestion_ops tcp_reno = { + .name = "reno", + .owner = THIS_MODULE, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, +}; + +EXPORT_SYMBOL_GPL(tcp_reno); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 634befc07921..867acc0f79d8 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -42,7 +42,6 @@ struct tcpdiag_entry static struct sock *tcpnl; - #define TCPDIAG_PUT(skb, attrtype, attrlen) \ ({ int rtalen = RTA_LENGTH(attrlen); \ struct rtattr *rta; \ @@ -61,7 +60,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, struct nlmsghdr *nlh; struct tcp_info *info = NULL; struct tcpdiag_meminfo *minfo = NULL; - struct tcpvegas_info *vinfo = NULL; unsigned char *b = skb->tail; nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); @@ -73,9 +71,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (ext & (1<<(TCPDIAG_INFO-1))) info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); - if ((tcp_is_westwood(tp) || tcp_is_vegas(tp)) - && (ext & (1<<(TCPDIAG_VEGASINFO-1)))) - vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo)); } r->tcpdiag_family = sk->sk_family; r->tcpdiag_state = sk->sk_state; @@ -166,19 +161,8 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (info) tcp_get_info(sk, info); - if (vinfo) { - if (tcp_is_vegas(tp)) { - vinfo->tcpv_enabled = tp->vegas.doing_vegas_now; - vinfo->tcpv_rttcnt = tp->vegas.cntRTT; - vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT); - vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT); - } else { - vinfo->tcpv_enabled = 0; - vinfo->tcpv_rttcnt = 0; - vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt); - vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min); - } - } + if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info) + tp->ca_ops->get_info(tp, ext, skb); nlh->nlmsg_len = skb->tail - b; return skb->len; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5bad504630a3..7bbbbc33eb4b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -61,7 +61,6 @@ * Panu Kuhlberg: Experimental audit of TCP (re)transmission * engine. Lots of bugs are found. * Pasi Sarolahti: F-RTO for dealing with spurious RTOs - * Angelo Dell'Aera: TCP Westwood+ support */ #include @@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337; int sysctl_tcp_max_orphans = NR_FILE; int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; -int sysctl_tcp_westwood; -int sysctl_tcp_vegas_cong_avoid; int sysctl_tcp_moderate_rcvbuf = 1; -/* Default values of the Vegas variables, in fixed-point representation - * with V_PARAM_SHIFT bits to the right of the binary point. 
- */ -#define V_PARAM_SHIFT 1 -int sysctl_tcp_vegas_alpha = 1<snd_cwnd_stamp = tcp_time_stamp; } -static void init_bictcp(struct tcp_sock *tp) -{ - tp->bictcp.cnt = 0; - - tp->bictcp.last_max_cwnd = 0; - tp->bictcp.last_cwnd = 0; - tp->bictcp.last_stamp = 0; -} - /* 5. Recalculate window clamp after socket hit its memory bounds. */ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) { @@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ tcp_grow_window(sk, tp, skb); } -/* When starting a new connection, pin down the current choice of - * congestion algorithm. - */ -void tcp_ca_init(struct tcp_sock *tp) -{ - if (sysctl_tcp_westwood) - tp->adv_cong = TCP_WESTWOOD; - else if (sysctl_tcp_bic) - tp->adv_cong = TCP_BIC; - else if (sysctl_tcp_vegas_cong_avoid) { - tp->adv_cong = TCP_VEGAS; - tp->vegas.baseRTT = 0x7fffffff; - tcp_vegas_enable(tp); - } -} - -/* Do RTT sampling needed for Vegas. - * Basically we: - * o min-filter RTT samples from within an RTT to get the current - * propagation delay + queuing delay (we are min-filtering to try to - * avoid the effects of delayed ACKs) - * o min-filter RTT samples from a much longer window (forever for now) - * to find the propagation delay (baseRTT) - */ -static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) -{ - __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */ - - /* Filter to find propagation delay: */ - if (vrtt < tp->vegas.baseRTT) - tp->vegas.baseRTT = vrtt; - - /* Find the min RTT during the last RTT to find - * the current prop. delay + queuing delay: - */ - tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt); - tp->vegas.cntRTT++; -} - /* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge @@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) +static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) { long m = mrtt; /* RTT */ - if (tcp_vegas_enabled(tp)) - vegas_rtt_calc(tp, mrtt); - /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. @@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) tp->rtt_seq = tp->snd_nxt; } - tcp_westwood_update_rtt(tp, tp->srtt >> 3); + if (tp->ca_ops->rtt_sample) + tp->ca_ops->rtt_sample(tp, *usrtt); } /* Calculate rto without backoff. 
This is the second half of Van Jacobson's @@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk) tp->snd_una == tp->high_seq || (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); - if (!tcp_westwood_ssthresh(tp)) - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); + tcp_ca_event(tp, CA_EVENT_FRTO); } /* Have to clear retransmission markers here to keep the bookkeeping @@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk) tcp_set_ca_state(tp, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); - - init_bictcp(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how) if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); + tcp_ca_event(tp, CA_EVENT_LOSS); } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; @@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) } /* Decrease cwnd each second ack. */ - static void tcp_cwnd_down(struct tcp_sock *tp) { int decr = tp->snd_cwnd_cnt + 1; - __u32 limit; - - /* - * TCP Westwood - * Here limit is evaluated as BWestimation*RTTmin (for obtaining it - * in packets we use mss_cache). If sysctl_tcp_westwood is off - * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is - * still used as usual. It prevents other strange cases in which - * BWE*RTTmin could assume value 0. It should not happen but... - */ - - if (!(limit = tcp_westwood_bw_rttmin(tp))) - limit = tp->snd_ssthresh/2; tp->snd_cwnd_cnt = decr&1; decr >>= 1; - if (decr && tp->snd_cwnd > limit) + if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp)) tp->snd_cwnd -= decr; tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); @@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) static void tcp_undo_cwr(struct tcp_sock *tp, int undo) { if (tp->prior_ssthresh) { - if (tcp_is_bic(tp)) - tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); + if (tp->ca_ops->undo_cwnd) + tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp); else tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); @@ -1767,11 +1687,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) static inline void tcp_complete_cwr(struct tcp_sock *tp) { - if (tcp_westwood_cwnd(tp)) - tp->snd_ssthresh = tp->snd_cwnd; - else - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); tp->snd_cwnd_stamp = tcp_time_stamp; + tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR); } static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) @@ -1946,7 +1864,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, if (tp->ca_state < TCP_CA_CWR) { if (!(flag&FLAG_ECE)) tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); TCP_ECN_queue_cwr(tp); } @@ -1963,7 +1881,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* Read draft-ietf-tcplw-high-performance before mucking * with this code. 
(Superceeds RFC1323) */ -static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) +static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) { __u32 seq_rtt; @@ -1983,13 +1901,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) * in window is lost... Voila. --ANK (010210) */ seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - tcp_rtt_estimator(tp, seq_rtt); + tcp_rtt_estimator(tp, seq_rtt, usrtt); tcp_set_rto(tp); tp->backoff = 0; tcp_bound_rto(tp); } -static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) +static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag) { /* We don't have a timestamp. Can only use * packets that are not retransmitted to determine @@ -2003,338 +1921,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) if (flag & FLAG_RETRANS_DATA_ACKED) return; - tcp_rtt_estimator(tp, seq_rtt); + tcp_rtt_estimator(tp, seq_rtt, usrtt); tcp_set_rto(tp); tp->backoff = 0; tcp_bound_rto(tp); } static inline void tcp_ack_update_rtt(struct tcp_sock *tp, - int flag, s32 seq_rtt) + int flag, s32 seq_rtt, u32 *usrtt) { /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tcp_ack_saw_tstamp(tp, flag); + tcp_ack_saw_tstamp(tp, usrtt, flag); else if (seq_rtt >= 0) - tcp_ack_no_tstamp(tp, seq_rtt, flag); + tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag); } -/* - * Compute congestion window to use. - * - * This is from the implementation of BICTCP in - * Lison-Xu, Kahaled Harfoush, and Injog Rhee. - * "Binary Increase Congestion Control for Fast, Long Distance - * Networks" in InfoComm 2004 - * Available from: - * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf - * - * Unless BIC is enabled and congestion window is large - * this behaves the same as the original Reno. - */ -static inline __u32 bictcp_cwnd(struct tcp_sock *tp) -{ - /* orignal Reno behaviour */ - if (!tcp_is_bic(tp)) - return tp->snd_cwnd; - - if (tp->bictcp.last_cwnd == tp->snd_cwnd && - (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5)) - return tp->bictcp.cnt; - - tp->bictcp.last_cwnd = tp->snd_cwnd; - tp->bictcp.last_stamp = tcp_time_stamp; - - /* start off normal */ - if (tp->snd_cwnd <= sysctl_tcp_bic_low_window) - tp->bictcp.cnt = tp->snd_cwnd; - - /* binary increase */ - else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) { - __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd) - / BICTCP_B; - - if (dist > BICTCP_MAX_INCREMENT) - /* linear increase */ - tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; - else if (dist <= 1U) - /* binary search increase */ - tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR - / BICTCP_B; - else - /* binary search increase */ - tp->bictcp.cnt = tp->snd_cwnd / dist; - } else { - /* slow start amd linear increase */ - if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B) - /* slow start */ - tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR - / BICTCP_B; - else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd - + BICTCP_MAX_INCREMENT*(BICTCP_B-1)) - /* slow start */ - tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1) - / (tp->snd_cwnd-tp->bictcp.last_max_cwnd); - else - /* linear increase */ - tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; - } - return tp->bictcp.cnt; -} - -/* This is Jacobson's slow start and congestion avoidance. - * SIGCOMM '88, p. 328. 
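/*
 * Illustrative aside (hypothetical helper, not kernel code): a
 * self-contained reduction of the binary-search step in the removed
 * bictcp_cwnd() above, with the slow-start and smoothing special cases
 * left out.  cnt is the number of ACKs needed before cwnd may grow by
 * one segment, so growth is fast far below the last loss point and
 * slows as cwnd approaches it.
 */
#include <stdio.h>

#define BICTCP_B		4
#define BICTCP_MAX_INCREMENT	32

static unsigned int bic_cnt(unsigned int cwnd, unsigned int last_max_cwnd)
{
	unsigned int dist;

	if (cwnd >= last_max_cwnd)		/* past the old loss point */
		return cwnd / BICTCP_MAX_INCREMENT;

	dist = (last_max_cwnd - cwnd) / BICTCP_B;
	if (dist > BICTCP_MAX_INCREMENT)	/* far away: linear probing */
		return cwnd / BICTCP_MAX_INCREMENT;
	if (dist == 0)
		dist = 1;
	return cwnd / dist;		/* binary search toward midpoint */
}

int main(void)
{
	unsigned int w;

	/* approaching a loss point of 200 segments: cnt 4, 6, 9, 16, 36 */
	for (w = 100; w <= 180; w += 20)
		printf("cwnd %3u -> cnt %u\n", w, bic_cnt(w, 200));
	return 0;
}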
- */ -static inline void reno_cong_avoid(struct tcp_sock *tp) +static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, + u32 in_flight, int good) { - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { - /* In dangerous area, increase slowly. - * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt=0; - } else - tp->snd_cwnd_cnt++; - } + tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good); tp->snd_cwnd_stamp = tcp_time_stamp; } -/* This is based on the congestion detection/avoidance scheme described in - * Lawrence S. Brakmo and Larry L. Peterson. - * "TCP Vegas: End to end congestion avoidance on a global internet." - * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, - * October 1995. Available from: - * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps - * - * See http://www.cs.arizona.edu/xkernel/ for their implementation. - * The main aspects that distinguish this implementation from the - * Arizona Vegas implementation are: - * o We do not change the loss detection or recovery mechanisms of - * Linux in any way. Linux already recovers from losses quite well, - * using fine-grained timers, NewReno, and FACK. - * o To avoid the performance penalty imposed by increasing cwnd - * only every-other RTT during slow start, we increase during - * every RTT during slow start, just like Reno. - * o Largely to allow continuous cwnd growth during slow start, - * we use the rate at which ACKs come back as the "actual" - * rate, rather than the rate at which data is sent. - * o To speed convergence to the right rate, we set the cwnd - * to achieve the right ("actual") rate when we exit slow start. - * o To filter out the noise caused by delayed ACKs, we use the - * minimum RTT sample observed during the last RTT to calculate - * the actual rate. - * o When the sender re-starts from idle, it waits until it has - * received ACKs for an entire flight of new data before making - * a cwnd adjustment decision. The original Vegas implementation - * assumed senders never went idle. - */ -static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) -{ - /* The key players are v_beg_snd_una and v_beg_snd_nxt. - * - * These are so named because they represent the approximate values - * of snd_una and snd_nxt at the beginning of the current RTT. More - * precisely, they represent the amount of data sent during the RTT. - * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, - * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding - * bytes of data have been ACKed during the course of the RTT, giving - * an "actual" rate of: - * - * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) - * - * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, - * because delayed ACKs can cover more than one segment, so they - * don't line up nicely with the boundaries of RTTs. - * - * Another unfortunate fact of life is that delayed ACKs delay the - * advance of the left edge of our send window, so that the number - * of bytes we send in an RTT is often less than our cwnd will allow. - * So we keep track of our cwnd separately, in v_beg_snd_cwnd. - */ - - if (after(ack, tp->vegas.beg_snd_nxt)) { - /* Do the Vegas once-per-RTT cwnd adjustment. 
*/ - u32 old_wnd, old_snd_cwnd; - - - /* Here old_wnd is essentially the window of data that was - * sent during the previous RTT, and has all - * been acknowledged in the course of the RTT that ended - * with the ACK we just received. Likewise, old_snd_cwnd - * is the cwnd during the previous RTT. - */ - old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) / - tp->mss_cache_std; - old_snd_cwnd = tp->vegas.beg_snd_cwnd; - - /* Save the extent of the current window so we can use this - * at the end of the next RTT. - */ - tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt; - tp->vegas.beg_snd_nxt = tp->snd_nxt; - tp->vegas.beg_snd_cwnd = tp->snd_cwnd; - - /* Take into account the current RTT sample too, to - * decrease the impact of delayed acks. This double counts - * this sample since we count it for the next window as well, - * but that's not too awful, since we're taking the min, - * rather than averaging. - */ - vegas_rtt_calc(tp, seq_rtt); - - /* We do the Vegas calculations only if we got enough RTT - * samples that we can be reasonably sure that we got - * at least one RTT sample that wasn't from a delayed ACK. - * If we only had 2 samples total, - * then that means we're getting only 1 ACK per RTT, which - * means they're almost certainly delayed ACKs. - * If we have 3 samples, we should be OK. - */ - - if (tp->vegas.cntRTT <= 2) { - /* We don't have enough RTT samples to do the Vegas - * calculation, so we'll behave like Reno. - */ - if (tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd++; - } else { - u32 rtt, target_cwnd, diff; - - /* We have enough RTT samples, so, using the Vegas - * algorithm, we determine if we should increase or - * decrease cwnd, and by how much. - */ - - /* Pluck out the RTT we are using for the Vegas - * calculations. This is the min RTT seen during the - * last RTT. Taking the min filters out the effects - * of delayed ACKs, at the cost of noticing congestion - * a bit later. - */ - rtt = tp->vegas.minRTT; - - /* Calculate the cwnd we should have, if we weren't - * going too fast. - * - * This is: - * (actual rate in segments) * baseRTT - * We keep it as a fixed point number with - * V_PARAM_SHIFT bits to the right of the binary point. - */ - target_cwnd = ((old_wnd * tp->vegas.baseRTT) - << V_PARAM_SHIFT) / rtt; - - /* Calculate the difference between the window we had, - * and the window we would like to have. This quantity - * is the "Diff" from the Arizona Vegas papers. - * - * Again, this is a fixed point number with - * V_PARAM_SHIFT bits to the right of the binary - * point. - */ - diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; - - if (tp->snd_cwnd < tp->snd_ssthresh) { - /* Slow start. */ - if (diff > sysctl_tcp_vegas_gamma) { - /* Going too fast. Time to slow down - * and switch to congestion avoidance. - */ - tp->snd_ssthresh = 2; - - /* Set cwnd to match the actual rate - * exactly: - * cwnd = (actual rate) * baseRTT - * Then we add 1 because the integer - * truncation robs us of full link - * utilization. - */ - tp->snd_cwnd = min(tp->snd_cwnd, - (target_cwnd >> - V_PARAM_SHIFT)+1); - - } - } else { - /* Congestion avoidance. */ - u32 next_snd_cwnd; - - /* Figure out where we would like cwnd - * to be. - */ - if (diff > sysctl_tcp_vegas_beta) { - /* The old window was too fast, so - * we slow down. - */ - next_snd_cwnd = old_snd_cwnd - 1; - } else if (diff < sysctl_tcp_vegas_alpha) { - /* We don't have enough extra packets - * in the network, so speed up. 
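/*
 * Illustrative aside (not kernel code): the fixed-point "Diff"
 * computation described above, worked through standalone.
 * V_PARAM_SHIFT = 1 as in the removed code; the traffic numbers are
 * invented, and beta = 3 segments is the assumed sysctl default in the
 * same fixed-point scale.
 */
#include <stdio.h>

#define V_PARAM_SHIFT 1

int main(void)
{
	unsigned int old_wnd = 20;	/* segments sent in the last RTT */
	unsigned int base_rtt = 100;	/* propagation delay, ms */
	unsigned int min_rtt = 125;	/* min RTT observed this RTT, ms */
	unsigned int beta = 3 << V_PARAM_SHIFT;

	unsigned int target_cwnd = ((old_wnd * base_rtt) << V_PARAM_SHIFT)
					/ min_rtt;
	unsigned int diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;

	/* target = 32/2 = 16 segments, diff = 8/2 = 4 extra segments
	 * queued in the network; diff > beta, so Vegas would lower cwnd.
	 */
	printf("target_cwnd=%u diff=%u beta=%u (fixed point)\n",
	       target_cwnd, diff, beta);
	return 0;
}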
- */ - next_snd_cwnd = old_snd_cwnd + 1; - } else { - /* Sending just as fast as we - * should be. - */ - next_snd_cwnd = old_snd_cwnd; - } - - /* Adjust cwnd upward or downward, toward the - * desired value. - */ - if (next_snd_cwnd > tp->snd_cwnd) - tp->snd_cwnd++; - else if (next_snd_cwnd < tp->snd_cwnd) - tp->snd_cwnd--; - } - } - - /* Wipe the slate clean for the next RTT. */ - tp->vegas.cntRTT = 0; - tp->vegas.minRTT = 0x7fffffff; - } - - /* The following code is executed for every ack we receive, - * except for conditions checked in should_advance_cwnd() - * before the call to tcp_cong_avoid(). Mainly this means that - * we only execute this code if the ack actually acked some - * data. - */ - - /* If we are in slow start, increase our cwnd in response to this ACK. - * (If we are not in slow start then we are in congestion avoidance, - * and adjust our congestion window only once per RTT. See the code - * above.) - */ - if (tp->snd_cwnd <= tp->snd_ssthresh) - tp->snd_cwnd++; - - /* to keep cwnd from growing without bound */ - tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); - - /* Make sure that we are never so timid as to reduce our cwnd below - * 2 MSS. - * - * Going below 2 MSS would risk huge delayed ACKs from our receiver. - */ - tp->snd_cwnd = max(tp->snd_cwnd, 2U); - - tp->snd_cwnd_stamp = tcp_time_stamp; -} - -static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) -{ - if (tcp_vegas_enabled(tp)) - vegas_cong_avoid(tp, ack, seq_rtt); - else - reno_cong_avoid(tp); -} - /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ @@ -2415,13 +2024,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, /* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) +static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; __u32 now = tcp_time_stamp; int acked = 0; __s32 seq_rtt = -1; + struct timeval usnow; + u32 pkts_acked = 0; + + if (seq_usrtt) + do_gettimeofday(&usnow); while ((skb = skb_peek(&sk->sk_write_queue)) && skb != sk->sk_send_head) { @@ -2448,6 +2062,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) */ if (!(scb->flags & TCPCB_FLAG_SYN)) { acked |= FLAG_DATA_ACKED; + ++pkts_acked; } else { acked |= FLAG_SYN_ACKED; tp->retrans_stamp = 0; @@ -2461,6 +2076,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) seq_rtt = -1; } else if (seq_rtt < 0) seq_rtt = now - scb->when; + if (seq_usrtt) + *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000 + + (usnow.tv_usec - skb->stamp.tv_usec); + if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); if (sacked & TCPCB_LOST) @@ -2479,8 +2098,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } if (acked&FLAG_ACKED) { - tcp_ack_update_rtt(tp, acked, seq_rtt); + tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt); tcp_ack_packets_out(sk, tp); + + if (tp->ca_ops->pkts_acked) + tp->ca_ops->pkts_acked(tp, pkts_acked); } #if FASTRETRANS_DEBUG > 0 @@ -2624,257 +2246,6 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) tp->frto_counter = (tp->frto_counter + 1) % 3; } -/* - * TCP Westwood+ - */ - -/* - * @init_westwood - * This function initializes fields used in TCP Westwood+. We can't - * get no information about RTTmin at this time so we simply set it to - * TCP_WESTWOOD_INIT_RTT. 
This value was chosen to be too conservative - * since in this way we're sure it will be updated in a consistent - * way as soon as possible. It will reasonably happen within the first - * RTT period of the connection lifetime. - */ - -static void init_westwood(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->westwood.bw_ns_est = 0; - tp->westwood.bw_est = 0; - tp->westwood.accounted = 0; - tp->westwood.cumul_ack = 0; - tp->westwood.rtt_win_sx = tcp_time_stamp; - tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT; - tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT; - tp->westwood.snd_una = tp->snd_una; -} - -/* - * @westwood_do_filter - * Low-pass filter. Implemented using constant coeffients. - */ - -static inline __u32 westwood_do_filter(__u32 a, __u32 b) -{ - return (((7 * a) + b) >> 3); -} - -static void westwood_filter(struct sock *sk, __u32 delta) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->westwood.bw_ns_est = - westwood_do_filter(tp->westwood.bw_ns_est, - tp->westwood.bk / delta); - tp->westwood.bw_est = - westwood_do_filter(tp->westwood.bw_est, - tp->westwood.bw_ns_est); -} - -/* - * @westwood_update_rttmin - * It is used to update RTTmin. In this case we MUST NOT use - * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN! - */ - -static inline __u32 westwood_update_rttmin(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - __u32 rttmin = tp->westwood.rtt_min; - - if (tp->westwood.rtt != 0 && - (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin)) - rttmin = tp->westwood.rtt; - - return rttmin; -} - -/* - * @westwood_acked - * Evaluate increases for dk. - */ - -static inline __u32 westwood_acked(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - - return tp->snd_una - tp->westwood.snd_una; -} - -/* - * @westwood_new_window - * It evaluates if we are receiving data inside the same RTT window as - * when we started. - * Return value: - * It returns 0 if we are still evaluating samples in the same RTT - * window, 1 if the sample has to be considered in the next window. - */ - -static int westwood_new_window(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - __u32 left_bound; - __u32 rtt; - int ret = 0; - - left_bound = tp->westwood.rtt_win_sx; - rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN); - - /* - * A RTT-window has passed. Be careful since if RTT is less than - * 50ms we don't filter but we continue 'building the sample'. - * This minimum limit was choosen since an estimation on small - * time intervals is better to avoid... - * Obvioulsy on a LAN we reasonably will always have - * right_bound = left_bound + WESTWOOD_RTT_MIN - */ - - if ((left_bound + rtt) < tcp_time_stamp) - ret = 1; - - return ret; -} - -/* - * @westwood_update_window - * It updates RTT evaluation window if it is the right moment to do - * it. If so it calls filter for evaluating bandwidth. - */ - -static void __westwood_update_window(struct sock *sk, __u32 now) -{ - struct tcp_sock *tp = tcp_sk(sk); - __u32 delta = now - tp->westwood.rtt_win_sx; - - if (delta) { - if (tp->westwood.rtt) - westwood_filter(sk, delta); - - tp->westwood.bk = 0; - tp->westwood.rtt_win_sx = tcp_time_stamp; - } -} - - -static void westwood_update_window(struct sock *sk, __u32 now) -{ - if (westwood_new_window(sk)) - __westwood_update_window(sk, now); -} - -/* - * @__tcp_westwood_fast_bw - * It is called when we are in fast path. In particular it is called when - * header prediction is successfull. 
In such case infact update is - * straight forward and doesn't need any particular care. - */ - -static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - westwood_update_window(sk, tcp_time_stamp); - - tp->westwood.bk += westwood_acked(sk); - tp->westwood.snd_una = tp->snd_una; - tp->westwood.rtt_min = westwood_update_rttmin(sk); -} - -static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) -{ - if (tcp_is_westwood(tcp_sk(sk))) - __tcp_westwood_fast_bw(sk, skb); -} - - -/* - * @westwood_dupack_update - * It updates accounted and cumul_ack when receiving a dupack. - */ - -static void westwood_dupack_update(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->westwood.accounted += tp->mss_cache_std; - tp->westwood.cumul_ack = tp->mss_cache_std; -} - -static inline int westwood_may_change_cumul(struct tcp_sock *tp) -{ - return (tp->westwood.cumul_ack > tp->mss_cache_std); -} - -static inline void westwood_partial_update(struct tcp_sock *tp) -{ - tp->westwood.accounted -= tp->westwood.cumul_ack; - tp->westwood.cumul_ack = tp->mss_cache_std; -} - -static inline void westwood_complete_update(struct tcp_sock *tp) -{ - tp->westwood.cumul_ack -= tp->westwood.accounted; - tp->westwood.accounted = 0; -} - -/* - * @westwood_acked_count - * This function evaluates cumul_ack for evaluating dk in case of - * delayed or partial acks. - */ - -static inline __u32 westwood_acked_count(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->westwood.cumul_ack = westwood_acked(sk); - - /* If cumul_ack is 0 this is a dupack since it's not moving - * tp->snd_una. - */ - if (!(tp->westwood.cumul_ack)) - westwood_dupack_update(sk); - - if (westwood_may_change_cumul(tp)) { - /* Partial or delayed ack */ - if (tp->westwood.accounted >= tp->westwood.cumul_ack) - westwood_partial_update(tp); - else - westwood_complete_update(tp); - } - - tp->westwood.snd_una = tp->snd_una; - - return tp->westwood.cumul_ack; -} - - -/* - * @__tcp_westwood_slow_bw - * It is called when something is going wrong..even if there could - * be no problems! Infact a simple delayed packet may trigger a - * dupack. But we need to be careful in such case. - */ - -static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - westwood_update_window(sk, tcp_time_stamp); - - tp->westwood.bk += westwood_acked_count(sk); - tp->westwood.rtt_min = westwood_update_rttmin(sk); -} - -static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) -{ - if (tcp_is_westwood(tcp_sk(sk))) - __tcp_westwood_slow_bw(sk, skb); -} - /* This routine deals with incoming acks, but not outgoing ones. 
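/*
 * Illustrative aside (invented values, not kernel code): a compact
 * userspace model of the cumul_ack/accounted bookkeeping removed
 * above; the same logic reappears in net/ipv4/tcp_westwood.c later in
 * this series.
 */
#include <stdio.h>

static unsigned int mss = 1000;
static unsigned int accounted;

/* bytes this ACK contributes to the bandwidth sample bk */
static unsigned int acked_count(unsigned int cumul_ack)
{
	if (cumul_ack == 0) {		/* dupack: snd_una did not move */
		accounted += mss;
		cumul_ack = mss;
	}
	if (cumul_ack > mss) {		/* delayed or partial ack */
		if (accounted >= cumul_ack) {
			accounted -= cumul_ack;
			cumul_ack = mss;
		} else {
			cumul_ack -= accounted;
			accounted = 0;
		}
	}
	return cumul_ack;
}

int main(void)
{
	/* a dupack followed by a delayed ACK covering three segments */
	printf("dupack    -> %u bytes\n", acked_count(0));	/* 1000 */
	printf("3-seg ack -> %u bytes\n", acked_count(3000));	/* 2000 */
	return 0;
}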
*/ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) { @@ -2884,6 +2255,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 prior_in_flight; s32 seq_rtt; + s32 seq_usrtt = 0; int prior_packets; /* If the ack is newer than sent or older than previous acks @@ -2902,9 +2274,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) */ tcp_update_wl(tp, ack, ack_seq); tp->snd_una = ack; - tcp_westwood_fast_bw(sk, skb); flag |= FLAG_WIN_UPDATE; + tcp_ca_event(tp, CA_EVENT_FAST_ACK); + NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); } else { if (ack_seq != TCP_SKB_CB(skb)->end_seq) @@ -2920,7 +2293,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) flag |= FLAG_ECE; - tcp_westwood_slow_bw(sk,skb); + tcp_ca_event(tp, CA_EVENT_SLOW_ACK); } /* We passed data and got it acked, remove any soft error @@ -2935,22 +2308,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) prior_in_flight = tcp_packets_in_flight(tp); /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, &seq_rtt); + flag |= tcp_clean_rtx_queue(sk, &seq_rtt, + tp->ca_ops->rtt_sample ? &seq_usrtt : NULL); if (tp->frto_counter) tcp_process_frto(sk, prior_snd_una); if (tcp_ack_is_dubious(tp, flag)) { /* Advanve CWND, if state allows this. */ - if ((flag & FLAG_DATA_ACKED) && - (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && - tcp_may_raise_cwnd(tp, flag)) - tcp_cong_avoid(tp, ack, seq_rtt); + if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag)) + tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); } else { - if ((flag & FLAG_DATA_ACKED) && - (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) - tcp_cong_avoid(tp, ack, seq_rtt); + if ((flag & FLAG_DATA_ACKED)) + tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1); } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) @@ -4552,6 +3923,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_metrics(sk); + tcp_init_congestion_control(tp); + /* Prevent spurious tcp_cwnd_restart() on first data * packet. */ @@ -4708,9 +4081,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if(tp->af_specific->conn_request(sk, skb) < 0) return 1; - init_westwood(sk); - init_bictcp(tp); - /* Now we have several options: In theory there is * nothing else in the frame. KA9Q has an option to * send data with the syn, BSD accepts data with the @@ -4732,9 +4102,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; case TCP_SYN_SENT: - init_westwood(sk); - init_bictcp(tp); - queued = tcp_rcv_synsent_state_process(sk, skb, th, len); if (queued >= 0) return queued; @@ -4816,7 +4183,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(tp, 0); + tcp_ack_saw_tstamp(tp, 0, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -4828,6 +4195,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_metrics(sk); + tcp_init_congestion_control(tp); + /* Prevent spurious tcp_cwnd_restart() on * first data packet. 
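/*
 * Illustrative aside (names invented): the microsecond RTT sample that
 * tcp_clean_rtx_queue() above threads through to ca_ops->rtt_sample is
 * a plain timeval delta between transmit time and ACK time.  A minimal
 * userspace sketch of that arithmetic:
 */
#include <stdio.h>
#include <sys/time.h>

static long usrtt(const struct timeval *sent, const struct timeval *now)
{
	return (now->tv_sec - sent->tv_sec) * 1000000L +
	       (now->tv_usec - sent->tv_usec);
}

int main(void)
{
	struct timeval sent, now;

	gettimeofday(&sent, NULL);	/* skb->stamp, set at transmit */
	gettimeofday(&now, NULL);	/* taken once per ACK processed */
	printf("usrtt = %ld usec\n", usrtt(&sent, &now));
	return 0;
}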
*/ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2d41d5d6ad19..9122814c13ad 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2048,6 +2048,7 @@ static int tcp_v4_init_sock(struct sock *sk) tp->mss_cache_std = tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; + tp->ca_ops = &tcp_reno; sk->sk_state = TCP_CLOSE; @@ -2070,6 +2071,8 @@ int tcp_v4_destroy_sock(struct sock *sk) tcp_clear_xmit_timers(sk); + tcp_cleanup_congestion_control(tp); + /* Cleanup up the write buffer. */ sk_stream_writequeue_purge(sk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b3943e7562f3..f42a284164b7 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -774,6 +774,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->frto_counter = 0; newtp->frto_highmark = 0; + newtp->ca_ops = &tcp_reno; + tcp_set_ca_state(newtp, TCP_CA_Open); tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); @@ -842,8 +844,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, if (newtp->ecn_flags&TCP_ECN_OK) sock_set_flag(newsk, SOCK_NO_LARGESEND); - tcp_ca_init(newtp); - TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); } return newsk; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f17c6577e337..0e17c244875c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -111,8 +111,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) u32 restart_cwnd = tcp_init_cwnd(tp, dst); u32 cwnd = tp->snd_cwnd; - if (tcp_is_vegas(tp)) - tcp_vegas_enable(tp); + tcp_ca_event(tp, CA_EVENT_CWND_RESTART); tp->snd_ssthresh = tcp_current_ssthresh(tp); restart_cwnd = min(restart_cwnd, cwnd); @@ -280,6 +279,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) #define SYSCTL_FLAG_WSCALE 0x2 #define SYSCTL_FLAG_SACK 0x4 + /* If congestion control is doing timestamping */ + if (tp->ca_ops->rtt_sample) + do_gettimeofday(&skb->stamp); + sysctl_flags = 0; if (tcb->flags & TCPCB_FLAG_SYN) { tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; @@ -304,17 +307,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); } - /* - * If the connection is idle and we are restarting, - * then we don't want to do any Vegas calculations - * until we get fresh RTT samples. So when we - * restart, we reset our Vegas state to a clean - * slate. After we get acks for this flight of - * packets, _then_ we can make Vegas calculations - * again. - */ - if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0) - tcp_vegas_enable(tp); + if (tcp_packets_in_flight(tp) == 0) + tcp_ca_event(tp, CA_EVENT_TX_START); th = (struct tcphdr *) skb_push(skb, tcp_header_size); skb->h.th = th; @@ -521,6 +515,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) * skbs, which it never sent before. --ANK */ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; + buff->stamp = skb->stamp; if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { tp->lost_out -= tcp_skb_pcount(skb); @@ -1449,7 +1444,6 @@ static inline void tcp_connect_init(struct sock *sk) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); tp->advmss = dst_metric(dst, RTAX_ADVMSS); tcp_initialize_rcv_mss(sk); - tcp_ca_init(tp); tcp_select_initial_window(tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? 
tp->tcp_header_len - sizeof(struct tcphdr) : 0), @@ -1503,7 +1497,6 @@ int tcp_connect(struct sock *sk) TCP_SKB_CB(buff)->end_seq = tp->write_seq; tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; - tcp_ca_init(tp); /* Send it off. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2414937f2a83..fce56039b0e9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2025,7 +2025,7 @@ static int tcp_v6_init_sock(struct sock *sk) sk->sk_state = TCP_CLOSE; tp->af_specific = &ipv6_specific; - + tp->ca_ops = &tcp_reno; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); From 7c99c909fa69a183c1b80bd64fb9f0d11459aff3 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:20:36 -0700 Subject: [PATCH 183/199] [TCP]: Change tcp_diag to use the existing __RTA_PUT() macro. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_diag.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 867acc0f79d8..a4e512036d88 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -43,13 +43,7 @@ struct tcpdiag_entry static struct sock *tcpnl; #define TCPDIAG_PUT(skb, attrtype, attrlen) \ -({ int rtalen = RTA_LENGTH(attrlen); \ - struct rtattr *rta; \ - if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \ - rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \ - rta->rta_type = attrtype; \ - rta->rta_len = rtalen; \ - RTA_DATA(rta); }) + RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, int ext, u32 pid, u32 seq, u16 nlmsg_flags) @@ -167,6 +161,7 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, nlh->nlmsg_len = skb->tail - b; return skb->len; +rtattr_failure: nlmsg_failure: skb_trim(skb, b - skb->data); return -1; From 056ede6cface66b400cd3b8e60ed077cc5b85c18 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:21:28 -0700 Subject: [PATCH 184/199] [TCP]: Report congestion control algorithm in tcp_diag. Enhancement to the tcp_diag interface used by the iproute2 ss command to report the tcp congestion control being used by a socket. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- include/linux/tcp_diag.h | 4 ++-- net/ipv4/tcp_diag.c | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/tcp_diag.h b/include/linux/tcp_diag.h index ceee962e1d15..7a5996743946 100644 --- a/include/linux/tcp_diag.h +++ b/include/linux/tcp_diag.h @@ -99,9 +99,10 @@ enum TCPDIAG_MEMINFO, TCPDIAG_INFO, TCPDIAG_VEGASINFO, + TCPDIAG_CONG, }; -#define TCPDIAG_MAX TCPDIAG_VEGASINFO +#define TCPDIAG_MAX TCPDIAG_CONG /* TCPDIAG_MEM */ @@ -123,5 +124,4 @@ struct tcpvegas_info { __u32 tcpv_minrtt; }; - #endif /* _TCP_DIAG_H_ */ diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index a4e512036d88..f66945cb158f 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -65,6 +65,11 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (ext & (1<<(TCPDIAG_INFO-1))) info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); + if (ext & (1<<(TCPDIAG_CONG-1))) { + size_t len = strlen(tp->ca_ops->name); + strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1), + tp->ca_ops->name); + } } r->tcpdiag_family = sk->sk_family; r->tcpdiag_state = sk->sk_state; From 9d7bcfc6b8586ee5a52043f061e0411965e71b88 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:22:36 -0700 Subject: [PATCH 185/199] [TCP]: Update sysctl and congestion control documentation. Update the documentation to remove the old sysctl values and include the new congestion control infrastructure. Includes changes to tcp.txt by Ian McDonald. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 56 ++------------------- Documentation/networking/tcp.txt | 69 +++++++++++++++++++++++++- 2 files changed, 73 insertions(+), 52 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index a2c893a7475d..ab65714d95fc 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -304,57 +304,6 @@ tcp_low_latency - BOOLEAN changed would be a Beowulf compute cluster. Default: 0 -tcp_westwood - BOOLEAN - Enable TCP Westwood+ congestion control algorithm. - TCP Westwood+ is a sender-side only modification of the TCP Reno - protocol stack that optimizes the performance of TCP congestion - control. It is based on end-to-end bandwidth estimation to set - congestion window and slow start threshold after a congestion - episode. Using this estimation, TCP Westwood+ adaptively sets a - slow start threshold and a congestion window which takes into - account the bandwidth used at the time congestion is experienced. - TCP Westwood+ significantly increases fairness wrt TCP Reno in - wired networks and throughput over wireless links. - Default: 0 - -tcp_vegas_cong_avoid - BOOLEAN - Enable TCP Vegas congestion avoidance algorithm. - TCP Vegas is a sender-side only change to TCP that anticipates - the onset of congestion by estimating the bandwidth. TCP Vegas - adjusts the sending rate by modifying the congestion - window. TCP Vegas should provide less packet loss, but it is - not as aggressive as TCP Reno. - Default:0 - -tcp_bic - BOOLEAN - Enable BIC TCP congestion control algorithm. - BIC-TCP is a sender-side only change that ensures a linear RTT - fairness under large windows while offering both scalability and - bounded TCP-friendliness. The protocol combines two schemes - called additive increase and binary search increase. 
When the
-	congestion window is large, additive increase with a large
-	increment ensures linear RTT fairness as well as good
-	scalability. Under small congestion windows, binary search
-	increase provides TCP friendliness.
-	Default: 0
-
-tcp_bic_low_window - INTEGER
-	Sets the threshold window (in packets) where BIC TCP starts to
-	adjust the congestion window. Below this threshold BIC TCP behaves
-	the same as the default TCP Reno.
-	Default: 14
-
-tcp_bic_fast_convergence - BOOLEAN
-	Forces BIC TCP to more quickly respond to changes in congestion
-	window. Allows two flows sharing the same connection to converge
-	more rapidly.
-	Default: 1
-
-tcp_default_win_scale - INTEGER
-	Sets the minimum window scale TCP will negotiate for on all
-	conections.
-	Default: 7
-
 tcp_tso_win_divisor - INTEGER
	This allows control over what percentage of the congestion window
	can be consumed by a single TSO frame.
@@ -368,6 +317,11 @@ tcp_frto - BOOLEAN
	where packet loss is typically due to random radio interference
	rather than intermediate router congestion.

+tcp_congestion_control - STRING
+	Set the congestion control algorithm to be used for new
+	connections. The algorithm "reno" is always available, but
+	additional choices may be available based on kernel configuration.
+
 somaxconn - INTEGER
	Limit of socket listen() backlog, known in userspace as SOMAXCONN.
	Defaults to 128. See also tcp_max_syn_backlog for additional tuning

diff --git a/Documentation/networking/tcp.txt b/Documentation/networking/tcp.txt
index 71749007091e..0fa300425575 100644
--- a/Documentation/networking/tcp.txt
+++ b/Documentation/networking/tcp.txt
@@ -1,5 +1,72 @@
-How the new TCP output machine [nyi] works.
+TCP protocol
+============
+
+Last updated: 21 June 2005
+
+Contents
+========
+
+- Congestion control
+- How the new TCP output machine [nyi] works
+
+Congestion control
+==================
+
+The following variables are used in the tcp_sock for congestion control:
+snd_cwnd	The size of the congestion window
+snd_ssthresh	Slow start threshold. We are in slow start if
+		snd_cwnd is less than this.
+snd_cwnd_cnt	A counter used to slow down the rate of increase
+		once we exceed slow start threshold.
+snd_cwnd_clamp	This is the maximum size that snd_cwnd can grow to.
+snd_cwnd_stamp	Timestamp for when congestion window last validated.
+snd_cwnd_used	Used as a highwater mark for how much of the
+		congestion window is in use. It is used to adjust
+		snd_cwnd down when the link is limited by the
+		application rather than the network.
+
+As of 2.6.13, Linux supports pluggable congestion control algorithms.
+A congestion control mechanism can be registered through functions in
+tcp_cong.c. The functions used by the congestion control mechanism are
+registered by passing a tcp_congestion_ops struct to
+tcp_register_congestion_control, as in the sketch below. As a minimum,
+name, ssthresh, cong_avoid and min_cwnd must be valid.
+Private data for a congestion control mechanism is stored in tp->ca_priv.
+tcp_ca(tp) returns a pointer to this space. This is preallocated space, so
+it is important to check that the size of your private data will fit it;
+alternatively, space could be allocated elsewhere and a pointer to it
+stored here.
+
+There are three kinds of congestion control algorithms currently: the
+simplest ones are derived from TCP Reno (highspeed, scalable) and just
+provide an alternative to the congestion window calculation. More complex
+ones like BIC try to look at other events to provide better
+heuristics.
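A minimal registration, sticking to the required hooks and reusing the
exported Reno helpers, might look like this sketch ("tcp_dummy" is an
invented name for illustration, not part of the kernel tree):

	#include <linux/module.h>
	#include <net/tcp.h>

	static struct tcp_congestion_ops tcp_dummy = {
		.name		= "dummy",
		.owner		= THIS_MODULE,
		.ssthresh	= tcp_reno_ssthresh,
		.cong_avoid	= tcp_reno_cong_avoid,
		.min_cwnd	= tcp_reno_min_cwnd,
	};

	static int __init tcp_dummy_register(void)
	{
		return tcp_register_congestion_control(&tcp_dummy);
	}

	static void __exit tcp_dummy_unregister(void)
	{
		tcp_unregister_congestion_control(&tcp_dummy);
	}

	module_init(tcp_dummy_register);
	module_exit(tcp_dummy_unregister);
	MODULE_LICENSE("GPL");

This mirrors the structure of the real modules added later in this series
(tcp_bic.c, tcp_westwood.c); behaviourally it is plain Reno.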
There are also round trip time based algorithms like
+Vegas and Westwood+.
+
+Good TCP congestion control is a complex problem because the algorithm
+needs to maintain fairness and performance. Please review current
+research and RFCs before developing new modules.
+
+Which congestion control mechanism is used for a new connection is
+determined by the setting of the sysctl net.ipv4.tcp_congestion_control.
+The default congestion control will be the last one registered (LIFO);
+so if you built everything as modules, the default will be reno. If you
+build with the defaults from Kconfig, then BIC will be builtin (not a
+module) and it will end up the default.
+
+If you really want a particular default value then you will need
+to set it with the sysctl. If you use a sysctl, the module will be autoloaded
+if needed and you will get the expected protocol. If you ask for an
+unknown congestion method, then the sysctl attempt will fail.
+
+If you remove a TCP congestion control module, then you will get the next
+available one. Since reno cannot be built as a module, and cannot be
+deleted, it will always be available.
+
+How the new TCP output machine [nyi] works.
+===========================================

 Data is kept on a single queue. The skb->users flag tells us if
 the frame is one that has been queued already. To add a frame we
 throw it on the end. Ack

From 83803034f4233d810c4adc52008921da060c55d1 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger
Date: Thu, 23 Jun 2005 12:23:25 -0700
Subject: [PATCH 186/199] [TCP]: Add TCP BIC congestion control module.

TCP BIC congestion control reworked to use the new congestion control
infrastructure. This version is more up to date than the BIC code in
2.6.12; it incorporates enhancements from BICTCP 1.1, to handle low
latency links.

Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller
---
 net/ipv4/Kconfig   |  21 +++
 net/ipv4/Makefile  |   1 +
 net/ipv4/tcp_bic.c | 331 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 353 insertions(+)
 create mode 100644 net/ipv4/tcp_bic.c

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 567b03b1c349..712ebacacb62 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -433,5 +433,26 @@ config IP_TCPDIAG
 config IP_TCPDIAG_IPV6
	def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)

+# TCP Reno is builtin (required as fallback)
+menu "TCP congestion control"
+	depends on INET
+
+config TCP_CONG_BIC
+	tristate "Binary Increase Congestion (BIC) control"
+	depends on INET
+	default y
+	---help---
+	BIC-TCP is a sender-side only change that ensures a linear RTT
+	fairness under large windows while offering both scalability and
+	bounded TCP-friendliness. The protocol combines two schemes
+	called additive increase and binary search increase. When the
+	congestion window is large, additive increase with a large
+	increment ensures linear RTT fairness as well as good
+	scalability. Under small congestion windows, binary search
+	increase provides TCP friendliness.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ + +endmenu + source "net/ipv4/ipvs/Kconfig" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 89c0b4cb470e..1d1cac5ac06a 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -31,6 +31,7 @@ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IP_VS) += ipvs/ obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o +obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c new file mode 100644 index 000000000000..ec38d45d6649 --- /dev/null +++ b/net/ipv4/tcp_bic.c @@ -0,0 +1,331 @@ +/* + * Binary Increase Congestion control for TCP + * + * This is from the implementation of BICTCP in + * Lison-Xu, Kahaled Harfoush, and Injong Rhee. + * "Binary Increase Congestion Control for Fast, Long Distance + * Networks" in InfoComm 2004 + * Available from: + * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf + * + * Unless BIC is enabled and congestion window is large + * this behaves the same as the original Reno. + */ + +#include +#include +#include +#include + + +#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation + * max_cwnd = snd_cwnd * beta + */ +#define BICTCP_B 4 /* + * In binary search, + * go to point (max+min)/N + */ + +static int fast_convergence = 1; +static int max_increment = 32; +static int low_window = 14; +static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ +static int low_utilization_threshold = 153; +static int low_utilization_period = 2; +static int initial_ssthresh = 100; +static int smooth_part = 20; + +module_param(fast_convergence, int, 0644); +MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); +module_param(max_increment, int, 0644); +MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search"); +module_param(low_window, int, 0644); +MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); +module_param(beta, int, 0644); +MODULE_PARM_DESC(beta, "beta for multiplicative increase"); +module_param(low_utilization_threshold, int, 0644); +MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode"); +module_param(low_utilization_period, int, 0644); +MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)"); +module_param(initial_ssthresh, int, 0644); +MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); +module_param(smooth_part, int, 0644); +MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax"); + + +/* BIC TCP Parameters */ +struct bictcp { + u32 cnt; /* increase cwnd by 1 after ACKs */ + u32 last_max_cwnd; /* last maximum snd_cwnd */ + u32 loss_cwnd; /* congestion window at last loss */ + u32 last_cwnd; /* the last snd_cwnd */ + u32 last_time; /* time when updated last_cwnd */ + u32 delay_min; /* min delay */ + u32 delay_max; /* max delay */ + u32 last_delay; + u8 low_utilization;/* 0: high; 1: low */ + u32 low_utilization_start; /* starting time of low utilization detection*/ + u32 epoch_start; /* beginning of an epoch */ +#define ACK_RATIO_SHIFT 4 + u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ +}; + +static inline void bictcp_reset(struct bictcp *ca) +{ + ca->cnt = 0; + ca->last_max_cwnd = 0; + ca->loss_cwnd = 0; + ca->last_cwnd = 0; + ca->last_time = 0; + ca->delay_min = 0; + ca->delay_max = 0; 
+	ca->last_delay = 0;
+	ca->low_utilization = 0;
+	ca->low_utilization_start = 0;
+	ca->epoch_start = 0;
+	ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
+}
+
+static void bictcp_init(struct tcp_sock *tp)
+{
+	bictcp_reset(tcp_ca(tp));
+	if (initial_ssthresh)
+		tp->snd_ssthresh = initial_ssthresh;
+}
+
+/*
+ * Compute congestion window to use.
+ */
+static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
+{
+	if (ca->last_cwnd == cwnd &&
+	    (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
+		return;
+
+	ca->last_cwnd = cwnd;
+	ca->last_time = tcp_time_stamp;
+
+	if (ca->epoch_start == 0) /* record the beginning of an epoch */
+		ca->epoch_start = tcp_time_stamp;
+
+	/* start off normal */
+	if (cwnd <= low_window) {
+		ca->cnt = cwnd;
+		return;
+	}
+
+	/* binary increase */
+	if (cwnd < ca->last_max_cwnd) {
+		__u32 dist = (ca->last_max_cwnd - cwnd)
+			/ BICTCP_B;
+
+		if (dist > max_increment)
+			/* linear increase */
+			ca->cnt = cwnd / max_increment;
+		else if (dist <= 1U)
+			/* binary search increase */
+			ca->cnt = (cwnd * smooth_part) / BICTCP_B;
+		else
+			/* binary search increase */
+			ca->cnt = cwnd / dist;
+	} else {
+		/* slow start and linear increase */
+		if (cwnd < ca->last_max_cwnd + BICTCP_B)
+			/* slow start */
+			ca->cnt = (cwnd * smooth_part) / BICTCP_B;
+		else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1))
+			/* slow start */
+			ca->cnt = (cwnd * (BICTCP_B-1))
+				/ (cwnd - ca->last_max_cwnd);
+		else
+			/* linear increase */
+			ca->cnt = cwnd / max_increment;
+	}
+
+	/* if in slow start or link utilization is very low */
+	if (ca->loss_cwnd == 0 ||
+	    (cwnd > ca->loss_cwnd && ca->low_utilization)) {
+		if (ca->cnt > 20) /* increase cwnd 5% per RTT */
+			ca->cnt = 20;
+	}
+
+	ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
+	if (ca->cnt == 0) /* cannot be zero */
+		ca->cnt = 1;
+}
+
+
+/* Detect low utilization in congestion avoidance */
+static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag)
+{
+	struct bictcp *ca = tcp_ca(tp);
+	u32 dist, delay;
+
+	/* No time stamp */
+	if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
+	    /* Discard delay samples right after fast recovery */
+	    tcp_time_stamp < ca->epoch_start + HZ ||
+	    /* these delay samples may not be accurate */
+	    flag == 0) {
+		ca->last_delay = 0;
+		goto notlow;
+	}
+
+	delay = ca->last_delay<<3;	/* use the same scale as tp->srtt */
+	ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
+	if (delay == 0)			/* no previous delay sample */
+		goto notlow;
+
+	/* first time call or link delay decreases */
+	if (ca->delay_min == 0 || ca->delay_min > delay) {
+		ca->delay_min = ca->delay_max = delay;
+		goto notlow;
+	}
+
+	if (ca->delay_max < delay)
+		ca->delay_max = delay;
+
+	/* utilization is low, if avg delay < dist*threshold
+	   for checking_period time */
+	dist = ca->delay_max - ca->delay_min;
+	if (dist <= ca->delay_min>>6 ||
+	    tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10)
+		goto notlow;
+
+	if (ca->low_utilization_start == 0) {
+		ca->low_utilization = 0;
+		ca->low_utilization_start = tcp_time_stamp;
+	} else if ((s32)(tcp_time_stamp - ca->low_utilization_start)
+			> low_utilization_period*HZ) {
+		ca->low_utilization = 1;
+	}
+
+	return;
+
+ notlow:
+	ca->low_utilization = 0;
+	ca->low_utilization_start = 0;
+
+}
+
+static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack,
+			      u32 seq_rtt, u32 in_flight, int data_acked)
+{
+	struct bictcp *ca = tcp_ca(tp);
+
+	bictcp_low_utilization(tp, data_acked);
+
+	if (in_flight < tp->snd_cwnd)
+		return;
+
+	if (tp->snd_cwnd <=
tp->snd_ssthresh) { + /* In "safe" area, increase. */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + bictcp_update(ca, tp->snd_cwnd); + + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= ca->cnt) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } + +} + +/* + * behave like Reno until low_window is reached, + * then increase congestion window slowly + */ +static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp) +{ + struct bictcp *ca = tcp_ca(tp); + + ca->epoch_start = 0; /* end of epoch */ + + /* in case of wrong delay_max*/ + if (ca->delay_min > 0 && ca->delay_max > ca->delay_min) + ca->delay_max = ca->delay_min + + ((ca->delay_max - ca->delay_min)* 90) / 100; + + /* Wmax and fast convergence */ + if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) + ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) + / (2 * BICTCP_BETA_SCALE); + else + ca->last_max_cwnd = tp->snd_cwnd; + + ca->loss_cwnd = tp->snd_cwnd; + + + if (tp->snd_cwnd <= low_window) + return max(tp->snd_cwnd >> 1U, 2U); + else + return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); +} + +static u32 bictcp_undo_cwnd(struct tcp_sock *tp) +{ + struct bictcp *ca = tcp_ca(tp); + + return max(tp->snd_cwnd, ca->last_max_cwnd); +} + +static u32 bictcp_min_cwnd(struct tcp_sock *tp) +{ + return tp->snd_ssthresh; +} + +static void bictcp_state(struct tcp_sock *tp, u8 new_state) +{ + if (new_state == TCP_CA_Loss) + bictcp_reset(tcp_ca(tp)); +} + +/* Track delayed acknowledgement ratio using sliding window + * ratio = (15*ratio + sample) / 16 + */ +static void bictcp_acked(struct tcp_sock *tp, u32 cnt) +{ + if (cnt > 0 && tp->ca_state == TCP_CA_Open) { + struct bictcp *ca = tcp_ca(tp); + cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; + ca->delayed_ack += cnt; + } +} + + +static struct tcp_congestion_ops bictcp = { + .init = bictcp_init, + .ssthresh = bictcp_recalc_ssthresh, + .cong_avoid = bictcp_cong_avoid, + .set_state = bictcp_state, + .undo_cwnd = bictcp_undo_cwnd, + .min_cwnd = bictcp_min_cwnd, + .pkts_acked = bictcp_acked, + .owner = THIS_MODULE, + .name = "bic", +}; + +static int __init bictcp_register(void) +{ + BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE); + return tcp_register_congestion_control(&bictcp); +} + +static void __exit bictcp_unregister(void) +{ + tcp_unregister_congestion_control(&bictcp); +} + +module_init(bictcp_register); +module_exit(bictcp_unregister); + +MODULE_AUTHOR("Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("BIC TCP"); From 8727076289ec55298a05cabddf02b374d13c1624 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:24:09 -0700 Subject: [PATCH 187/199] [TCP]: Add TCP Westwood congestion control module. This is the existing 2.6.12 Westwood code moved from tcp_input to the new congestion framework. A lot of the inline functions have been eliminated to try and make it clearer. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 15 +++ net/ipv4/Makefile | 1 + net/ipv4/tcp_westwood.c | 259 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 275 insertions(+) create mode 100644 net/ipv4/tcp_westwood.c diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 712ebacacb62..adbe855d931a 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -452,6 +452,21 @@ config TCP_CONG_BIC increase provides TCP friendliness. 
See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ +config TCP_CONG_WESTWOOD + tristate "TCP Westwood+" + depends on INET + default m + ---help--- + TCP Westwood+ is a sender-side only modification of the TCP Reno + protocol stack that optimizes the performance of TCP congestion + control. It is based on end-to-end bandwidth estimation to set + congestion window and slow start threshold after a congestion + episode. Using this estimation, TCP Westwood+ adaptively sets a + slow start threshold and a congestion window which takes into + account the bandwidth used at the time congestion is experienced. + TCP Westwood+ significantly increases fairness wrt TCP Reno in + wired networks and throughput over wireless links. + endmenu source "net/ipv4/ipvs/Kconfig" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 1d1cac5ac06a..dedfbe62a104 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -32,6 +32,7 @@ obj-$(CONFIG_IP_VS) += ipvs/ obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o +obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c new file mode 100644 index 000000000000..ef827242c940 --- /dev/null +++ b/net/ipv4/tcp_westwood.c @@ -0,0 +1,259 @@ +/* + * TCP Westwood+ + * + * Angelo Dell'Aera: TCP Westwood+ support + */ + +#include +#include +#include +#include +#include +#include + +/* TCP Westwood structure */ +struct westwood { + u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ + u32 bw_est; /* bandwidth estimate */ + u32 rtt_win_sx; /* here starts a new evaluation... */ + u32 bk; + u32 snd_una; /* used for evaluating the number of acked bytes */ + u32 cumul_ack; + u32 accounted; + u32 rtt; + u32 rtt_min; /* minimum observed RTT */ +}; + + +/* TCP Westwood functions and constants */ +#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ +#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ + +/* + * @tcp_westwood_create + * This function initializes fields used in TCP Westwood+, + * it is called after the initial SYN, so the sequence numbers + * are correct but new passive connections we have no + * information about RTTmin at this time so we simply set it to + * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative + * since in this way we're sure it will be updated in a consistent + * way as soon as possible. It will reasonably happen within the first + * RTT period of the connection lifetime. + */ +static void tcp_westwood_init(struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + + w->bk = 0; + w->bw_ns_est = 0; + w->bw_est = 0; + w->accounted = 0; + w->cumul_ack = 0; + w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; + w->rtt_win_sx = tcp_time_stamp; + w->snd_una = tp->snd_una; +} + +/* + * @westwood_do_filter + * Low-pass filter. Implemented using constant coefficients. + */ +static inline u32 westwood_do_filter(u32 a, u32 b) +{ + return (((7 * a) + b) >> 3); +} + +static inline void westwood_filter(struct westwood *w, u32 delta) +{ + w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); + w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); +} + +/* + * @westwood_pkts_acked + * Called after processing group of packets. + * but all westwood needs is the last sample of srtt. 
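/*
 * Illustrative aside (sample stream and units invented, not kernel
 * code): westwood_do_filter() above is the classic 7/8 exponential
 * average.  A quick standalone check of its convergence:
 */
#include <stdio.h>

static unsigned int do_filter(unsigned int a, unsigned int b)
{
	return ((7 * a) + b) >> 3;
}

int main(void)
{
	unsigned int est = 0, sample = 800;	/* say, segments/sec */
	int i;

	for (i = 1; i <= 32; i++) {
		est = do_filter(est, sample);
		if (i == 8 || i == 16 || i == 32)
			printf("after %2d samples: est = %u\n", i, est);
	}
	/* prints 523, 702, 786: a few dozen RTT windows are needed to
	 * approach a steady input, which keeps the bandwidth estimate
	 * deliberately smooth.
	 */
	return 0;
}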
+ */ +static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt) +{ + struct westwood *w = tcp_ca(tp); + if (cnt > 0) + w->rtt = tp->srtt >> 3; +} + +/* + * @westwood_update_window + * It updates RTT evaluation window if it is the right moment to do + * it. If so it calls filter for evaluating bandwidth. + */ +static void westwood_update_window(struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + s32 delta = tcp_time_stamp - w->rtt_win_sx; + + /* + * See if a RTT-window has passed. + * Be careful since if RTT is less than + * 50ms we don't filter but we continue 'building the sample'. + * This minimum limit was chosen since an estimation on small + * time intervals is better to avoid... + * Obviously on a LAN we reasonably will always have + * right_bound = left_bound + WESTWOOD_RTT_MIN + */ + if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) { + westwood_filter(w, delta); + + w->bk = 0; + w->rtt_win_sx = tcp_time_stamp; + } +} + +/* + * @westwood_fast_bw + * It is called when we are in fast path. In particular it is called when + * header prediction is successful. In such case in fact update is + * straight forward and doesn't need any particular care. + */ +static inline void westwood_fast_bw(struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + + westwood_update_window(tp); + + w->bk += tp->snd_una - w->snd_una; + w->snd_una = tp->snd_una; + w->rtt_min = min(w->rtt, w->rtt_min); +} + +/* + * @westwood_acked_count + * This function evaluates cumul_ack for evaluating bk in case of + * delayed or partial acks. + */ +static inline u32 westwood_acked_count(struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + + w->cumul_ack = tp->snd_una - w->snd_una; + + /* If cumul_ack is 0 this is a dupack since it's not moving + * tp->snd_una. + */ + if (!w->cumul_ack) { + w->accounted += tp->mss_cache; + w->cumul_ack = tp->mss_cache; + } + + if (w->cumul_ack > tp->mss_cache) { + /* Partial or delayed ack */ + if (w->accounted >= w->cumul_ack) { + w->accounted -= w->cumul_ack; + w->cumul_ack = tp->mss_cache; + } else { + w->cumul_ack -= w->accounted; + w->accounted = 0; + } + } + + w->snd_una = tp->snd_una; + + return w->cumul_ack; +} + +static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); +} + +/* + * TCP Westwood + * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it + * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 + * so avoids ever returning 0. + */ +static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp) +{ + return westwood_bw_rttmin(tp); +} + +static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event) +{ + struct westwood *w = tcp_ca(tp); + + switch(event) { + case CA_EVENT_FAST_ACK: + westwood_fast_bw(tp); + break; + + case CA_EVENT_COMPLETE_CWR: + tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp); + break; + + case CA_EVENT_FRTO: + tp->snd_ssthresh = westwood_bw_rttmin(tp); + break; + + case CA_EVENT_SLOW_ACK: + westwood_update_window(tp); + w->bk += westwood_acked_count(tp); + w->rtt_min = min(w->rtt, w->rtt_min); + break; + + default: + /* don't care */ + break; + } +} + + +/* Extract info for Tcp socket info provided via netlink. 
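/*
 * Illustrative aside (invented link numbers, not kernel code):
 * the arithmetic behind westwood_bw_rttmin() above.  Expressing
 * bandwidth x RTTmin in segments gives the window that just fills the
 * pipe, which is why it is used for ssthresh (and cwnd) after a
 * congestion episode.
 */
#include <stdio.h>

int main(void)
{
	unsigned int bw_est = 1250;	/* bytes per ms, ~10 Mbit/s */
	unsigned int rtt_min = 40;	/* ms */
	unsigned int mss = 1460;
	unsigned int win = (bw_est * rtt_min) / mss;

	if (win < 2)			/* same floor as the kernel code */
		win = 2;
	printf("bw * RTTmin = %u segments\n", win);	/* 34 */
	return 0;
}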
+
+/* Extract info for Tcp socket info provided via netlink. */
+static void tcp_westwood_info(struct tcp_sock *tp, u32 ext,
+			      struct sk_buff *skb)
+{
+	const struct westwood *ca = tcp_ca(tp);
+	if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
+		struct rtattr *rta;
+		struct tcpvegas_info *info;
+
+		rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info));
+		info = RTA_DATA(rta);
+		info->tcpv_enabled = 1;
+		info->tcpv_rttcnt = 0;
+		info->tcpv_rtt = jiffies_to_usecs(ca->rtt);
+		info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min);
+	rtattr_failure: ;
+	}
+}
+
+
+static struct tcp_congestion_ops tcp_westwood = {
+	.init		= tcp_westwood_init,
+	.ssthresh	= tcp_reno_ssthresh,
+	.cong_avoid	= tcp_reno_cong_avoid,
+	.min_cwnd	= tcp_westwood_cwnd_min,
+	.cwnd_event	= tcp_westwood_event,
+	.get_info	= tcp_westwood_info,
+	.pkts_acked	= tcp_westwood_pkts_acked,
+
+	.owner		= THIS_MODULE,
+	.name		= "westwood"
+};
+
+static int __init tcp_westwood_register(void)
+{
+	BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&tcp_westwood);
+}
+
+static void __exit tcp_westwood_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_westwood);
+}
+
+module_init(tcp_westwood_register);
+module_exit(tcp_westwood_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Westwood+");

From a628d29b56d3f420bf3ff1d7543a9caf3ce3b994 Mon Sep 17 00:00:00 2001
From: John Heffner
Date: Thu, 23 Jun 2005 12:24:58 -0700
Subject: [PATCH 188/199] [TCP]: Add High Speed TCP congestion control module.

Sally Floyd's high speed TCP congestion control.  This is useful
for comparison and research.

Signed-off-by: John Heffner
Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller
---
 net/ipv4/Kconfig         |  11 +++
 net/ipv4/Makefile        |   1 +
 net/ipv4/tcp_highspeed.c | 181 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 193 insertions(+)
 create mode 100644 net/ipv4/tcp_highspeed.c

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index adbe855d931a..910e25b65756 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -467,6 +467,17 @@ config TCP_CONG_WESTWOOD
 	TCP Westwood+ significantly increases fairness wrt TCP Reno in
 	wired networks and throughput over wireless links.
 
+config TCP_CONG_HSTCP
+	tristate "High Speed TCP"
+	depends on INET && EXPERIMENTAL
+	default n
+	---help---
+	Sally Floyd's High Speed TCP (RFC 3649) congestion control.
+	A modification to TCP's congestion control mechanism for use
+	with large congestion windows. A table indicates how much to
+	increase the congestion window by when an ACK is received.
+	For more detail see http://www.icir.org/floyd/hstcp.html
+
 endmenu
 
 source "net/ipv4/ipvs/Kconfig"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index dedfbe62a104..ddf5d7785bf4 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
+obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 	 xfrm4_output.o
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
new file mode 100644
index 000000000000..36c51f8136bf
--- /dev/null
+++ b/net/ipv4/tcp_highspeed.c
@@ -0,0 +1,181 @@
+/*
+ * Sally Floyd's High Speed TCP (RFC 3649) congestion control
+ *
+ * See http://www.icir.org/floyd/hstcp.html
+ *
+ * John Heffner
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+
+/* From the AIMD tables in RFC 3649 appendix B,
+ * with fixed-point MD scaled <<8.
+ */
+static const struct hstcp_aimd_val {
+	unsigned int	cwnd;
+	unsigned int	md;
+} hstcp_aimd_vals[] = {
+	{ 38, 128, /* 0.50 */ },
+	{ 118, 112, /* 0.44 */ },
+	{ 221, 104, /* 0.41 */ },
+	{ 347, 98, /* 0.38 */ },
+	{ 495, 93, /* 0.37 */ },
+	{ 663, 89, /* 0.35 */ },
+	{ 851, 86, /* 0.34 */ },
+	{ 1058, 83, /* 0.33 */ },
+	{ 1284, 81, /* 0.32 */ },
+	{ 1529, 78, /* 0.31 */ },
+	{ 1793, 76, /* 0.30 */ },
+	{ 2076, 74, /* 0.29 */ },
+	{ 2378, 72, /* 0.28 */ },
+	{ 2699, 71, /* 0.28 */ },
+	{ 3039, 69, /* 0.27 */ },
+	{ 3399, 68, /* 0.27 */ },
+	{ 3778, 66, /* 0.26 */ },
+	{ 4177, 65, /* 0.26 */ },
+	{ 4596, 64, /* 0.25 */ },
+	{ 5036, 62, /* 0.25 */ },
+	{ 5497, 61, /* 0.24 */ },
+	{ 5979, 60, /* 0.24 */ },
+	{ 6483, 59, /* 0.23 */ },
+	{ 7009, 58, /* 0.23 */ },
+	{ 7558, 57, /* 0.22 */ },
+	{ 8130, 56, /* 0.22 */ },
+	{ 8726, 55, /* 0.22 */ },
+	{ 9346, 54, /* 0.21 */ },
+	{ 9991, 53, /* 0.21 */ },
+	{ 10661, 52, /* 0.21 */ },
+	{ 11358, 52, /* 0.20 */ },
+	{ 12082, 51, /* 0.20 */ },
+	{ 12834, 50, /* 0.20 */ },
+	{ 13614, 49, /* 0.19 */ },
+	{ 14424, 48, /* 0.19 */ },
+	{ 15265, 48, /* 0.19 */ },
+	{ 16137, 47, /* 0.19 */ },
+	{ 17042, 46, /* 0.18 */ },
+	{ 17981, 45, /* 0.18 */ },
+	{ 18955, 45, /* 0.18 */ },
+	{ 19965, 44, /* 0.17 */ },
+	{ 21013, 43, /* 0.17 */ },
+	{ 22101, 43, /* 0.17 */ },
+	{ 23230, 42, /* 0.17 */ },
+	{ 24402, 41, /* 0.16 */ },
+	{ 25618, 41, /* 0.16 */ },
+	{ 26881, 40, /* 0.16 */ },
+	{ 28193, 39, /* 0.16 */ },
+	{ 29557, 39, /* 0.15 */ },
+	{ 30975, 38, /* 0.15 */ },
+	{ 32450, 38, /* 0.15 */ },
+	{ 33986, 37, /* 0.15 */ },
+	{ 35586, 36, /* 0.14 */ },
+	{ 37253, 36, /* 0.14 */ },
+	{ 38992, 35, /* 0.14 */ },
+	{ 40808, 35, /* 0.14 */ },
+	{ 42707, 34, /* 0.13 */ },
+	{ 44694, 33, /* 0.13 */ },
+	{ 46776, 33, /* 0.13 */ },
+	{ 48961, 32, /* 0.13 */ },
+	{ 51258, 32, /* 0.13 */ },
+	{ 53677, 31, /* 0.12 */ },
+	{ 56230, 30, /* 0.12 */ },
+	{ 58932, 30, /* 0.12 */ },
+	{ 61799, 29, /* 0.12 */ },
+	{ 64851, 28, /* 0.11 */ },
+	{ 68113, 28, /* 0.11 */ },
+	{ 71617, 27, /* 0.11 */ },
+	{ 75401, 26, /* 0.10 */ },
+	{ 79517, 26, /* 0.10 */ },
+	{ 84035, 25, /* 0.10 */ },
+	{ 89053, 24, /* 0.10 */ },
+};
+
+#define HSTCP_AIMD_MAX	ARRAY_SIZE(hstcp_aimd_vals)
+
+struct hstcp {
+	u32	ai;
+};
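To make the table concrete (a hedged userspace sketch, not part of the patch; the abbreviated table and the cwnd value are illustrative only): hstcp_cong_avoid() below walks the index up to the first row whose cwnd bound is not exceeded, and hstcp_ssthresh() applies that row's fixed-point decrease. For cwnd = 1000 the full table lands on { 1058, 83 }, a back-off of about 32% instead of Reno's 50%.

#include <stdio.h>

struct aimd { unsigned int cwnd, md; };

/* three rows copied from the table above, enough for a demo */
static const struct aimd vals[] = { { 38, 128 }, { 1058, 83 }, { 9346, 54 } };
#define NVALS (sizeof(vals) / sizeof(vals[0]))

int main(void)
{
	unsigned int cwnd = 1000, ai = 0;

	/* walk up the table the way hstcp_cong_avoid() does */
	while (cwnd > vals[ai].cwnd && ai < NVALS - 1)
		ai++;

	/* multiplicative decrease in <<8 fixed point, as in hstcp_ssthresh() */
	printf("ai=%u ssthresh=%u\n", ai, cwnd - ((cwnd * vals[ai].md) >> 8));
	return 0;	/* prints ai=1 ssthresh=676: a ~32% back-off, not 50% */
}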
+
+static void hstcp_init(struct tcp_sock *tp)
+{
+	struct hstcp *ca = tcp_ca(tp);
+
+	ca->ai = 0;
+
+	/* Ensure the MD arithmetic works.  This is somewhat pedantic,
+	 * since I don't think we will see a cwnd this large. :) */
+	tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
+}
+
+static void hstcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+			     u32 in_flight, int good)
+{
+	struct hstcp *ca = tcp_ca(tp);
+
+	if (in_flight < tp->snd_cwnd)
+		return;
+
+	if (tp->snd_cwnd <= tp->snd_ssthresh) {
+		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+			tp->snd_cwnd++;
+	} else {
+		/* Update AIMD parameters */
+		if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
+			while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
+			       ca->ai < HSTCP_AIMD_MAX - 1)
+				ca->ai++;
+		} else if (tp->snd_cwnd < hstcp_aimd_vals[ca->ai].cwnd) {
+			while (tp->snd_cwnd < hstcp_aimd_vals[ca->ai].cwnd &&
+			       ca->ai > 0)
+				ca->ai--;
+		}
+
+		/* Do additive increase */
+		if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
+			tp->snd_cwnd_cnt += ca->ai;
+			if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+				tp->snd_cwnd++;
+				tp->snd_cwnd_cnt -= tp->snd_cwnd;
+			}
+		}
+	}
+}
+
+static u32 hstcp_ssthresh(struct tcp_sock *tp)
+{
+	struct hstcp *ca = tcp_ca(tp);
+
+	/* Do multiplicative decrease */
+	return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
+}
+
+
+static struct tcp_congestion_ops tcp_highspeed = {
+	.init		= hstcp_init,
+	.ssthresh	= hstcp_ssthresh,
+	.cong_avoid	= hstcp_cong_avoid,
+	.min_cwnd	= tcp_reno_min_cwnd,
+
+	.owner		= THIS_MODULE,
+	.name		= "highspeed"
+};
+
+static int __init hstcp_register(void)
+{
+	BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&tcp_highspeed);
+}
+
+static void __exit hstcp_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_highspeed);
+}
+
+module_init(hstcp_register);
+module_exit(hstcp_unregister);
+
+MODULE_AUTHOR("John Heffner");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("High Speed TCP");

From 835b3f0c0d7e1f716c45ec576662eac7a68b8548 Mon Sep 17 00:00:00 2001
From: Daniele Lacamera
Date: Thu, 23 Jun 2005 12:26:34 -0700
Subject: [PATCH 189/199] [TCP]: Add TCP Hybla congestion control module.

TCP Hybla congestion avoidance.

- "In heterogeneous networks, TCP connections that incorporate a
terrestrial or satellite radio link are greatly disadvantaged with
respect to entirely wired connections, because of their longer round
trip times (RTTs). To cope with this problem, a new TCP proposal, the
TCP Hybla, is presented and discussed in the paper[1]. It stems from an
analytical evaluation of the congestion window dynamics in the TCP
standard versions (Tahoe, Reno, NewReno), which suggests the necessary
modifications to remove the performance dependence on RTT.[...]"[1]

[1]: Carlo Caini, Rosario Firrincieli, "TCP Hybla: a TCP enhancement for
heterogeneous networks", International Journal of Satellite
Communications and Networking, Volume 22, Issue 5, Pages 547 - 566,
September 2004.

Signed-off-by: Daniele Lacamera
Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller
---
 net/ipv4/Kconfig     |  10 +++
 net/ipv4/Makefile    |   1 +
 net/ipv4/tcp_hybla.c | 187 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 198 insertions(+)
 create mode 100644 net/ipv4/tcp_hybla.c

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 910e25b65756..516ffe842816 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -478,6 +478,16 @@ config TCP_CONG_HSTCP
 	increase the congestion window by when an ACK is received.
 	For more detail see http://www.icir.org/floyd/hstcp.html
 
+config TCP_CONG_HYBLA
+	tristate "TCP-Hybla congestion control algorithm"
+	depends on INET && EXPERIMENTAL
+	default n
+	---help---
+	TCP-Hybla is a sender-side only change that eliminates penalization of
+	long-RTT, large-bandwidth connections, like when satellite legs are
+	involved, especially when sharing a common bottleneck with normal
+	terrestrial connections.
+
 endmenu
 
 source "net/ipv4/ipvs/Kconfig"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ddf5d7785bf4..ede7a5d617ff 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
 obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
+obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 	 xfrm4_output.o
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
new file mode 100644
index 000000000000..13a66342c304
--- /dev/null
+++ b/net/ipv4/tcp_hybla.c
@@ -0,0 +1,187 @@
+/*
+ * TCP HYBLA
+ *
+ * TCP-HYBLA Congestion control algorithm, based on:
+ *   C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement
+ *   for Heterogeneous Networks",
+ *   International Journal of Satellite Communications,
+ *   September 2004
+ *   Daniele Lacamera
+ *   root at danielinux.net
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* Tcp Hybla structure. */
+struct hybla {
+	u8	hybla_en;
+	u32	snd_cwnd_cents;	/* Keeps increment values when it is <1, <<7 */
+	u32	rho;		/* Rho parameter, integer part */
+	u32	rho2;		/* Rho * Rho, integer part */
+	u32	rho_3ls;	/* Rho parameter, <<3 */
+	u32	rho2_7ls;	/* Rho^2, <<7 */
+	u32	minrtt;		/* Minimum smoothed round trip time value seen */
+};
+
+/* Hybla reference round trip time (default = 1/40 sec = 25 ms),
+   expressed in jiffies */
+static int rtt0 = 25;
+module_param(rtt0, int, 0644);
+MODULE_PARM_DESC(rtt0, "reference round trip time (ms)");
+
+
+/* This is called to refresh values for hybla parameters */
+static inline void hybla_recalc_param(struct tcp_sock *tp)
+{
+	struct hybla *ca = tcp_ca(tp);
+
+	ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8);
+	ca->rho = ca->rho_3ls >> 3;
+	ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
+	ca->rho2 = ca->rho2_7ls >> 7;
+}
+
+static void hybla_init(struct tcp_sock *tp)
+{
+	struct hybla *ca = tcp_ca(tp);
+
+	ca->rho = 0;
+	ca->rho2 = 0;
+	ca->rho_3ls = 0;
+	ca->rho2_7ls = 0;
+	ca->snd_cwnd_cents = 0;
+	ca->hybla_en = 1;
+	tp->snd_cwnd = 2;
+	tp->snd_cwnd_clamp = 65535;
+
+	/* 1st Rho measurement based on initial srtt */
+	hybla_recalc_param(tp);
+
+	/* set minimum rtt as this is the 1st ever seen */
+	ca->minrtt = tp->srtt;
+	tp->snd_cwnd = ca->rho;
+}
+
+static void hybla_state(struct tcp_sock *tp, u8 ca_state)
+{
+	struct hybla *ca = tcp_ca(tp);
+
+	ca->hybla_en = (ca_state == TCP_CA_Open);
+}
+
+static inline u32 hybla_fraction(u32 odds)
+{
+	static const u32 fractions[] = {
+		128, 139, 152, 165, 181, 197, 215, 234,
+	};
+
+	return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128;
+}
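A hedged aside on what that lookup table encodes (userspace sketch, not part of the patch; the srtt and rtt0 figures are made up): fractions[k] is roughly 128 * 2^(k/8), so (1 << rho) * hybla_fraction(frac) rebuilds 2^(rho + frac/8) in <<7 fixed point. With srtt = 200 ms against rtt0 = 25 ms, rho = 8 and slow start credits 2^8 - 1 = 255 segments per ACK, which is exactly the RTT compensation Hybla is after.

#include <stdio.h>

static const unsigned int fractions[] = {
	128, 139, 152, 165, 181, 197, 215, 234,
};

int main(void)
{
	/* hypothetical values: srtt = 200 ms, reference rtt0 = 25 ms */
	unsigned int srtt = 200, rtt0 = 25;
	unsigned int rho_3ls = (8 * srtt) / rtt0;	/* rho << 3, srtt kept <<3 as in the patch */
	unsigned int rho = rho_3ls >> 3;
	unsigned int frac = rho_3ls - (rho << 3);
	unsigned int increment = ((1u << rho) * fractions[frac]) - 128;

	/* INC = 2^rho - 1 whole segments once the <<7 credit is shifted out */
	printf("rho=%u increment=%u segments=%u\n", rho, increment, increment >> 7);
	return 0;	/* prints rho=8 increment=32640 segments=255 */
}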
+/* TCP Hybla main routine.
+ * This is the algorithm behavior:
+ *	o Recalc Hybla parameters if min_rtt has changed
+ *	o Give cwnd a new value based on the model proposed
+ *	o remember increments <1
+ */
+static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+			     u32 in_flight, int flag)
+{
+	struct hybla *ca = tcp_ca(tp);
+	u32 increment, odd, rho_fractions;
+	int is_slowstart = 0;
+
+	/* Recalculate rho only if this srtt is the lowest */
+	if (tp->srtt < ca->minrtt) {
+		hybla_recalc_param(tp);
+		ca->minrtt = tp->srtt;
+	}
+
+	if (!ca->hybla_en)
+		return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag);
+
+	if (in_flight < tp->snd_cwnd)
+		return;
+
+	if (ca->rho == 0)
+		hybla_recalc_param(tp);
+
+	rho_fractions = ca->rho_3ls - (ca->rho << 3);
+
+	if (tp->snd_cwnd < tp->snd_ssthresh) {
+		/*
+		 * slow start
+		 *	INC = 2^RHO - 1
+		 * This is done by splitting the rho parameter
+		 * into 2 parts: an integer part and a fraction part.
+		 * Increment<<7 is estimated by doing:
+		 *	[2^(int+fract)]<<7
+		 * that is equal to:
+		 *	(2^int) * [(2^fract) <<7]
+		 * 2^int is straightly computed as 1<<int, while
+		 * [(2^fract) <<7] is taken from the hybla_fraction()
+		 * lookup table above.
+		 */
+		is_slowstart = 1;
+		increment = ((1 << ca->rho) * hybla_fraction(rho_fractions))
+			- 128;
+	} else {
+		/*
+		 * congestion avoidance
+		 *	INC = RHO^2 / W
+		 * as long as increment is estimated as (rho<<7)/window
+		 * it already is <<7 and we can easily count its fractions.
+		 */
+		increment = ca->rho2_7ls / tp->snd_cwnd;
+		if (increment < 128)
+			tp->snd_cwnd_cnt++;
+	}
+
+	odd = increment % 128;
+	tp->snd_cwnd += increment >> 7;
+	ca->snd_cwnd_cents += odd;
+
+	/* check when fractions go >= 128 and increase cwnd by 1. */
+	while (ca->snd_cwnd_cents >= 128) {
+		tp->snd_cwnd++;
+		ca->snd_cwnd_cents -= 128;
+		tp->snd_cwnd_cnt = 0;
+	}
+
+	/* clamp down slowstart cwnd to ssthresh value. */
+	if (is_slowstart)
+		tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+
+	tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+}
+
+static struct tcp_congestion_ops tcp_hybla = {
+	.init		= hybla_init,
+	.ssthresh	= tcp_reno_ssthresh,
+	.min_cwnd	= tcp_reno_min_cwnd,
+	.cong_avoid	= hybla_cong_avoid,
+	.set_state	= hybla_state,
+
+	.owner		= THIS_MODULE,
+	.name		= "hybla"
+};
+
+static int __init hybla_register(void)
+{
+	BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&tcp_hybla);
+}
+
+static void __exit hybla_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_hybla);
+}
+
+module_init(hybla_register);
+module_exit(hybla_unregister);
+
+MODULE_AUTHOR("Daniele Lacamera");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Hybla");

From b87d8561d8667d221b728ccdcb18eb95b16a687b Mon Sep 17 00:00:00 2001
From: Stephen Hemminger
Date: Thu, 23 Jun 2005 12:27:19 -0700
Subject: [PATCH 190/199] [TCP]: Add TCP Vegas congestion control module.

TCP Vegas code modified for the new TCP infrastructure.
Vegas now uses microsecond resolution timestamps for
better estimation of performance over higher speed links.

Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller
---
 net/ipv4/Kconfig     |  11 ++
 net/ipv4/Makefile    |   1 +
 net/ipv4/tcp_vegas.c | 411 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 423 insertions(+)
 create mode 100644 net/ipv4/tcp_vegas.c

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 516ffe842816..6c105b60cc00 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -488,6 +488,17 @@ config TCP_CONG_HYBLA
 	involved, especially when sharing a common bottleneck with normal
 	terrestrial connections.
 
+config TCP_CONG_VEGAS
+	tristate "TCP Vegas"
+	depends on INET && EXPERIMENTAL
+	default n
+	---help---
+	TCP Vegas is a sender-side only change to TCP that anticipates
+	the onset of congestion by estimating the bandwidth. TCP Vegas
+	adjusts the sending rate by modifying the congestion
+	window. TCP Vegas should provide less packet loss, but it is
+	not as aggressive as TCP Reno.
+
 endmenu
 
 source "net/ipv4/ipvs/Kconfig"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ede7a5d617ff..a801a97bd011 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
 obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
 obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
+obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 	 xfrm4_output.o
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
new file mode 100644
index 000000000000..9bd443db5193
--- /dev/null
+++ b/net/ipv4/tcp_vegas.c
@@ -0,0 +1,411 @@
+/*
+ * TCP Vegas congestion control
+ *
+ * This is based on the congestion detection/avoidance scheme described in
+ *    Lawrence S. Brakmo and Larry L. Peterson.
+ *    "TCP Vegas: End to end congestion avoidance on a global internet."
+ *    IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
+ *    October 1995. Available from:
+ *	ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
+ *
+ * See http://www.cs.arizona.edu/xkernel/ for their implementation.
+ * The main aspects that distinguish this implementation from the
+ * Arizona Vegas implementation are:
+ *   o We do not change the loss detection or recovery mechanisms of
+ *     Linux in any way. Linux already recovers from losses quite well,
+ *     using fine-grained timers, NewReno, and FACK.
+ *   o To avoid the performance penalty imposed by increasing cwnd
+ *     only every-other RTT during slow start, we increase during
+ *     every RTT during slow start, just like Reno.
+ *   o Largely to allow continuous cwnd growth during slow start,
+ *     we use the rate at which ACKs come back as the "actual"
+ *     rate, rather than the rate at which data is sent.
+ *   o To speed convergence to the right rate, we set the cwnd
+ *     to achieve the right ("actual") rate when we exit slow start.
+ *   o To filter out the noise caused by delayed ACKs, we use the
+ *     minimum RTT sample observed during the last RTT to calculate
+ *     the actual rate.
+ *   o When the sender re-starts from idle, it waits until it has
+ *     received ACKs for an entire flight of new data before making
+ *     a cwnd adjustment decision. The original Vegas implementation
+ *     assumed senders never went idle.
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/tcp_diag.h>
+
+#include <net/tcp.h>
+
+/* Default values of the Vegas variables, in fixed-point representation
+ * with V_PARAM_SHIFT bits to the right of the binary point.
+ */
+#define V_PARAM_SHIFT 1
+static int alpha = 1<<V_PARAM_SHIFT;
+static int beta  = 3<<V_PARAM_SHIFT;
+static int gamma = 1<<V_PARAM_SHIFT;
+
+module_param(alpha, int, 0644);
+MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)");
+module_param(gamma, int, 0644);
+MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
+
+
+/* Vegas variables */
+struct vegas {
+	u32	beg_snd_nxt;	/* right edge during last RTT */
+	u32	beg_snd_una;	/* left edge  during last RTT */
+	u32	beg_snd_cwnd;	/* saves the size of the cwnd */
+	u8	doing_vegas_now;/* if true, do vegas for this RTT */
+	u16	cntRTT;		/* # of RTTs measured within last RTT */
+	u32	minRTT;		/* min of RTTs measured within last RTT (in usec) */
+	u32	baseRTT;	/* the min of all Vegas RTT measurements seen (in usec) */
+};
+
+/* There are several situations when we must "re-start" Vegas:
+ *
+ *  o when a connection is established
+ *  o after an RTO
+ *  o after fast recovery
+ *  o when we send a packet and there is no outstanding
+ *    unacknowledged data (restarting an idle connection)
+ *
+ * In these circumstances we cannot do a Vegas calculation at the
+ * end of the first RTT, because any calculation we do is using
+ * stale info -- both the saved cwnd and congestion feedback are
+ * stale.
+ *
+ * Instead we must wait until the completion of an RTT during
+ * which we actually receive ACKs.
+ */
+static inline void vegas_enable(struct tcp_sock *tp)
+{
+	struct vegas *vegas = tcp_ca(tp);
+
+	/* Begin taking Vegas samples next time we send something. */
+	vegas->doing_vegas_now = 1;
+
+	/* Set the beginning of the next send window. */
+	vegas->beg_snd_nxt = tp->snd_nxt;
+
+	vegas->cntRTT = 0;
+	vegas->minRTT = 0x7fffffff;
+}
+
+/* Stop taking Vegas samples for now. */
+static inline void vegas_disable(struct tcp_sock *tp)
+{
+	struct vegas *vegas = tcp_ca(tp);
+
+	vegas->doing_vegas_now = 0;
+}
+
+static void tcp_vegas_init(struct tcp_sock *tp)
+{
+	struct vegas *vegas = tcp_ca(tp);
+
+	vegas->baseRTT = 0x7fffffff;
+	vegas_enable(tp);
+}
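With V_PARAM_SHIFT = 1, the defaults above mean alpha = 1, beta = 3 and gamma = 1 packets of "extra" data allowed in flight. A hedged userspace sketch of the once-per-RTT arithmetic used further below (the window and RTT figures are invented):

#include <stdio.h>

#define V_PARAM_SHIFT 1

int main(void)
{
	/* hypothetical: 20 segments sent last RTT, 100 ms baseRTT,
	 * 110 ms minimum RTT observed during that window (in usec) */
	unsigned int old_wnd = 20, baseRTT = 100000, rtt = 110000;

	/* window the base RTT would sustain, <<V_PARAM_SHIFT fixed point */
	unsigned int target_cwnd = (old_wnd * baseRTT << V_PARAM_SHIFT) / rtt;
	unsigned int diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;

	printf("target_cwnd=%u diff=%u (~%u pkts queued)\n",
	       target_cwnd, diff, diff >> V_PARAM_SHIFT);
	return 0;	/* 36, 4, 2: between alpha=2 and beta=6, so cwnd is held */
}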
+
+/* Do RTT sampling needed for Vegas.
+ * Basically we:
+ *   o min-filter RTT samples from within an RTT to get the current
+ *     propagation delay + queuing delay (we are min-filtering to try to
+ *     avoid the effects of delayed ACKs)
+ *   o min-filter RTT samples from a much longer window (forever for now)
+ *     to find the propagation delay (baseRTT)
+ */
+static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
+{
+	struct vegas *vegas = tcp_ca(tp);
+	u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
+
+	/* Filter to find propagation delay: */
+	if (vrtt < vegas->baseRTT)
+		vegas->baseRTT = vrtt;
+
+	/* Find the min RTT during the last RTT to find
+	 * the current prop. delay + queuing delay:
+	 */
+	vegas->minRTT = min(vegas->minRTT, vrtt);
+	vegas->cntRTT++;
+}
+
+static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
+{
+
+	if (ca_state == TCP_CA_Open)
+		vegas_enable(tp);
+	else
+		vegas_disable(tp);
+}
+
+/*
+ * If the connection is idle and we are restarting,
+ * then we don't want to do any Vegas calculations
+ * until we get fresh RTT samples. So when we
+ * restart, we reset our Vegas state to a clean
+ * slate. After we get acks for this flight of
+ * packets, _then_ we can make Vegas calculations
+ * again.
+ */
+static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event)
+{
+	if (event == CA_EVENT_CWND_RESTART ||
+	    event == CA_EVENT_TX_START)
+		tcp_vegas_init(tp);
+}
+
+static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
+				 u32 seq_rtt, u32 in_flight, int flag)
+{
+	struct vegas *vegas = tcp_ca(tp);
+
+	if (!vegas->doing_vegas_now)
+		return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag);
+
+	/* The key players are v_beg_snd_una and v_beg_snd_nxt.
+	 *
+	 * These are so named because they represent the approximate values
+	 * of snd_una and snd_nxt at the beginning of the current RTT. More
+	 * precisely, they represent the amount of data sent during the RTT.
+	 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
+	 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
+	 * bytes of data have been ACKed during the course of the RTT, giving
+	 * an "actual" rate of:
+	 *
+	 *     (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
+	 *
+	 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
+	 * because delayed ACKs can cover more than one segment, so they
+	 * don't line up nicely with the boundaries of RTTs.
+	 *
+	 * Another unfortunate fact of life is that delayed ACKs delay the
+	 * advance of the left edge of our send window, so that the number
+	 * of bytes we send in an RTT is often less than our cwnd will allow.
+	 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
+	 */
+
+	if (after(ack, vegas->beg_snd_nxt)) {
+		/* Do the Vegas once-per-RTT cwnd adjustment. */
+		u32 old_wnd, old_snd_cwnd;
+
+
+		/* Here old_wnd is essentially the window of data that was
+		 * sent during the previous RTT, and has all
+		 * been acknowledged in the course of the RTT that ended
+		 * with the ACK we just received. Likewise, old_snd_cwnd
+		 * is the cwnd during the previous RTT.
+		 */
+		old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) /
+			tp->mss_cache;
+		old_snd_cwnd = vegas->beg_snd_cwnd;
+
+		/* Save the extent of the current window so we can use this
+		 * at the end of the next RTT.
+		 */
+		vegas->beg_snd_una  = vegas->beg_snd_nxt;
+		vegas->beg_snd_nxt  = tp->snd_nxt;
+		vegas->beg_snd_cwnd = tp->snd_cwnd;
+
+		/* Take into account the current RTT sample too, to
+		 * decrease the impact of delayed acks.
+		 * This double counts this sample, since we count it for the
+		 * next window as well; but that's not too awful, since we're
+		 * taking the min rather than averaging.
+		 */
+		tcp_vegas_rtt_calc(tp, seq_rtt*1000);
+
+		/* We do the Vegas calculations only if we got enough RTT
+		 * samples that we can be reasonably sure that we got
+		 * at least one RTT sample that wasn't from a delayed ACK.
+		 * If we only had 2 samples total,
+		 * then that means we're getting only 1 ACK per RTT, which
+		 * means they're almost certainly delayed ACKs.
+		 * If we have 3 samples, we should be OK.
+		 */
+
+		if (vegas->cntRTT <= 2) {
+			/* We don't have enough RTT samples to do the Vegas
+			 * calculation, so we'll behave like Reno.
+			 */
+			if (tp->snd_cwnd > tp->snd_ssthresh)
+				tp->snd_cwnd++;
+		} else {
+			u32 rtt, target_cwnd, diff;
+
+			/* We have enough RTT samples, so, using the Vegas
+			 * algorithm, we determine if we should increase or
+			 * decrease cwnd, and by how much.
+			 */
+
+			/* Pluck out the RTT we are using for the Vegas
+			 * calculations. This is the min RTT seen during the
+			 * last RTT. Taking the min filters out the effects
+			 * of delayed ACKs, at the cost of noticing congestion
+			 * a bit later.
+			 */
+			rtt = vegas->minRTT;
+
+			/* Calculate the cwnd we should have, if we weren't
+			 * going too fast.
+			 *
+			 * This is:
+			 *     (actual rate in segments) * baseRTT
+			 * We keep it as a fixed point number with
+			 * V_PARAM_SHIFT bits to the right of the binary point.
+			 */
+			target_cwnd = ((old_wnd * vegas->baseRTT)
+				       << V_PARAM_SHIFT) / rtt;
+
+			/* Calculate the difference between the window we had,
+			 * and the window we would like to have. This quantity
+			 * is the "Diff" from the Arizona Vegas papers.
+			 *
+			 * Again, this is a fixed point number with
+			 * V_PARAM_SHIFT bits to the right of the binary
+			 * point.
+			 */
+			diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
+
+			if (tp->snd_cwnd < tp->snd_ssthresh) {
+				/* Slow start. */
+				if (diff > gamma) {
+					/* Going too fast. Time to slow down
+					 * and switch to congestion avoidance.
+					 */
+					tp->snd_ssthresh = 2;
+
+					/* Set cwnd to match the actual rate
+					 * exactly:
+					 *   cwnd = (actual rate) * baseRTT
+					 * Then we add 1 because the integer
+					 * truncation robs us of full link
+					 * utilization.
+					 */
+					tp->snd_cwnd = min(tp->snd_cwnd,
+							   (target_cwnd >>
+							    V_PARAM_SHIFT)+1);
+
+				}
+			} else {
+				/* Congestion avoidance. */
+				u32 next_snd_cwnd;
+
+				/* Figure out where we would like cwnd
+				 * to be.
+				 */
+				if (diff > beta) {
+					/* The old window was too fast, so
+					 * we slow down.
+					 */
+					next_snd_cwnd = old_snd_cwnd - 1;
+				} else if (diff < alpha) {
+					/* We don't have enough extra packets
+					 * in the network, so speed up.
+					 */
+					next_snd_cwnd = old_snd_cwnd + 1;
+				} else {
+					/* Sending just as fast as we
+					 * should be.
+					 */
+					next_snd_cwnd = old_snd_cwnd;
+				}
+
+				/* Adjust cwnd upward or downward, toward the
+				 * desired value.
+				 */
+				if (next_snd_cwnd > tp->snd_cwnd)
+					tp->snd_cwnd++;
+				else if (next_snd_cwnd < tp->snd_cwnd)
+					tp->snd_cwnd--;
+			}
+		}
+
+		/* Wipe the slate clean for the next RTT. */
+		vegas->cntRTT = 0;
+		vegas->minRTT = 0x7fffffff;
+	}
+
+	/* The following code is executed for every ack we receive,
+	 * except for conditions checked in should_advance_cwnd()
+	 * before the call to tcp_cong_avoid(). Mainly this means that
+	 * we only execute this code if the ack actually acked some
+	 * data.
+	 */
+
+	/* If we are in slow start, increase our cwnd in response to this ACK.
+	 * (If we are not in slow start then we are in congestion avoidance,
+	 * and adjust our congestion window only once per RTT. See the code
+	 * above.)
+	 */
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tp->snd_cwnd++;
+
+	/* to keep cwnd from growing without bound */
+	tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+
+	/* Make sure that we are never so timid as to reduce our cwnd below
+	 * 2 MSS.
+	 *
+	 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
+	 */
+	tp->snd_cwnd = max(tp->snd_cwnd, 2U);
+}
+
+/* Extract info for Tcp socket info provided via netlink. */
+static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext,
+			       struct sk_buff *skb)
+{
+	const struct vegas *ca = tcp_ca(tp);
+	if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
+		struct tcpvegas_info *info;
+
+		info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO,
+					  sizeof(*info)));
+
+		info->tcpv_enabled = ca->doing_vegas_now;
+		info->tcpv_rttcnt = ca->cntRTT;
+		info->tcpv_rtt = ca->baseRTT;
+		info->tcpv_minrtt = ca->minRTT;
+	rtattr_failure: ;
+	}
+}
+
+static struct tcp_congestion_ops tcp_vegas = {
+	.init		= tcp_vegas_init,
+	.ssthresh	= tcp_reno_ssthresh,
+	.cong_avoid	= tcp_vegas_cong_avoid,
+	.min_cwnd	= tcp_reno_min_cwnd,
+	.rtt_sample	= tcp_vegas_rtt_calc,
+	.set_state	= tcp_vegas_state,
+	.cwnd_event	= tcp_vegas_cwnd_event,
+	.get_info	= tcp_vegas_get_info,
+
+	.owner		= THIS_MODULE,
+	.name		= "vegas",
+};
+
+static int __init tcp_vegas_register(void)
+{
+	BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE);
+	tcp_register_congestion_control(&tcp_vegas);
+	return 0;
+}
+
+static void __exit tcp_vegas_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_vegas);
+}
+
+module_init(tcp_vegas_register);
+module_exit(tcp_vegas_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Vegas");

From a7868ea68d29eb2c037952aeb3b549cf05749a18 Mon Sep 17 00:00:00 2001
From: Baruch Even
Date: Thu, 23 Jun 2005 12:28:11 -0700
Subject: [PATCH 191/199] [TCP]: Add H-TCP congestion control module.

H-TCP is a congestion control algorithm developed at the Hamilton
Institute, by Douglas Leith and Robert Shorten. It extends the
standard Reno algorithm with mode switching and is thus a relatively
simple modification.

H-TCP is defined in a layered manner as it is still a research
platform. The basic form includes the modification of beta according
to the ratio of maxRTT to min RTT and the alpha=2*factor*(1-beta)
relation, where factor is dependent on the time since last congestion.

The other layers improve convergence by adding appropriate factors to
alpha.

The following patch implements the H-TCP algorithm in its basic form.

Signed-Off-By: Baruch Even
Signed-off-by: David S. Miller
---
 net/ipv4/Kconfig    |  13 ++
 net/ipv4/Makefile   |   1 +
 net/ipv4/tcp_htcp.c | 289 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 303 insertions(+)
 create mode 100644 net/ipv4/tcp_htcp.c

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6c105b60cc00..73a25b52bf7d 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -467,6 +467,18 @@ config TCP_CONG_WESTWOOD
 	TCP Westwood+ significantly increases fairness wrt TCP Reno in
 	wired networks and throughput over wireless links.
 
+config TCP_CONG_HTCP
+	tristate "H-TCP"
+	depends on INET
+	default m
+	---help---
+	H-TCP is a sender-side only modification of the TCP Reno
+	protocol stack that optimizes the performance of TCP
+	congestion control for high speed network links. It uses a
+	modeswitch to change the alpha and beta parameters of TCP Reno
+	based on network conditions and in a way so as to be fair with
+	other Reno and H-TCP flows.
+
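A hedged illustration of that modeswitch (userspace sketch, not part of the patch; the RTT figures are invented): in the htcp_beta_update() logic in the file below, beta tracks minRTT/maxRTT in <<7 fixed point, clamped to [0.5, 0.8], so a path whose RTT is mostly propagation delay backs off gently while a queue-dominated RTT falls back to a Reno-like 0.5.

#include <stdio.h>

#define BETA_MIN (1<<6)	/* 0.5 with shift << 7 */
#define BETA_MAX 102	/* 0.8 with shift << 7 */

static unsigned int htcp_beta(unsigned int minRTT, unsigned int maxRTT)
{
	unsigned int b = (minRTT << 7) / maxRTT;

	if (b < BETA_MIN)
		b = BETA_MIN;
	else if (b > BETA_MAX)
		b = BETA_MAX;
	return b;
}

int main(void)
{
	printf("beta=%u/128\n", htcp_beta(60, 100));	/* 76: back off to ~0.6*cwnd */
	printf("beta=%u/128\n", htcp_beta(10, 100));	/* 12 -> clamped to 64 (0.5) */
	return 0;
}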
 config TCP_CONG_HSTCP
 	tristate "High Speed TCP"
 	depends on INET && EXPERIMENTAL
@@ -499,6 +511,7 @@ config TCP_CONG_VEGAS
 	window. TCP Vegas should provide less packet loss, but it is
 	not as aggressive as TCP Reno.
 
+
 endmenu
 
 source "net/ipv4/ipvs/Kconfig"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index a801a97bd011..e96ed17deb96 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
 obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
 obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
+obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
 obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
new file mode 100644
index 000000000000..40168275acf9
--- /dev/null
+++ b/net/ipv4/tcp_htcp.c
@@ -0,0 +1,289 @@
+/*
+ * H-TCP congestion control. The algorithm is detailed in:
+ * R.N.Shorten, D.J.Leith:
+ *   "H-TCP: TCP for high-speed and long-distance networks"
+ *   Proc. PFLDnet, Argonne, 2004.
+ * http://www.hamilton.ie/net/htcp3.pdf
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+#define ALPHA_BASE	(1<<7)	/* 1.0 with shift << 7 */
+#define BETA_MIN	(1<<6)	/* 0.5 with shift << 7 */
+#define BETA_MAX	102	/* 0.8 with shift << 7 */
+
+static int use_rtt_scaling = 1;
+module_param(use_rtt_scaling, int, 0644);
+MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling");
+
+static int use_bandwidth_switch = 1;
+module_param(use_bandwidth_switch, int, 0644);
+MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher");
+
+struct htcp {
+	u16	alpha;		/* Fixed point arith, << 7 */
+	u8	beta;		/* Fixed point arith, << 7 */
+	u8	modeswitch;	/* Delay modeswitch until we had at least one congestion event */
+	u8	ccount;		/* Number of RTTs since last congestion event */
+	u8	undo_ccount;
+	u16	packetcount;
+	u32	minRTT;
+	u32	maxRTT;
+	u32	snd_cwnd_cnt2;
+
+	u32	undo_maxRTT;
+	u32	undo_old_maxB;
+
+	/* Bandwidth estimation */
+	u32	minB;
+	u32	maxB;
+	u32	old_maxB;
+	u32	Bi;
+	u32	lasttime;
+};
+
+static inline void htcp_reset(struct htcp *ca)
+{
+	ca->undo_ccount = ca->ccount;
+	ca->undo_maxRTT = ca->maxRTT;
+	ca->undo_old_maxB = ca->old_maxB;
+
+	ca->ccount = 0;
+	ca->snd_cwnd_cnt2 = 0;
+}
+
+static u32 htcp_cwnd_undo(struct tcp_sock *tp)
+{
+	struct htcp *ca = tcp_ca(tp);
+	ca->ccount = ca->undo_ccount;
+	ca->maxRTT = ca->undo_maxRTT;
+	ca->old_maxB = ca->undo_old_maxB;
+	return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta);
+}
+
+static inline void measure_rtt(struct tcp_sock *tp)
+{
+	struct htcp *ca = tcp_ca(tp);
+	u32 srtt = tp->srtt>>3;
+
+	/* keep track of minimum RTT seen so far, minRTT is zero at first */
+	if (ca->minRTT > srtt || !ca->minRTT)
+		ca->minRTT = srtt;
+
+	/* max RTT */
+	if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
+		if (ca->maxRTT < ca->minRTT)
+			ca->maxRTT = ca->minRTT;
+		if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50)
+			ca->maxRTT = srtt;
+	}
+}
+
+static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked)
+{
+	struct htcp *ca = tcp_ca(tp);
+	u32 now = tcp_time_stamp;
+
+	/* achieved throughput calculations */
+	if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) {
+		ca->packetcount = 0;
+		ca->lasttime = now;
+		return;
+	}
+
+	ca->packetcount += pkts_acked;
+
+	if (ca->packetcount >= tp->snd_cwnd - (ca->alpha>>7 ? : 1)
+	    && now - ca->lasttime >= ca->minRTT
+	    && ca->minRTT > 0) {
+		__u32 cur_Bi = ca->packetcount*HZ/(now - ca->lasttime);
+		if (ca->ccount <= 3) {
+			/* just after backoff */
+			ca->minB = ca->maxB = ca->Bi = cur_Bi;
+		} else {
+			ca->Bi = (3*ca->Bi + cur_Bi)/4;
+			if (ca->Bi > ca->maxB)
+				ca->maxB = ca->Bi;
+			if (ca->minB > ca->maxB)
+				ca->minB = ca->maxB;
+		}
+		ca->packetcount = 0;
+		ca->lasttime = now;
+	}
+}
+
+static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT)
+{
+	if (use_bandwidth_switch) {
+		u32 maxB = ca->maxB;
+		u32 old_maxB = ca->old_maxB;
+		ca->old_maxB = ca->maxB;
+
+		if (!between(5*maxB, 4*old_maxB, 6*old_maxB)) {
+			ca->beta = BETA_MIN;
+			ca->modeswitch = 0;
+			return;
+		}
+	}
+
+	if (ca->modeswitch && minRTT > max(HZ/100, 1) && maxRTT) {
+		ca->beta = (minRTT<<7)/maxRTT;
+		if (ca->beta < BETA_MIN)
+			ca->beta = BETA_MIN;
+		else if (ca->beta > BETA_MAX)
+			ca->beta = BETA_MAX;
+	} else {
+		ca->beta = BETA_MIN;
+		ca->modeswitch = 1;
+	}
+}
+
+static inline void htcp_alpha_update(struct htcp *ca)
+{
+	u32 minRTT = ca->minRTT;
+	u32 factor = 1;
+	u32 diff = ca->ccount * minRTT; /* time since last backoff */
+
+	if (diff > HZ) {
+		diff -= HZ;
+		factor = 1 + (10*diff + ((diff/2)*(diff/2)/HZ))/HZ;
+	}
+
+	if (use_rtt_scaling && minRTT) {
+		u32 scale = (HZ<<3)/(10*minRTT);
+		scale = min(max(scale, 1U<<2), 10U<<3); /* clamping ratio to interval [0.5,10]<<3 */
+		factor = (factor<<3)/scale;
+		if (!factor)
+			factor = 1;
+	}
+
+	ca->alpha = 2*factor*((1<<7)-ca->beta);
+	if (!ca->alpha)
+		ca->alpha = ALPHA_BASE;
+}
+
+/* After we have the rtt data to calculate beta, we'd still prefer to wait one
+ * rtt before we adjust our beta to ensure we are working from consistent
+ * data.
+ *
+ * This function should be called when we hit a congestion event since only at
+ * that point do we really have a real sense of maxRTT (the queues en route
+ * were getting just too full now).
+ */
+static void htcp_param_update(struct tcp_sock *tp)
+{
+	struct htcp *ca = tcp_ca(tp);
+	u32 minRTT = ca->minRTT;
+	u32 maxRTT = ca->maxRTT;
+
+	htcp_beta_update(ca, minRTT, maxRTT);
+	htcp_alpha_update(ca);
+
+	/* add slowly fading memory for maxRTT to accommodate routing changes etc */
+	if (minRTT > 0 && maxRTT > minRTT)
+		ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100;
+}
+
+static u32 htcp_recalc_ssthresh(struct tcp_sock *tp)
+{
+	struct htcp *ca = tcp_ca(tp);
+	htcp_param_update(tp);
+	return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
+}
+
+static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+			    u32 in_flight, int data_acked)
+{
+	struct htcp *ca = tcp_ca(tp);
+
+	if (in_flight < tp->snd_cwnd)
+		return;
+
+	if (tp->snd_cwnd <= tp->snd_ssthresh) {
+		/* In "safe" area, increase. */
+		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+			tp->snd_cwnd++;
+	} else {
+		measure_rtt(tp);
+
+		/* keep track of number of round-trip times since last backoff event */
+		if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) {
+			ca->ccount++;
+			ca->snd_cwnd_cnt2 = 0;
+			htcp_alpha_update(ca);
+		}
+
+		/* In dangerous area, increase slowly.
+		 * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
+		 */
+		if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) {
+			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+				tp->snd_cwnd++;
+			tp->snd_cwnd_cnt = 0;
+			ca->ccount++;
+		}
+	}
+}
+
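To see how aggressiveness ramps with time since the last backoff (a hedged userspace sketch, not part of the patch; HZ is assumed to be 100 and the optional RTT scaling is ignored): with diff = ccount*minRTT jiffies elapsed, the quadratic factor below reproduces htcp_alpha_update() above, so alpha stays Reno-like for the first second and then grows roughly as 1 + 10t + t^2/4.

#include <stdio.h>

#define HZ 100	/* assumed tick rate for this sketch */

static unsigned int alpha(unsigned int diff, unsigned int beta)
{
	unsigned int factor = 1;

	if (diff > HZ) {	/* more than a second since the last backoff */
		diff -= HZ;
		factor = 1 + (10*diff + ((diff/2)*(diff/2)/HZ))/HZ;
	}
	return 2*factor*((1<<7) - beta);
}

int main(void)
{
	/* beta = BETA_MIN = 64, i.e. 0.5 in <<7 fixed point */
	printf("t=0.5s alpha=%u\n", alpha(50, 64));	/* 128: plain Reno */
	printf("t=2s   alpha=%u\n", alpha(200, 64));	/* 1408 */
	printf("t=11s  alpha=%u\n", alpha(1100, 64));	/* 16128 */
	return 0;
}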
+/* Lower bound on congestion window. */
+static u32 htcp_min_cwnd(struct tcp_sock *tp)
+{
+	return tp->snd_ssthresh;
+}
+
+
+static void htcp_init(struct tcp_sock *tp)
+{
+	struct htcp *ca = tcp_ca(tp);
+
+	memset(ca, 0, sizeof(struct htcp));
+	ca->alpha = ALPHA_BASE;
+	ca->beta = BETA_MIN;
+}
+
+static void htcp_state(struct tcp_sock *tp, u8 new_state)
+{
+	switch (new_state) {
+	case TCP_CA_CWR:
+	case TCP_CA_Recovery:
+	case TCP_CA_Loss:
+		htcp_reset(tcp_ca(tp));
+		break;
+	}
+}
+
+static struct tcp_congestion_ops htcp = {
+	.init		= htcp_init,
+	.ssthresh	= htcp_recalc_ssthresh,
+	.min_cwnd	= htcp_min_cwnd,
+	.cong_avoid	= htcp_cong_avoid,
+	.set_state	= htcp_state,
+	.undo_cwnd	= htcp_cwnd_undo,
+	.pkts_acked	= measure_achieved_throughput,
+	.owner		= THIS_MODULE,
+	.name		= "htcp",
+};
+
+static int __init htcp_register(void)
+{
+	BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE);
+	BUILD_BUG_ON(BETA_MIN >= BETA_MAX);
+	if (!use_bandwidth_switch)
+		htcp.pkts_acked = NULL;
+	return tcp_register_congestion_control(&htcp);
+}
+
+static void __exit htcp_unregister(void)
+{
+	tcp_unregister_congestion_control(&htcp);
+}
+
+module_init(htcp_register);
+module_exit(htcp_unregister);
+
+MODULE_AUTHOR("Baruch Even");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("H-TCP");

From 0e57976b6376f7fda6bef8b7dee2a3c8819ec9e9 Mon Sep 17 00:00:00 2001
From: John Heffner
Date: Thu, 23 Jun 2005 12:29:07 -0700
Subject: [PATCH 192/199] [TCP]: Add Scalable TCP congestion control module.

This patch implements Tom Kelly's Scalable TCP congestion control
algorithm for the modular framework.

The algorithm has some nice scaling properties, and has been used a
fair bit in research, though is known to have significant fairness
issues, so it's not really suitable for general purpose use.

Signed-off-by: John Heffner
Signed-off-by: David S. Miller
---
 net/ipv4/Kconfig        |  9 ++++++
 net/ipv4/Makefile       |  1 +
 net/ipv4/tcp_scalable.c | 68 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+)
 create mode 100644 net/ipv4/tcp_scalable.c

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 73a25b52bf7d..690e88ba2484 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -511,6 +511,15 @@ config TCP_CONG_VEGAS
 	window. TCP Vegas should provide less packet loss, but it is
 	not as aggressive as TCP Reno.
 
+config TCP_CONG_SCALABLE
+	tristate "Scalable TCP"
+	depends on INET && EXPERIMENTAL
+	default n
+	---help---
+	Scalable TCP is a sender-side only change to TCP which uses a
+	MIMD congestion control algorithm which has some nice scaling
+	properties, though is known to have fairness issues.
+	See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
 
 endmenu
 
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index e96ed17deb96..5718cdb3a61e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
 obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
 obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
 obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 	 xfrm4_output.o
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
new file mode 100644
index 000000000000..70e108e15c71
--- /dev/null
+++ b/net/ipv4/tcp_scalable.c
@@ -0,0 +1,68 @@
+/* Tom Kelly's Scalable TCP
+ *
+ * See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
+ *
+ * John Heffner
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* These factors are derived from the recommended values in the paper:
+ * 0.01 and 7/8. We use 50 instead of 100 to account for
+ * delayed ack.
+ */
+#define TCP_SCALABLE_AI_CNT	50U
+#define TCP_SCALABLE_MD_SCALE	3
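A hedged sketch of the growth rule these constants produce (userspace demo, not part of the patch; the starting cwnd is arbitrary): past 50 segments the window gains one segment per 50 ACKs, i.e. roughly 2% per RTT independent of window size, while the ssthresh rule keeps 7/8 of cwnd after a loss.

#include <stdio.h>

#define AI_CNT   50u	/* TCP_SCALABLE_AI_CNT */
#define MD_SCALE 3	/* TCP_SCALABLE_MD_SCALE */

int main(void)
{
	unsigned int cwnd = 1000, cnt = 0, acks, rtt;

	for (rtt = 0; rtt < 10; rtt++)
		for (acks = cwnd; acks--; )	/* one window's worth of ACKs */
			if (++cnt > (cwnd < AI_CNT ? cwnd : AI_CNT)) {
				cwnd++;		/* MIMD: ~cwnd/50 gained per RTT */
				cnt = 0;
			}

	printf("after 10 RTTs: cwnd=%u\n", cwnd);	/* ~2% compounded growth */
	printf("after a loss: ssthresh=%u\n", cwnd - (cwnd >> MD_SCALE));
	return 0;
}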
+
+static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+				    u32 in_flight, int flag)
+{
+	if (in_flight < tp->snd_cwnd)
+		return;
+
+	if (tp->snd_cwnd <= tp->snd_ssthresh) {
+		tp->snd_cwnd++;
+	} else {
+		tp->snd_cwnd_cnt++;
+		if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)) {
+			tp->snd_cwnd++;
+			tp->snd_cwnd_cnt = 0;
+		}
+	}
+	tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static u32 tcp_scalable_ssthresh(struct tcp_sock *tp)
+{
+	return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
+}
+
+
+static struct tcp_congestion_ops tcp_scalable = {
+	.ssthresh	= tcp_scalable_ssthresh,
+	.cong_avoid	= tcp_scalable_cong_avoid,
+	.min_cwnd	= tcp_reno_min_cwnd,
+
+	.owner		= THIS_MODULE,
+	.name		= "scalable",
+};
+
+static int __init tcp_scalable_register(void)
+{
+	return tcp_register_congestion_control(&tcp_scalable);
+}
+
+static void __exit tcp_scalable_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_scalable);
+}
+
+module_init(tcp_scalable_register);
+module_exit(tcp_scalable_unregister);
+
+MODULE_AUTHOR("John Heffner");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Scalable TCP");

From bf1b8ab6f21e1adbab1abd1b4e71c35fe65dc5fe Mon Sep 17 00:00:00 2001
From: Nicolas Pitre
Date: Thu, 23 Jun 2005 21:56:45 +0100
Subject: [PATCH 193/199] [PATCH] ARM: 2721/1: remove reliance on udivdi3
 for pxafb driver

Patch from Nicolas Pitre

Signed-off-by: Nicolas Pitre
Signed-off-by: Russell King
---
 drivers/video/pxafb.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/video/pxafb.c b/drivers/video/pxafb.c
index 815fbc8317fc..16e37a535d85 100644
--- a/drivers/video/pxafb.c
+++ b/drivers/video/pxafb.c
@@ -43,6 +43,7 @@
 #include
 #include
 #include
+#include <asm/div64.h>
 #include
 #include
 #include
@@ -460,7 +461,7 @@ static inline unsigned int get_pcd(unsigned int pixclock)
 	 * speeds */
 
 	pcd = (unsigned long long)get_lcdclk_frequency_10khz() * pixclock;
-	pcd /= 100000000 * 2;
+	do_div(pcd, 100000000 * 2);
 	/* no need for this, since we should subtract 1 anyway. they cancel */
 	/* pcd += 1; */ /* make up for integer math truncations */
 	return (unsigned int)pcd;

From c1241c4c3a1507d76c7b987130f2f02f53ecc09f Mon Sep 17 00:00:00 2001
From: Nicolas Pitre
Date: Thu, 23 Jun 2005 21:56:46 +0100
Subject: [PATCH 194/199] [PATCH] ARM: 2722/1: remove reliance on udivdi3
 for nwfpe

Patch from Nicolas Pitre

Signed-off-by: Nicolas Pitre
Signed-off-by: Russell King
---
 arch/arm/nwfpe/softfloat-macros | 22 ++++++++++++++++++----
 arch/arm/nwfpe/softfloat.c      | 12 ++++++++++--
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/arch/arm/nwfpe/softfloat-macros b/arch/arm/nwfpe/softfloat-macros
index 5469989f2c5e..5a060f95a58f 100644
--- a/arch/arm/nwfpe/softfloat-macros
+++ b/arch/arm/nwfpe/softfloat-macros
@@ -563,8 +563,14 @@ static bits64 estimateDiv128To64( bits64 a0, bits64 a1, bits64 b )
     bits64 rem0, rem1, term0, term1;
     bits64 z;
     if ( b <= a0 ) return LIT64( 0xFFFFFFFFFFFFFFFF );
-    b0 = b>>32;
-    z = ( b0<<32 <= a0 ) ?
LIT64( 0xFFFFFFFF00000000 ) : ( a0 / b0 )<<32; + b0 = b>>32; /* hence b0 is 32 bits wide now */ + if ( b0<<32 <= a0 ) { + z = LIT64( 0xFFFFFFFF00000000 ); + } else { + z = a0; + do_div( z, b0 ); + z <<= 32; + } mul64To128( b, z, &term0, &term1 ); sub128( a0, a1, term0, term1, &rem0, &rem1 ); while ( ( (sbits64) rem0 ) < 0 ) { @@ -573,7 +579,12 @@ static bits64 estimateDiv128To64( bits64 a0, bits64 a1, bits64 b ) add128( rem0, rem1, b0, b1, &rem0, &rem1 ); } rem0 = ( rem0<<32 ) | ( rem1>>32 ); - z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0; + if ( b0<<32 <= rem0 ) { + z |= 0xFFFFFFFF; + } else { + do_div( rem0, b0 ); + z |= rem0; + } return z; } @@ -601,6 +612,7 @@ static bits32 estimateSqrt32( int16 aExp, bits32 a ) }; int8 index; bits32 z; + bits64 A; index = ( a>>27 ) & 15; if ( aExp & 1 ) { @@ -614,7 +626,9 @@ static bits32 estimateSqrt32( int16 aExp, bits32 a ) z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 ); if ( z <= a ) return (bits32) ( ( (sbits32) a )>>1 ); } - return ( (bits32) ( ( ( (bits64) a )<<31 ) / z ) ) + ( z>>1 ); + A = ( (bits64) a )<<31; + do_div( A, z ); + return ( (bits32) A ) + ( z>>1 ); } diff --git a/arch/arm/nwfpe/softfloat.c b/arch/arm/nwfpe/softfloat.c index 9d743ae29062..e038dd3be9b3 100644 --- a/arch/arm/nwfpe/softfloat.c +++ b/arch/arm/nwfpe/softfloat.c @@ -28,6 +28,8 @@ this code that are retained. =============================================================================== */ +#include + #include "fpa11.h" //#include "milieu.h" //#include "softfloat.h" @@ -1331,7 +1333,11 @@ float32 float32_div( float32 a, float32 b ) aSig >>= 1; ++zExp; } - zSig = ( ( (bits64) aSig )<<32 ) / bSig; + { + bits64 tmp = ( (bits64) aSig )<<32; + do_div( tmp, bSig ); + zSig = tmp; + } if ( ( zSig & 0x3F ) == 0 ) { zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 ); } @@ -1397,7 +1403,9 @@ float32 float32_rem( float32 a, float32 b ) q = ( bSig <= aSig ); if ( q ) aSig -= bSig; if ( 0 < expDiff ) { - q = ( ( (bits64) aSig )<<32 ) / bSig; + bits64 tmp = ( (bits64) aSig )<<32; + do_div( tmp, bSig ); + q = tmp; q >>= 32 - expDiff; bSig >>= 2; aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; From d9dc58049d3ed5c63c1a6ac82c217558b4ec623a Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 23 Jun 2005 21:56:46 +0100 Subject: [PATCH 195/199] [PATCH] ARM: 2728/1: S3C2410 - fix constant warning on serial device name Patch from Ben Dooks Remove warning of casting `const char *` to a `char *` type. Signed-off-by: Ben Dooks Signed-off-by: Russell King --- drivers/serial/s3c2410.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/serial/s3c2410.c b/drivers/serial/s3c2410.c index 2a9f7ade2c9d..5c4678478b1d 100644 --- a/drivers/serial/s3c2410.c +++ b/drivers/serial/s3c2410.c @@ -198,7 +198,7 @@ static inline struct s3c24xx_uart_port *to_ourport(struct uart_port *port) /* translate a port to the device name */ -static inline char *s3c24xx_serial_portname(struct uart_port *port) +static inline const char *s3c24xx_serial_portname(struct uart_port *port) { return to_platform_device(port->dev)->name; } @@ -903,7 +903,7 @@ static void s3c24xx_serial_release_port(struct uart_port *port) static int s3c24xx_serial_request_port(struct uart_port *port) { - char *name = s3c24xx_serial_portname(port); + const char *name = s3c24xx_serial_portname(port); return request_mem_region(port->mapbase, MAP_SIZE, name) ? 
0 : -EBUSY; } From d97a666f36cf051e1b1c60505be3d6e9b51f785f Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 23 Jun 2005 21:56:47 +0100 Subject: [PATCH 196/199] [PATCH] ARM: 2729/1: DM9000 platform support for S3C2410 machines (BAST, VR1000) Patch from Ben Dooks Add platform_device information for DM9000 chip(s) on the Simtec BAST and the VR1000 board. Signed-off-by: Ben Dooks Signed-off-by: Russell King --- arch/arm/mach-s3c2410/mach-bast.c | 49 ++++++++++++++++-- arch/arm/mach-s3c2410/mach-vr1000.c | 77 +++++++++++++++++++++++++++-- 2 files changed, 117 insertions(+), 9 deletions(-) diff --git a/arch/arm/mach-s3c2410/mach-bast.c b/arch/arm/mach-s3c2410/mach-bast.c index 3bb97eb6e693..f3e970039b65 100644 --- a/arch/arm/mach-s3c2410/mach-bast.c +++ b/arch/arm/mach-s3c2410/mach-bast.c @@ -26,6 +26,7 @@ * 03-Mar-2005 BJD Ensured that bast-cpld.h is included * 10-Mar-2005 LCVR Changed S3C2410_VA to S3C24XX_VA * 14-Mar-2006 BJD Updated for __iomem changes + * 22-Jun-2006 BJD Added DM9000 platform information */ #include @@ -35,6 +36,7 @@ #include #include #include +#include #include #include @@ -53,6 +55,7 @@ #include #include #include +#include #include #include @@ -112,7 +115,6 @@ static struct map_desc bast_iodesc[] __initdata = { { VA_C2(BAST_VA_ISAMEM), PA_CS2(BAST_PA_ISAMEM), SZ_16M, MT_DEVICE }, { VA_C2(BAST_VA_ASIXNET), PA_CS3(BAST_PA_ASIXNET), SZ_1M, MT_DEVICE }, { VA_C2(BAST_VA_SUPERIO), PA_CS2(BAST_PA_SUPERIO), SZ_1M, MT_DEVICE }, - { VA_C2(BAST_VA_DM9000), PA_CS2(BAST_PA_DM9000), SZ_1M, MT_DEVICE }, { VA_C2(BAST_VA_IDEPRI), PA_CS3(BAST_PA_IDEPRI), SZ_1M, MT_DEVICE }, { VA_C2(BAST_VA_IDESEC), PA_CS3(BAST_PA_IDESEC), SZ_1M, MT_DEVICE }, { VA_C2(BAST_VA_IDEPRIAUX), PA_CS3(BAST_PA_IDEPRIAUX), SZ_1M, MT_DEVICE }, @@ -123,7 +125,6 @@ static struct map_desc bast_iodesc[] __initdata = { { VA_C3(BAST_VA_ISAMEM), PA_CS3(BAST_PA_ISAMEM), SZ_16M, MT_DEVICE }, { VA_C3(BAST_VA_ASIXNET), PA_CS3(BAST_PA_ASIXNET), SZ_1M, MT_DEVICE }, { VA_C3(BAST_VA_SUPERIO), PA_CS3(BAST_PA_SUPERIO), SZ_1M, MT_DEVICE }, - { VA_C3(BAST_VA_DM9000), PA_CS3(BAST_PA_DM9000), SZ_1M, MT_DEVICE }, { VA_C3(BAST_VA_IDEPRI), PA_CS3(BAST_PA_IDEPRI), SZ_1M, MT_DEVICE }, { VA_C3(BAST_VA_IDESEC), PA_CS3(BAST_PA_IDESEC), SZ_1M, MT_DEVICE }, { VA_C3(BAST_VA_IDEPRIAUX), PA_CS3(BAST_PA_IDEPRIAUX), SZ_1M, MT_DEVICE }, @@ -134,7 +135,6 @@ static struct map_desc bast_iodesc[] __initdata = { { VA_C4(BAST_VA_ISAMEM), PA_CS4(BAST_PA_ISAMEM), SZ_16M, MT_DEVICE }, { VA_C4(BAST_VA_ASIXNET), PA_CS5(BAST_PA_ASIXNET), SZ_1M, MT_DEVICE }, { VA_C4(BAST_VA_SUPERIO), PA_CS4(BAST_PA_SUPERIO), SZ_1M, MT_DEVICE }, - { VA_C4(BAST_VA_DM9000), PA_CS4(BAST_PA_DM9000), SZ_1M, MT_DEVICE }, { VA_C4(BAST_VA_IDEPRI), PA_CS5(BAST_PA_IDEPRI), SZ_1M, MT_DEVICE }, { VA_C4(BAST_VA_IDESEC), PA_CS5(BAST_PA_IDESEC), SZ_1M, MT_DEVICE }, { VA_C4(BAST_VA_IDEPRIAUX), PA_CS5(BAST_PA_IDEPRIAUX), SZ_1M, MT_DEVICE }, @@ -145,7 +145,6 @@ static struct map_desc bast_iodesc[] __initdata = { { VA_C5(BAST_VA_ISAMEM), PA_CS5(BAST_PA_ISAMEM), SZ_16M, MT_DEVICE }, { VA_C5(BAST_VA_ASIXNET), PA_CS5(BAST_PA_ASIXNET), SZ_1M, MT_DEVICE }, { VA_C5(BAST_VA_SUPERIO), PA_CS5(BAST_PA_SUPERIO), SZ_1M, MT_DEVICE }, - { VA_C5(BAST_VA_DM9000), PA_CS5(BAST_PA_DM9000), SZ_1M, MT_DEVICE }, { VA_C5(BAST_VA_IDEPRI), PA_CS5(BAST_PA_IDEPRI), SZ_1M, MT_DEVICE }, { VA_C5(BAST_VA_IDESEC), PA_CS5(BAST_PA_IDESEC), SZ_1M, MT_DEVICE }, { VA_C5(BAST_VA_IDEPRIAUX), PA_CS5(BAST_PA_IDEPRIAUX), SZ_1M, MT_DEVICE }, @@ -313,6 +312,45 @@ static struct s3c2410_platform_nand bast_nand_info = { .select_chip = 
bast_nand_select, }; +/* DM9000 */ + +static struct resource bast_dm9k_resource[] = { + [0] = { + .start = S3C2410_CS5 + BAST_PA_DM9000, + .end = S3C2410_CS5 + BAST_PA_DM9000 + 3, + .flags = IORESOURCE_MEM + }, + [1] = { + .start = S3C2410_CS5 + BAST_PA_DM9000 + 0x40, + .end = S3C2410_CS5 + BAST_PA_DM9000 + 0x40 + 0x3f, + .flags = IORESOURCE_MEM + }, + [2] = { + .start = IRQ_DM9000, + .end = IRQ_DM9000, + .flags = IORESOURCE_IRQ + } + +}; + +/* for the moment we limit ourselves to 16bit IO until some + * better IO routines can be written and tested +*/ + +struct dm9000_plat_data bast_dm9k_platdata = { + .flags = DM9000_PLATF_16BITONLY +}; + +static struct platform_device bast_device_dm9k = { + .name = "dm9000", + .id = 0, + .num_resources = ARRAY_SIZE(bast_dm9k_resource), + .resource = bast_dm9k_resource, + .dev = { + .platform_data = &bast_dm9k_platdata, + } +}; + /* Standard BAST devices */ @@ -324,7 +362,8 @@ static struct platform_device *bast_devices[] __initdata = { &s3c_device_iis, &s3c_device_rtc, &s3c_device_nand, - &bast_device_nor + &bast_device_nor, + &bast_device_dm9k, }; static struct clk *bast_clocks[] = { diff --git a/arch/arm/mach-s3c2410/mach-vr1000.c b/arch/arm/mach-s3c2410/mach-vr1000.c index 5512146b1ce4..76be074944a0 100644 --- a/arch/arm/mach-s3c2410/mach-vr1000.c +++ b/arch/arm/mach-s3c2410/mach-vr1000.c @@ -27,6 +27,7 @@ * 10-Feb-2005 BJD Added power-off capability * 10-Mar-2005 LCVR Changed S3C2410_VA to S3C24XX_VA * 14-Mar-2006 BJD void __iomem fixes + * 22-Jun-2006 BJD Added DM9000 platform information */ #include @@ -35,6 +36,7 @@ #include #include #include +#include #include #include @@ -98,28 +100,24 @@ static struct map_desc vr1000_iodesc[] __initdata = { * are only 8bit */ /* slow, byte */ - { VA_C2(VR1000_VA_DM9000), PA_CS2(VR1000_PA_DM9000), SZ_1M, MT_DEVICE }, { VA_C2(VR1000_VA_IDEPRI), PA_CS3(VR1000_PA_IDEPRI), SZ_1M, MT_DEVICE }, { VA_C2(VR1000_VA_IDESEC), PA_CS3(VR1000_PA_IDESEC), SZ_1M, MT_DEVICE }, { VA_C2(VR1000_VA_IDEPRIAUX), PA_CS3(VR1000_PA_IDEPRIAUX), SZ_1M, MT_DEVICE }, { VA_C2(VR1000_VA_IDESECAUX), PA_CS3(VR1000_PA_IDESECAUX), SZ_1M, MT_DEVICE }, /* slow, word */ - { VA_C3(VR1000_VA_DM9000), PA_CS3(VR1000_PA_DM9000), SZ_1M, MT_DEVICE }, { VA_C3(VR1000_VA_IDEPRI), PA_CS3(VR1000_PA_IDEPRI), SZ_1M, MT_DEVICE }, { VA_C3(VR1000_VA_IDESEC), PA_CS3(VR1000_PA_IDESEC), SZ_1M, MT_DEVICE }, { VA_C3(VR1000_VA_IDEPRIAUX), PA_CS3(VR1000_PA_IDEPRIAUX), SZ_1M, MT_DEVICE }, { VA_C3(VR1000_VA_IDESECAUX), PA_CS3(VR1000_PA_IDESECAUX), SZ_1M, MT_DEVICE }, /* fast, byte */ - { VA_C4(VR1000_VA_DM9000), PA_CS4(VR1000_PA_DM9000), SZ_1M, MT_DEVICE }, { VA_C4(VR1000_VA_IDEPRI), PA_CS5(VR1000_PA_IDEPRI), SZ_1M, MT_DEVICE }, { VA_C4(VR1000_VA_IDESEC), PA_CS5(VR1000_PA_IDESEC), SZ_1M, MT_DEVICE }, { VA_C4(VR1000_VA_IDEPRIAUX), PA_CS5(VR1000_PA_IDEPRIAUX), SZ_1M, MT_DEVICE }, { VA_C4(VR1000_VA_IDESECAUX), PA_CS5(VR1000_PA_IDESECAUX), SZ_1M, MT_DEVICE }, /* fast, word */ - { VA_C5(VR1000_VA_DM9000), PA_CS5(VR1000_PA_DM9000), SZ_1M, MT_DEVICE }, { VA_C5(VR1000_VA_IDEPRI), PA_CS5(VR1000_PA_IDEPRI), SZ_1M, MT_DEVICE }, { VA_C5(VR1000_VA_IDESEC), PA_CS5(VR1000_PA_IDESEC), SZ_1M, MT_DEVICE }, { VA_C5(VR1000_VA_IDEPRIAUX), PA_CS5(VR1000_PA_IDEPRIAUX), SZ_1M, MT_DEVICE }, @@ -246,6 +244,74 @@ static struct platform_device vr1000_nor = { .resource = vr1000_nor_resource, }; +/* DM9000 ethernet devices */ + +static struct resource vr1000_dm9k0_resource[] = { + [0] = { + .start = S3C2410_CS5 + VR1000_PA_DM9000, + .end = S3C2410_CS5 + VR1000_PA_DM9000 + 3, + .flags = IORESOURCE_MEM + 
}, + [1] = { + .start = S3C2410_CS5 + VR1000_PA_DM9000 + 0x40, + .end = S3C2410_CS5 + VR1000_PA_DM9000 + 0x7f, + .flags = IORESOURCE_MEM + }, + [2] = { + .start = IRQ_VR1000_DM9000A, + .end = IRQ_VR1000_DM9000A, + .flags = IORESOURCE_IRQ + } + +}; + +static struct resource vr1000_dm9k1_resource[] = { + [0] = { + .start = S3C2410_CS5 + VR1000_PA_DM9000 + 0x80, + .end = S3C2410_CS5 + VR1000_PA_DM9000 + 0x83, + .flags = IORESOURCE_MEM + }, + [1] = { + .start = S3C2410_CS5 + VR1000_PA_DM9000 + 0xC0, + .end = S3C2410_CS5 + VR1000_PA_DM9000 + 0xFF, + .flags = IORESOURCE_MEM + }, + [2] = { + .start = IRQ_VR1000_DM9000N, + .end = IRQ_VR1000_DM9000N, + .flags = IORESOURCE_IRQ + } +}; + +/* for the moment we limit ourselves to 16bit IO until some + * better IO routines can be written and tested +*/ + +struct dm9000_plat_data vr1000_dm9k_platdata = { + .flags = DM9000_PLATF_16BITONLY, +}; + +static struct platform_device vr1000_dm9k0 = { + .name = "dm9000", + .id = 0, + .num_resources = ARRAY_SIZE(vr1000_dm9k0_resource), + .resource = vr1000_dm9k0_resource, + .dev = { + .platform_data = &vr1000_dm9k_platdata, + } +}; + +static struct platform_device vr1000_dm9k1 = { + .name = "dm9000", + .id = 1, + .num_resources = ARRAY_SIZE(vr1000_dm9k1_resource), + .resource = vr1000_dm9k1_resource, + .dev = { + .platform_data = &vr1000_dm9k_platdata, + } +}; + +/* devices for this board */ static struct platform_device *vr1000_devices[] __initdata = { &s3c_device_usb, @@ -253,8 +319,11 @@ static struct platform_device *vr1000_devices[] __initdata = { &s3c_device_wdt, &s3c_device_i2c, &s3c_device_iis, + &s3c_device_adc, &serial_device, &vr1000_nor, + &vr1000_dm9k0, + &vr1000_dm9k1 }; static struct clk *vr1000_clocks[] = { From 691027b91be99413dc60fab0902b366434555015 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 23 Jun 2005 21:56:48 +0100 Subject: [PATCH 197/199] [PATCH] ARM: 2730/1: S3C2410 default configuration update Patch from Ben Dooks Add support for the DM9000 and bring default configuration up-to-date with the latest 2.6.12 kernel release Signed-off-by: Ben Dooks Signed-off-by: Russell King --- arch/arm/configs/s3c2410_defconfig | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/arm/configs/s3c2410_defconfig b/arch/arm/configs/s3c2410_defconfig index 2a63fb277196..98b72ff38832 100644 --- a/arch/arm/configs/s3c2410_defconfig +++ b/arch/arm/configs/s3c2410_defconfig @@ -1,14 +1,13 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.12-rc1-bk2 -# Sun Mar 27 17:47:45 2005 +# Linux kernel version: 2.6.12-git4 +# Wed Jun 22 15:56:42 2005 # CONFIG_ARM=y CONFIG_MMU=y CONFIG_UID16=y CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_GENERIC_IOMAP=y # # Code maturity level options @@ -17,6 +16,7 @@ CONFIG_EXPERIMENTAL=y # CONFIG_CLEAN_COMPILE is not set CONFIG_BROKEN=y CONFIG_BROKEN_ON_SMP=y +CONFIG_INIT_ENV_ARG_LIMIT=32 # # General setup @@ -35,6 +35,8 @@ CONFIG_KOBJECT_UEVENT=y CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_ALL is not set # CONFIG_KALLSYMS_EXTRA_PASS is not set +CONFIG_PRINTK=y +CONFIG_BUG=y CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_EPOLL=y @@ -81,6 +83,7 @@ CONFIG_ARCH_S3C2410=y # CONFIG_ARCH_VERSATILE is not set # CONFIG_ARCH_IMX is not set # CONFIG_ARCH_H720X is not set +# CONFIG_ARCH_AAEC2000 is not set # # S3C24XX Implementations @@ -134,6 +137,7 @@ CONFIG_CPU_TLB_V4WBI=y # # Bus support # +CONFIG_ISA_DMA_API=y # # PCCARD (PCMCIA/CardBus) support @@ -143,7 +147,9 @@ CONFIG_CPU_TLB_V4WBI=y # # 
From 67f7654ea1f11fac1cf4a33bf9a5d9079d122e70 Mon Sep 17 00:00:00 2001
From: Russell King
Date: Thu, 23 Jun 2005 22:26:43 +0100
Subject: [PATCH 198/199] [PATCH] Serial: Bugs are not capabilities

Signed-off-by: Russell King
---
 drivers/serial/8250.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index 341c644591ae..79f67fd863ec 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -1037,7 +1037,7 @@ static void serial8250_start_tx(struct uart_port *port, unsigned int tty_start)
 		up->ier |= UART_IER_THRI;
 		serial_out(up, UART_IER, up->ier);
 
-		if (up->capabilities & UART_BUG_TXEN) {
+		if (up->bugs & UART_BUG_TXEN) {
 			unsigned char lsr, iir;
 			lsr = serial_in(up, UART_LSR);
 			iir = serial_in(up, UART_IIR);
@@ -1564,13 +1564,13 @@ static int serial8250_startup(struct uart_port *port)
 	serial_outp(up, UART_IER, 0);
 
 	if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) {
-		if (!(up->capabilities & UART_BUG_TXEN)) {
-			up->capabilities |= UART_BUG_TXEN;
+		if (!(up->bugs & UART_BUG_TXEN)) {
+			up->bugs |= UART_BUG_TXEN;
 			pr_debug("ttyS%d - enabling bad tx status workarounds\n",
 				 port->line);
 		}
 	} else {
-		up->capabilities &= ~UART_BUG_TXEN;
+		up->bugs &= ~UART_BUG_TXEN;
 	}
 
 	spin_unlock_irqrestore(&up->port.lock, flags);
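The rename is mechanical, but the split it enforces is the point:
->capabilities holds the static features of a UART type, while ->bugs holds
defects detected at runtime, so setting or clearing the TXEN workaround can
never corrupt a capability bit. A reduced sketch of the pattern follows; the
struct is abbreviated and the bit value is assumed from the 8250 headers of
this era, so treat both as illustrative:

	struct sketch_8250_port {
		unsigned short	capabilities;	/* static per-type features */
		unsigned short	bugs;		/* defects found at runtime */
	};

	#define UART_BUG_TXEN	(1 << 1)	/* unreliable TX ready status */

	static void sketch_start_tx(struct sketch_8250_port *up)
	{
		if (up->bugs & UART_BUG_TXEN) {
			/* transmitter may not signal readiness: push the
			 * first character out by hand, as the patched
			 * serial8250_start_tx() does */
		}
	}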
From e608a8072b10258aa18c2e33324def225199ba1d Mon Sep 17 00:00:00 2001
From: David Mosberger-Tang
Date: Wed, 22 Jun 2005 22:24:00 -0700
Subject: [PATCH 199/199] [IA64] Fix pfn_to_nid() so the kernel compiles again
 for !CONFIG_DISCONTIGMEM.

Signed-off-by: David Mosberger-Tang
Signed-off-by: Tony Luck
---
 include/asm-ia64/mmzone.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/asm-ia64/mmzone.h b/include/asm-ia64/mmzone.h
index 83ca4043fc11..d32f51e3d6c2 100644
--- a/include/asm-ia64/mmzone.h
+++ b/include/asm-ia64/mmzone.h
@@ -15,6 +15,8 @@
 #include
 #include
 
+#ifdef CONFIG_DISCONTIGMEM
+
 static inline int pfn_to_nid(unsigned long pfn)
 {
 #ifdef CONFIG_NUMA
@@ -29,8 +31,6 @@ static inline int pfn_to_nid(unsigned long pfn)
 #endif
 }
 
-#ifdef CONFIG_DISCONTIGMEM
-
 #ifdef CONFIG_IA64_DIG /* DIG systems are small */
 # define MAX_PHYSNODE_ID	8
 # define NR_NODE_MEMBLKS	(MAX_NUMNODES * 8)
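For context, pfn_to_nid() only makes sense when the DISCONTIGMEM node tables
exist, which is why the definition moves inside the guard; with it defined
unconditionally, !CONFIG_DISCONTIGMEM builds failed to compile. A reduced
sketch of the resulting layout, with the function renamed to mark it as an
illustration and the body trimmed to the NUMA lookup (paddr_to_nid() is the
ia64 helper the real function calls):

	#include <asm/page.h>

	#ifdef CONFIG_DISCONTIGMEM

	static inline int sketch_pfn_to_nid(unsigned long pfn)
	{
	#ifdef CONFIG_NUMA
		extern int paddr_to_nid(unsigned long paddr);
		int nid = paddr_to_nid(pfn << PAGE_SHIFT);

		return (nid < 0) ? 0 : nid;
	#else
		return 0;
	#endif
	}

	#endif /* CONFIG_DISCONTIGMEM */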