commit db930c2399866e4f3611e575349b1da9e957f84d Author: Mihai Carabas Date: Sun Jul 1 22:18:25 2012 +0300 sbuf_vprintf: ap is not consistent at second call of kvsnprintf. Use a copy diff --git a/sys/kern/subr_sbuf.c b/sys/kern/subr_sbuf.c index f9dad56..023e2fe 100644 --- a/sys/kern/subr_sbuf.c +++ b/sys/kern/subr_sbuf.c @@ -400,7 +400,7 @@ int sbuf_vprintf(struct sbuf *s, const char *fmt, __va_list ap) { int len; - + __va_list ap_copy; assert_sbuf_integrity(s); assert_sbuf_state(s, 0); @@ -411,8 +411,10 @@ sbuf_vprintf(struct sbuf *s, const char *fmt, __va_list ap) return (-1); do { + __va_copy(ap_copy, ap); len = kvsnprintf(&s->s_buf[s->s_len], SBUF_FREESPACE(s) + 1, - fmt, ap); + fmt, ap_copy); + __va_end(ap_copy); } while (len > SBUF_FREESPACE(s) && sbuf_extend(s, len - SBUF_FREESPACE(s)) == 0); commit b0bbd261455dd9cdb5575610d5fe8a3975289ed8 Author: Mihai Carabas Date: Wed Aug 22 13:07:18 2012 +0000 ktr - add KTR_COND_LOG * KTR_COND_LOG provides conditional logging; the second parameter passed in is a condition which determines whether the entry is logged or not. * It provides a neater way to log conditionally than having to wrap a KTR in an if, which may then end up being empty if KTR is disabled. diff --git a/share/man/man9/ktr.9 b/share/man/man9/ktr.9 index 9c72cdf..c15f672 100644 --- a/share/man/man9/ktr.9 +++ b/share/man/man9/ktr.9 @@ -24,14 +24,15 @@ .\" .\" $FreeBSD: src/share/man/man9/ktr.9,v 1.8 2005/03/08 01:37:36 hmp Exp $ .\" -.Dd January 2, 2012 +.Dd August 22, 2012 .Dt KTR 9 .Os .Sh NAME .Nm KTR_INFO_MASTER , .Nm KTR_INFO_MASTER_EXTERN , .Nm KTR_INFO , -.Nm KTR_LOG +.Nm KTR_LOG , +.Nm KTR_COND_LOG .Nd kernel tracing facility .Sh SYNOPSIS .In sys/ktr.h @@ -42,6 +43,7 @@ .Fn KTR_INFO_MASTER_EXTERN "master" .Fn KTR_INFO "compile" "master" "name" "maskbit" "format" "type name" "..." .Fn KTR_LOG "info" "arg ..." +.Fn KTR_COND_LOG "info" "cond" "arg ..." .Sh DESCRIPTION The .Nm ktr @@ -104,6 +106,14 @@ string passed to the associated call. 
.Pp The +.Fn KTR_COND_LOG +macro is equivalent to +.Fn KTR_LOG +except it logs only when the condition specified in +.Fa cond +evaluates to true. +.Pp +The .Va ktr_entries variable contains the number of entries in the .Va ktr_buf @@ -129,6 +139,7 @@ This example demonstrates a simple usage of the KTR facility: KTR_INFO_MASTER(foo); KTR_INFO(KTR_FOO, foo, func1, 0, "func1()"); KTR_INFO(KTR_FOO, foo, func2, 1, "func2(%d)", int arg); +KTR_INFO(KTR_FOO, foo, func3, 2, "func3: arg positive: %d", int arg); \&... @@ -145,6 +156,13 @@ func2(int arg) KTR_LOG(foo_func2, arg); ... } + +void +func3(int arg) +{ + KTR_COND_LOG(foo_func3, arg >= 0, arg); + ... +} .Ed .Sh SEE ALSO .Xr gdb 1 , diff --git a/sys/sys/ktr.h b/sys/sys/ktr.h index 8becde3..7b32bee 100644 --- a/sys/sys/ktr.h +++ b/sys/sys/ktr.h @@ -197,6 +197,25 @@ SYSCTL_DECL(_debug_ktr); } \ } while(0) +#define KTR_COND_LOG(name, cond, ...) \ + do { \ + __ktr_info_ ## name ## _fmtcheck (__ktr_ ## name ## _fmt, ##__VA_ARGS__); \ + if ((cond) && \ + ktr_ ## name ## _enable && \ + (ktr_ ## name ## _mask & *ktr_info_ ## name .kf_master_enable)) { \ + struct ktr_entry *entry; \ + entry = ktr_begin_write_entry(&ktr_info_ ## name, __FILE__, __LINE__); \ + if (!entry) \ + break; \ + *(struct ktr_info_ ## name ## _args *)&entry->ktr_data[0] = \ + (struct ktr_info_ ## name ## _args){ __VA_ARGS__}; \ + if (ktr_finish_write_entry(&ktr_info_ ## name, entry)) { \ + kprintf(ktr_info_ ## name .kf_format, ##__VA_ARGS__); \ + kprintf("\n"); \ + } \ + } \ + } while(0) + #else #define KTR_INFO_MASTER(master) \ @@ -211,6 +230,8 @@ SYSCTL_DECL(_debug_ktr); #define KTR_LOG(info, args...) +#define KTR_COND_LOG(info, args...) + #endif #endif /* !LOCORE */ commit f77c018a1c4b5e9271cf5fcf3912c2ccbea9c0e1 Author: Mihai Carabas Date: Wed Aug 22 10:02:58 2012 +0000 CPU topology support * Part of "Add SMT/HT awareness to DragonFly BSD scheduler" GSoC project. 
* Details at: http://leaf.dragonflybsd.org/mailarchive/kernel/2012-08/msg00009.html Mentored-by: Alex Hornung (alexh@) Sponsored-by: Google Summer of Code 2012 diff --git a/sys/conf/files b/sys/conf/files index dcd1cad..c21ceca 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -972,6 +972,7 @@ kern/vfs_vopops.c standard kern/vfs_vfsops.c standard kern/kern_threads.c standard kern/vfs_aio.c standard +kern/subr_cpu_topology.c standard vfs/deadfs/dead_vnops.c standard vfs/fdesc/fdesc_vfsops.c optional fdesc vfs/fdesc/fdesc_vnops.c optional fdesc diff --git a/sys/cpu/i386/include/specialreg.h b/sys/cpu/i386/include/specialreg.h index cd60f26..537f166 100644 --- a/sys/cpu/i386/include/specialreg.h +++ b/sys/cpu/i386/include/specialreg.h @@ -184,6 +184,7 @@ #define CPUID_BRAND_INDEX 0x000000ff #define CPUID_CLFUSH_SIZE 0x0000ff00 #define CPUID_HTT_CORES 0x00ff0000 +#define CPUID_HTT_CORE_SHIFT 16 #define CPUID_LOCAL_APIC_ID 0xff000000 /* @@ -194,6 +195,27 @@ #define CPUID_TYPE_CORE 2 /* + * INTEL Deterministic Cache Parameters + * (Function 04h) + */ +#define FUNC_4_MAX_CORE_NO(eax) ((((eax) >> 26) & 0x3f)) + +/* + * INTEL x2APIC Features / Processor topology + * (Function 0Bh) + */ +#define FUNC_B_THREAD_LEVEL 0 + +#define FUNC_B_INVALID_TYPE 0 +#define FUNC_B_THREAD_TYPE 1 +#define FUNC_B_CORE_TYPE 2 + +#define FUNC_B_TYPE(ecx) (((ecx) >> 8) & 0xff) +#define FUNC_B_BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) +#define FUNC_B_LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) + + +/* * AMD extended function 8000_0007h edx info */ #define AMDPM_TS 0x00000001 @@ -210,6 +232,8 @@ * AMD extended function 8000_0008h ecx info */ #define AMDID_CMP_CORES 0x000000ff +#define AMDID_COREID_SIZE 0x0000f000 +#define AMDID_COREID_SIZE_SHIFT 12 /* * CPUID manufacturers identifiers diff --git a/sys/cpu/x86_64/include/specialreg.h b/sys/cpu/x86_64/include/specialreg.h index 77d4a84..b039cc9 100644 --- a/sys/cpu/x86_64/include/specialreg.h +++ b/sys/cpu/x86_64/include/specialreg.h @@ -186,12 
+186,35 @@ #define CPUID_BRAND_INDEX 0x000000ff #define CPUID_CLFUSH_SIZE 0x0000ff00 #define CPUID_HTT_CORES 0x00ff0000 +#define CPUID_HTT_CORE_SHIFT 16 #define CPUID_LOCAL_APIC_ID 0xff000000 /* * AMD extended function 8000_0008h ecx info */ #define AMDID_CMP_CORES 0x000000ff +#define AMDID_COREID_SIZE 0x0000f000 +#define AMDID_COREID_SIZE_SHIFT 12 + +/* + * INTEL Deterministic Cache Parameters + * (Function 04h) + */ +#define FUNC_4_MAX_CORE_NO(eax) ((((eax) >> 26) & 0x3f)) + +/* + * INTEL x2APIC Features / Processor topology + * (Function 0Bh) + */ +#define FUNC_B_THREAD_LEVEL 0 + +#define FUNC_B_INVALID_TYPE 0 +#define FUNC_B_THREAD_TYPE 1 +#define FUNC_B_CORE_TYPE 2 + +#define FUNC_B_TYPE(ecx) (((ecx) >> 8) & 0xff) +#define FUNC_B_BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) +#define FUNC_B_LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) /* * CPUID manufacturers identifiers diff --git a/sys/kern/subr_cpu_topology.c b/sys/kern/subr_cpu_topology.c new file mode 100644 index 0000000..3efd419 --- /dev/null +++ b/sys/kern/subr_cpu_topology.c @@ -0,0 +1,570 @@ +/* + * Copyright (c) 2012 The DragonFly Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include + +#include + +#ifdef SMP + +#ifndef NAPICID +#define NAPICID 256 +#endif + +#define INDENT_BUF_SIZE LEVEL_NO*3 +#define INVALID_ID -1 + +/* Per-cpu sysctl nodes and info */ +struct per_cpu_sysctl_info { + struct sysctl_ctx_list sysctl_ctx; + struct sysctl_oid *sysctl_tree; + char cpu_name[32]; + int physical_id; + int core_id; + char physical_siblings[8*MAXCPU]; + char core_siblings[8*MAXCPU]; +}; +typedef struct per_cpu_sysctl_info per_cpu_sysctl_info_t; + +static cpu_node_t cpu_topology_nodes[MAXCPU]; /* Memory for topology */ +static cpu_node_t *cpu_root_node; /* Root node pointer */ + +static struct sysctl_ctx_list cpu_topology_sysctl_ctx; +static struct sysctl_oid *cpu_topology_sysctl_tree; +static char cpu_topology_members[8*MAXCPU]; +static per_cpu_sysctl_info_t pcpu_sysctl[MAXCPU]; + +int cpu_topology_levels_number = 1; + +/* Get the next valid apicid starting + * from current apicid (curr_apicid) + */ +static int +get_next_valid_apicid(int curr_apicid) +{ + int next_apicid = curr_apicid; + do { + next_apicid++; + } + while(get_cpuid_from_apicid(next_apicid) == -1 && + next_apicid < NAPICID); 
+ if (next_apicid == NAPICID) { + kprintf("Warning: No next valid APICID found. Returning -1\n"); + return -1; + } + return next_apicid; +} + +/* Generic topology tree. The parameters have the following meaning: + * - children_no_per_level : the number of children on each level + * - level_types : the type of the level (THREAD, CORE, CHIP, etc) + * - cur_level : the current level of the tree + * - node : the current node + * - last_free_node : the last free node in the global array. + * - apicid : basically these are the APIC ids of the leaves + */ +static void +build_topology_tree(int *children_no_per_level, + uint8_t *level_types, + int cur_level, + cpu_node_t *node, + cpu_node_t **last_free_node, + int *apicid) +{ + int i; + + node->child_no = children_no_per_level[cur_level]; + node->type = level_types[cur_level]; + node->members = 0; + + if (node->child_no == 0) { + node->child_node = NULL; + *apicid = get_next_valid_apicid(*apicid); + node->members = CPUMASK(get_cpuid_from_apicid(*apicid)); + return; + } + + node->child_node = *last_free_node; + (*last_free_node) += node->child_no; + + for (i = 0; i < node->child_no; i++) { + + node->child_node[i].parent_node = node; + + build_topology_tree(children_no_per_level, + level_types, + cur_level + 1, + &(node->child_node[i]), + last_free_node, + apicid); + + node->members |= node->child_node[i].members; + } +} + +/* Build CPU topology. The detection is made by comparing the + * chip, core and logical IDs of each CPU with the IDs of the + * BSP. When we find a match, at that level the CPUs are siblings. + */ +static cpu_node_t * +build_cpu_topology(void) +{ + detect_cpu_topology(); + int i; + int BSPID = 0; + int threads_per_core = 0; + int cores_per_chip = 0; + int chips_per_package = 0; + int children_no_per_level[LEVEL_NO]; + uint8_t level_types[LEVEL_NO]; + int apicid = -1; + + cpu_node_t *root = &cpu_topology_nodes[0]; + cpu_node_t *last_free_node = root + 1; + + /* Assume that the topology is uniform. 
+ * Find the number of siblings within chip + * and within core to build up the topology + */ + for (i = 0; i < ncpus; i++) { + + cpumask_t mask = CPUMASK(i); + + if ((mask & smp_active_mask) == 0) + continue; + + if (get_chip_ID(BSPID) == get_chip_ID(i)) + cores_per_chip++; + else + continue; + + if (get_core_number_within_chip(BSPID) == + get_core_number_within_chip(i)) + threads_per_core++; + } + + cores_per_chip /= threads_per_core; + chips_per_package = ncpus / (cores_per_chip * threads_per_core); + + if (bootverbose) + kprintf("CPU Topology: cores_per_chip: %d; threads_per_core: %d; chips_per_package: %d;\n", + cores_per_chip, threads_per_core, chips_per_package); + + if (threads_per_core > 1) { /* HT available - 4 levels */ + + children_no_per_level[0] = chips_per_package; + children_no_per_level[1] = cores_per_chip; + children_no_per_level[2] = threads_per_core; + children_no_per_level[3] = 0; + + level_types[0] = PACKAGE_LEVEL; + level_types[1] = CHIP_LEVEL; + level_types[2] = CORE_LEVEL; + level_types[3] = THREAD_LEVEL; + + build_topology_tree(children_no_per_level, + level_types, + 0, + root, + &last_free_node, + &apicid); + + cpu_topology_levels_number = 4; + + } else if (cores_per_chip > 1) { /* No HT available - 3 levels */ + + children_no_per_level[0] = chips_per_package; + children_no_per_level[1] = cores_per_chip; + children_no_per_level[2] = 0; + + level_types[0] = PACKAGE_LEVEL; + level_types[1] = CHIP_LEVEL; + level_types[2] = CORE_LEVEL; + + build_topology_tree(children_no_per_level, + level_types, + 0, + root, + &last_free_node, + &apicid); + + cpu_topology_levels_number = 3; + + } else { /* No HT and no Multi-Core - 2 levels */ + + children_no_per_level[0] = chips_per_package; + children_no_per_level[1] = 0; + + level_types[0] = PACKAGE_LEVEL; + level_types[1] = CHIP_LEVEL; + + build_topology_tree(children_no_per_level, + level_types, + 0, + root, + &last_free_node, + &apicid); + + cpu_topology_levels_number = 2; + + } + + return root; +} + 
+/* Recursive function helper to print the CPU topology tree */ +static void +print_cpu_topology_tree_sysctl_helper(cpu_node_t *node, + struct sbuf *sb, + char * buf, + int buf_len, + int last) +{ + int i; + int bsr_member; + + sbuf_bcat(sb, buf, buf_len); + if (last) { + sbuf_printf(sb, "\\-"); + buf[buf_len] = ' ';buf_len++; + buf[buf_len] = ' ';buf_len++; + } else { + sbuf_printf(sb, "|-"); + buf[buf_len] = '|';buf_len++; + buf[buf_len] = ' ';buf_len++; + } + + bsr_member = BSRCPUMASK(node->members); + + if (node->type == PACKAGE_LEVEL) { + sbuf_printf(sb,"PACKAGE MEMBERS: "); + } else if (node->type == CHIP_LEVEL) { + sbuf_printf(sb,"CHIP ID %d: ", + get_chip_ID(bsr_member)); + } else if (node->type == CORE_LEVEL) { + sbuf_printf(sb,"CORE ID %d: ", + get_core_number_within_chip(bsr_member)); + } else if (node->type == THREAD_LEVEL) { + sbuf_printf(sb,"THREAD ID %d: ", + get_logical_CPU_number_within_core(bsr_member)); + } else { + sbuf_printf(sb,"UNKNOWN: "); + } + CPUSET_FOREACH(i, node->members) { + sbuf_printf(sb,"cpu%d ", i); + } + + sbuf_printf(sb,"\n"); + + for (i = 0; i < node->child_no; i++) { + print_cpu_topology_tree_sysctl_helper(&(node->child_node[i]), + sb, buf, buf_len, i == (node->child_no -1)); + } +} + +/* SYSCTL PROCEDURE for printing the CPU Topology tree */ +static int +print_cpu_topology_tree_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct sbuf *sb; + int ret; + char buf[INDENT_BUF_SIZE]; + + KASSERT(cpu_root_node != NULL, ("cpu_root_node isn't initialized")); + + sb = sbuf_new(NULL, NULL, 500, SBUF_AUTOEXTEND); + if (sb == NULL) { + return (ENOMEM); + } + sbuf_printf(sb,"\n"); + print_cpu_topology_tree_sysctl_helper(cpu_root_node, sb, buf, 0, 1); + + sbuf_finish(sb); + + ret = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); + + sbuf_delete(sb); + + return ret; +} + +/* SYSCTL PROCEDURE for printing the CPU Topology level description */ +static int +print_cpu_topology_level_description_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct sbuf *sb; + int ret; + + 
sb = sbuf_new(NULL, NULL, 500, SBUF_AUTOEXTEND); + if (sb == NULL) + return (ENOMEM); + + if (cpu_topology_levels_number == 4) /* HT available */ + sbuf_printf(sb, "0 - thread; 1 - core; 2 - socket; 3 - anything"); + else if (cpu_topology_levels_number == 3) /* No HT available */ + sbuf_printf(sb, "0 - core; 1 - socket; 2 - anything"); + else if (cpu_topology_levels_number == 2) /* No HT and no Multi-Core */ + sbuf_printf(sb, "0 - socket; 1 - anything"); + else + sbuf_printf(sb, "Unknown"); + + sbuf_finish(sb); + + ret = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); + + sbuf_delete(sb); + + return ret; +} + +/* Find a cpu_node_t by a mask */ +static cpu_node_t * +get_cpu_node_by_cpumask(cpu_node_t * node, + cpumask_t mask) { + + cpu_node_t * found = NULL; + int i; + + if (node->members == mask) { + return node; + } + + for (i = 0; i < node->child_no; i++) { + found = get_cpu_node_by_cpumask(&(node->child_node[i]), mask); + if (found != NULL) { + return found; + } + } + return NULL; +} + +cpu_node_t * +get_cpu_node_by_cpuid(int cpuid) { + cpumask_t mask = CPUMASK(cpuid); + + KASSERT(cpu_root_node != NULL, ("cpu_root_node isn't initialized")); + + return get_cpu_node_by_cpumask(cpu_root_node, mask); +} + +/* Get the mask of siblings for level_type of a cpuid */ +cpumask_t +get_cpumask_from_level(int cpuid, + uint8_t level_type) +{ + cpu_node_t * node; + cpumask_t mask = CPUMASK(cpuid); + + KASSERT(cpu_root_node != NULL, ("cpu_root_node isn't initialized")); + + node = get_cpu_node_by_cpumask(cpu_root_node, mask); + + if (node == NULL) { + return 0; + } + + while (node != NULL) { + if (node->type == level_type) { + return node->members; + } + node = node->parent_node; + } + + return 0; +} + +/* init pcpu_sysctl structure info */ +static void +init_pcpu_topology_sysctl(void) +{ + int cpu; + int i; + cpumask_t mask; + struct sbuf sb; + + for (i = 0; i < ncpus; i++) { + + sbuf_new(&sb, pcpu_sysctl[i].cpu_name, + sizeof(pcpu_sysctl[i].cpu_name), SBUF_FIXEDLEN); + 
sbuf_printf(&sb,"cpu%d", i); + sbuf_finish(&sb); + + + /* Get physical siblings */ + mask = get_cpumask_from_level(i, CHIP_LEVEL); + if (mask == 0) { + pcpu_sysctl[i].physical_id = INVALID_ID; + continue; + } + + sbuf_new(&sb, pcpu_sysctl[i].physical_siblings, + sizeof(pcpu_sysctl[i].physical_siblings), SBUF_FIXEDLEN); + CPUSET_FOREACH(cpu, mask) { + sbuf_printf(&sb,"cpu%d ", cpu); + } + sbuf_trim(&sb); + sbuf_finish(&sb); + + pcpu_sysctl[i].physical_id = get_chip_ID(i); + + /* Get core siblings */ + mask = get_cpumask_from_level(i, CORE_LEVEL); + if (mask == 0) { + pcpu_sysctl[i].core_id = INVALID_ID; + continue; + } + + sbuf_new(&sb, pcpu_sysctl[i].core_siblings, + sizeof(pcpu_sysctl[i].core_siblings), SBUF_FIXEDLEN); + CPUSET_FOREACH(cpu, mask) { + sbuf_printf(&sb,"cpu%d ", cpu); + } + sbuf_trim(&sb); + sbuf_finish(&sb); + + pcpu_sysctl[i].core_id = get_core_number_within_chip(i); + + } +} + +/* Build SYSCTL structure for revealing + * the CPU Topology to user-space. + */ +static void +build_sysctl_cpu_topology(void) +{ + int i; + struct sbuf sb; + + /* SYSCTL new leaf for "cpu_topology" */ + sysctl_ctx_init(&cpu_topology_sysctl_ctx); + cpu_topology_sysctl_tree = SYSCTL_ADD_NODE(&cpu_topology_sysctl_ctx, + SYSCTL_STATIC_CHILDREN(_hw), + OID_AUTO, + "cpu_topology", + CTLFLAG_RD, 0, ""); + + /* SYSCTL cpu_topology "tree" entry */ + SYSCTL_ADD_PROC(&cpu_topology_sysctl_ctx, + SYSCTL_CHILDREN(cpu_topology_sysctl_tree), + OID_AUTO, "tree", CTLTYPE_STRING | CTLFLAG_RD, + NULL, 0, print_cpu_topology_tree_sysctl, "A", + "Tree print of CPU topology"); + + /* SYSCTL cpu_topology "level_description" entry */ + SYSCTL_ADD_PROC(&cpu_topology_sysctl_ctx, + SYSCTL_CHILDREN(cpu_topology_sysctl_tree), + OID_AUTO, "level_description", CTLTYPE_STRING | CTLFLAG_RD, + NULL, 0, print_cpu_topology_level_description_sysctl, "A", + "Level description of CPU topology"); + + /* SYSCTL cpu_topology "members" entry */ + sbuf_new(&sb, cpu_topology_members, + sizeof(cpu_topology_members), 
SBUF_FIXEDLEN); + CPUSET_FOREACH(i, cpu_root_node->members) { + sbuf_printf(&sb,"cpu%d ", i); + } + sbuf_trim(&sb); + sbuf_finish(&sb); + SYSCTL_ADD_STRING(&cpu_topology_sysctl_ctx, + SYSCTL_CHILDREN(cpu_topology_sysctl_tree), + OID_AUTO, "members", CTLFLAG_RD, + cpu_topology_members, 0, + "Members of the CPU Topology"); + + /* SYSCTL per_cpu info */ + for (i = 0; i < ncpus; i++) { + /* New leaf : hw.cpu_topology.cpux */ + sysctl_ctx_init(&pcpu_sysctl[i].sysctl_ctx); + pcpu_sysctl[i].sysctl_tree = SYSCTL_ADD_NODE(&pcpu_sysctl[i].sysctl_ctx, + SYSCTL_CHILDREN(cpu_topology_sysctl_tree), + OID_AUTO, + pcpu_sysctl[i].cpu_name, + CTLFLAG_RD, 0, ""); + + /* Check if the physical_id found is valid */ + if (pcpu_sysctl[i].physical_id == INVALID_ID) { + continue; + } + + /* Add physical id info */ + SYSCTL_ADD_INT(&pcpu_sysctl[i].sysctl_ctx, + SYSCTL_CHILDREN(pcpu_sysctl[i].sysctl_tree), + OID_AUTO, "physical_id", CTLFLAG_RD, + &pcpu_sysctl[i].physical_id, 0, + "Physical ID"); + + /* Add physical siblings */ + SYSCTL_ADD_STRING(&pcpu_sysctl[i].sysctl_ctx, + SYSCTL_CHILDREN(pcpu_sysctl[i].sysctl_tree), + OID_AUTO, "physical_siblings", CTLFLAG_RD, + pcpu_sysctl[i].physical_siblings, 0, + "Physical siblings"); + + /* Check if the core_id found is valid */ + if (pcpu_sysctl[i].core_id == INVALID_ID) { + continue; + } + + /* Add core id info */ + SYSCTL_ADD_INT(&pcpu_sysctl[i].sysctl_ctx, + SYSCTL_CHILDREN(pcpu_sysctl[i].sysctl_tree), + OID_AUTO, "core_id", CTLFLAG_RD, + &pcpu_sysctl[i].core_id, 0, + "Core ID"); + + /*Add core siblings */ + SYSCTL_ADD_STRING(&pcpu_sysctl[i].sysctl_ctx, + SYSCTL_CHILDREN(pcpu_sysctl[i].sysctl_tree), + OID_AUTO, "core_siblings", CTLFLAG_RD, + pcpu_sysctl[i].core_siblings, 0, + "Core siblings"); + } +} + +/* Build the CPU Topology and SYSCTL Topology tree */ +static void +init_cpu_topology(void) +{ + cpu_root_node = build_cpu_topology(); + + init_pcpu_topology_sysctl(); + build_sysctl_cpu_topology(); +} +SYSINIT(cpu_topology, SI_BOOT2_CPU_TOPOLOGY, 
SI_ORDER_FIRST, + init_cpu_topology, NULL) +#endif diff --git a/sys/platform/pc32/i386/identcpu.c b/sys/platform/pc32/i386/identcpu.c index 2daa75f..e5c9ab1 100644 --- a/sys/platform/pc32/i386/identcpu.c +++ b/sys/platform/pc32/i386/identcpu.c @@ -679,8 +679,9 @@ printcpuinfo(void) if (cpu_vendor_id == CPU_VENDOR_CYRIX) kprintf(" DIR=0x%04x", cyrix_did); if (cpu_high > 0) { +#if 0 u_int cmp = 1, htt = 1; - +#endif /* * Here we should probably set up flags indicating * whether or not various features are available. @@ -851,14 +852,24 @@ printcpuinfo(void) if (cpu_vendor_id == CPU_VENDOR_CENTAUR) print_via_padlock_info(); + /* + * INVALID CPU TOPOLOGY INFORMATION PRINT + * DEPRECATED - CPU_TOPOLOGY_DETECTION moved to + * - sys/platform/pc64/x86_64/mp_machdep.c + * - sys/kern/subr_cpu_topology + */ + +#if 0 if ((cpu_feature & CPUID_HTT) && cpu_vendor_id == CPU_VENDOR_AMD) cpu_feature &= ~CPUID_HTT; +#endif /* * If this CPU supports HTT or CMP then mention the * number of physical/logical cores it contains. */ +#if 0 if (cpu_feature & CPUID_HTT) htt = (cpu_procinfo & CPUID_HTT_CORES) >> 16; if (cpu_vendor_id == CPU_VENDOR_AMD && @@ -870,12 +881,13 @@ printcpuinfo(void) if ((regs[0] & 0x1f) != 0) cmp = ((regs[0] >> 26) & 0x3f) + 1; } - +#endif #ifdef foo /* * XXX For Intel CPUs, this is max number of cores per * package, not the actual cores per package. */ +#if 0 cpu_cores = cmp; cpu_logical = htt / cmp; @@ -886,6 +898,7 @@ printcpuinfo(void) cpu_logical); } #endif +#endif #if 0 /* diff --git a/sys/platform/pc32/i386/mp_machdep.c b/sys/platform/pc32/i386/mp_machdep.c index 8fc44d2..ac89595 100644 --- a/sys/platform/pc32/i386/mp_machdep.c +++ b/sys/platform/pc32/i386/mp_machdep.c @@ -178,6 +178,11 @@ static cpumask_t smp_lapic_mask = 1; /* which cpus have lapic been inited */ cpumask_t smp_active_mask = 1; /* which cpus are ready for IPIs etc? 
*/ SYSCTL_INT(_machdep, OID_AUTO, smp_active, CTLFLAG_RD, &smp_active_mask, 0, ""); +/* Local data for detecting CPU TOPOLOGY */ +static int core_bits = 0; +static int logical_CPU_bits = 0; + + /* * Calculate usable address in base memory for AP trampoline code. */ @@ -1092,3 +1097,177 @@ mp_bsp_simple_setup(void) if (cpu_feature & CPUID_TSC) tsc0_offset = rdtsc(); } + + +/* + * CPU TOPOLOGY DETECTION FUNCTIONS + */ + +/* Detect intel topology using CPUID + * Ref: http://www.intel.com/Assets/PDF/appnote/241618.pdf, pg 41 + */ +static void +detect_intel_topology(int count_htt_cores) +{ + int shift = 0; + int ecx_index = 0; + int core_plus_logical_bits = 0; + int cores_per_package; + int logical_per_package; + int logical_per_core; + unsigned int p[4]; + + if (cpu_high >= 0xb) { + goto FUNC_B; + + } else if (cpu_high >= 0x4) { + goto FUNC_4; + + } else { + core_bits = 0; + for (shift = 0; (1 << shift) < count_htt_cores; ++shift) + ; + logical_CPU_bits = 1 << shift; + return; + } + +FUNC_B: + cpuid_count(0xb, FUNC_B_THREAD_LEVEL, p); + + /* if 0xb not supported - fallback to 0x4 */ + if (p[1] == 0 || (FUNC_B_TYPE(p[2]) != FUNC_B_THREAD_TYPE)) { + goto FUNC_4; + } + + logical_CPU_bits = FUNC_B_BITS_SHIFT_NEXT_LEVEL(p[0]); + + ecx_index = FUNC_B_THREAD_LEVEL + 1; + do { + cpuid_count(0xb, ecx_index, p); + /* Check for the Core type in the implemented sub leaves. 
*/ + if (FUNC_B_TYPE(p[2]) == FUNC_B_CORE_TYPE) { + core_plus_logical_bits = FUNC_B_BITS_SHIFT_NEXT_LEVEL(p[0]); + break; + } + ecx_index++; + } while (FUNC_B_TYPE(p[2]) != FUNC_B_INVALID_TYPE); + + core_bits = core_plus_logical_bits - logical_CPU_bits; + + return; + +FUNC_4: + cpuid_count(0x4, 0, p); + cores_per_package = FUNC_4_MAX_CORE_NO(p[0]) + 1; + + logical_per_package = count_htt_cores; + logical_per_core = logical_per_package / cores_per_package; + + for (shift = 0; (1 << shift) < logical_per_core; ++shift) + ; + logical_CPU_bits = shift; + + for (shift = 0; (1 << shift) < cores_per_package; ++shift) + ; + core_bits = shift; + + return; +} + +/* Detect AMD topology using CPUID + * Ref: http://support.amd.com/us/Embedded_TechDocs/25481.pdf, last page + */ +static void +detect_amd_topology(int count_htt_cores) +{ + int shift = 0; + + if ((cpu_feature & CPUID_HTT) + && (amd_feature2 & AMDID2_CMP)) { + + if (cpu_procinfo2 & AMDID_COREID_SIZE) { + core_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) + >> AMDID_COREID_SIZE_SHIFT; + } else { + core_bits = (cpu_procinfo2 & AMDID_CMP_CORES) + 1; + for (shift = 0; (1 << shift) < core_bits; ++shift); + core_bits = shift; + } + + logical_CPU_bits = count_htt_cores >> core_bits; + for (shift = 0; (1 << shift) < logical_CPU_bits; ++shift) + ; + logical_CPU_bits = shift; + } else { + for (shift = 0; (1 << shift) < count_htt_cores; ++shift) + ; + core_bits = shift; + logical_CPU_bits = 0; + } +} + +/* Calculate + * - logical_CPU_bits + * - core_bits + * With the values above (for AMD or INTEL) we are able to generally + * detect the CPU topology (number of cores for each level): + * Ref: http://wiki.osdev.org/Detecting_CPU_Topology_(80x86) + * Ref: http://www.multicoreinfo.com/research/papers/whitepapers/Intel-detect-topology.pdf + */ +void +detect_cpu_topology(void) +{ + static int topology_detected = 0; + int count = 0; + + if (topology_detected) { + goto OUT; + } + + if ((cpu_feature & CPUID_HTT) == 0) { + core_bits = 0; + 
logical_CPU_bits = 0; + goto OUT; + } else { + count = (cpu_procinfo & CPUID_HTT_CORES) + >> CPUID_HTT_CORE_SHIFT; + } + + if (cpu_vendor_id == CPU_VENDOR_INTEL) { + detect_intel_topology(count); + } else if (cpu_vendor_id == CPU_VENDOR_AMD) { + detect_amd_topology(count); + } + +OUT: + if (bootverbose) + kprintf("BITS within APICID: logical_CPU_bits: %d; core_bits: %d\n", + logical_CPU_bits, core_bits); + + topology_detected = 1; +} + +/* Interface functions to calculate chip_ID, + * core_number and logical_number + * Ref: http://wiki.osdev.org/Detecting_CPU_Topology_(80x86) + */ +int +get_chip_ID(int cpuid) +{ + return get_apicid_from_cpuid(cpuid) >> + (logical_CPU_bits + core_bits); +} + +int +get_core_number_within_chip(int cpuid) +{ + return (get_apicid_from_cpuid(cpuid) >> logical_CPU_bits) & + ( (1 << core_bits) -1); +} + +int +get_logical_CPU_number_within_core(int cpuid) +{ + return get_apicid_from_cpuid(cpuid) & + ( (1 << logical_CPU_bits) -1); +} diff --git a/sys/platform/pc32/include/smp.h b/sys/platform/pc32/include/smp.h index 2d4dc67..a46df3a 100644 --- a/sys/platform/pc32/include/smp.h +++ b/sys/platform/pc32/include/smp.h @@ -74,6 +74,24 @@ int cpu_send_ipiq_passive (int); /* global data in init_smp.c */ extern cpumask_t smp_active_mask; +/* Detect CPU topology bits */ +void detect_cpu_topology(void); + +/* Interface functions for IDs calculation */ +int get_chip_ID(int cpuid); +int get_core_number_within_chip(int cpuid); +int get_logical_CPU_number_within_core(int cpuid); + +#include +static __inline +int get_apicid_from_cpuid(int cpuid) { + return CPUID_TO_APICID(cpuid); +} +static __inline +int get_cpuid_from_apicid(int cpuid) { + return APICID_TO_CPUID(cpuid); +} + #endif /* !LOCORE */ #else /* !SMP */ diff --git a/sys/platform/pc64/include/smp.h b/sys/platform/pc64/include/smp.h index 8b62501..42f8451 100644 --- a/sys/platform/pc64/include/smp.h +++ b/sys/platform/pc64/include/smp.h @@ -83,6 +83,24 @@ int cpu_send_ipiq_passive (int); /* global 
data in init_smp.c */ extern cpumask_t smp_active_mask; +/* Detect CPU topology bits */ +void detect_cpu_topology(void); + +/* Interface functions for IDs calculation */ +int get_chip_ID(int cpuid); +int get_core_number_within_chip(int cpuid); +int get_logical_CPU_number_within_core(int cpuid); + +#include +static __inline +int get_apicid_from_cpuid(int cpuid) { + return CPUID_TO_APICID(cpuid); +} +static __inline +int get_cpuid_from_apicid(int apicid) { + return APICID_TO_CPUID(apicid); +} + #endif /* !LOCORE */ #else /* !SMP */ diff --git a/sys/platform/pc64/x86_64/identcpu.c b/sys/platform/pc64/x86_64/identcpu.c index 9aea607..2cd4ad6 100644 --- a/sys/platform/pc64/x86_64/identcpu.c +++ b/sys/platform/pc64/x86_64/identcpu.c @@ -190,7 +190,9 @@ printcpuinfo(void) cpu_vendor_id == CPU_VENDOR_CENTAUR) { kprintf(" Stepping = %u", cpu_id & 0xf); if (cpu_high > 0) { +#if 0 u_int cmp = 1, htt = 1; +#endif /* * Here we should probably set up flags indicating @@ -361,15 +363,24 @@ printcpuinfo(void) if (cpu_vendor_id == CPU_VENDOR_CENTAUR) print_via_padlock_info(); + /* + * INVALID CPU TOPOLOGY INFORMATION PRINT + * DEPRECATED - CPU_TOPOLOGY_DETECTION moved to + * - sys/platform/pc64/x86_64/mp_machdep.c + * - sys/kern/subr_cpu_topology + */ +#if 0 if ((cpu_feature & CPUID_HTT) && cpu_vendor_id == CPU_VENDOR_AMD) cpu_feature &= ~CPUID_HTT; +#endif /* * If this CPU supports HTT or CMP then mention the * number of physical/logical cores it contains. */ +#if 0 if (cpu_feature & CPUID_HTT) htt = (cpu_procinfo & CPUID_HTT_CORES) >> 16; if (cpu_vendor_id == CPU_VENDOR_AMD && @@ -381,12 +392,13 @@ printcpuinfo(void) if ((regs[0] & 0x1f) != 0) cmp = ((regs[0] >> 26) & 0x3f) + 1; } - +#endif #ifdef foo /* * XXX For Intel CPUs, this is max number of cores per * package, not the actual cores per package. 
*/ +#if 0 cpu_cores = cmp; cpu_logical = htt / cmp; @@ -397,6 +409,7 @@ printcpuinfo(void) cpu_logical); } #endif +#endif } } /* Avoid ugly blank lines: only print newline when we have to. */ diff --git a/sys/platform/pc64/x86_64/mp_machdep.c b/sys/platform/pc64/x86_64/mp_machdep.c index ee0ac0c..b97cee9 100644 --- a/sys/platform/pc64/x86_64/mp_machdep.c +++ b/sys/platform/pc64/x86_64/mp_machdep.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -172,6 +173,11 @@ cpumask_t smp_active_mask = 1; /* which cpus are ready for IPIs etc? */ SYSCTL_INT(_machdep, OID_AUTO, smp_active, CTLFLAG_RD, &smp_active_mask, 0, ""); static u_int bootMP_size; +/* Local data for detecting CPU TOPOLOGY */ +static int core_bits = 0; +static int logical_CPU_bits = 0; + + /* * Calculate usable address in base memory for AP trampoline code. */ @@ -1111,3 +1117,180 @@ mp_bsp_simple_setup(void) if (cpu_feature & CPUID_TSC) tsc0_offset = rdtsc(); } + + +/* + * CPU TOPOLOGY DETECTION FUNCTIONS + */ + +/* Detect intel topology using CPUID + * Ref: http://www.intel.com/Assets/PDF/appnote/241618.pdf, pg 41 + */ +static void +detect_intel_topology(int count_htt_cores) +{ + int shift = 0; + int ecx_index = 0; + int core_plus_logical_bits = 0; + int cores_per_package; + int logical_per_package; + int logical_per_core; + unsigned int p[4]; + + if (cpu_high >= 0xb) { + goto FUNC_B; + + } else if (cpu_high >= 0x4) { + goto FUNC_4; + + } else { + core_bits = 0; + for (shift = 0; (1 << shift) < count_htt_cores; ++shift) + ; + logical_CPU_bits = 1 << shift; + return; + } + +FUNC_B: + cpuid_count(0xb, FUNC_B_THREAD_LEVEL, p); + + /* if 0xb not supported - fallback to 0x4 */ + if (p[1] == 0 || (FUNC_B_TYPE(p[2]) != FUNC_B_THREAD_TYPE)) { + goto FUNC_4; + } + + logical_CPU_bits = FUNC_B_BITS_SHIFT_NEXT_LEVEL(p[0]); + + ecx_index = FUNC_B_THREAD_LEVEL + 1; + do { + cpuid_count(0xb, ecx_index, p); + + /* Check for the Core type in the implemented sub leaves. 
*/ + if (FUNC_B_TYPE(p[2]) == FUNC_B_CORE_TYPE) { + core_plus_logical_bits = FUNC_B_BITS_SHIFT_NEXT_LEVEL(p[0]); + break; + } + + ecx_index++; + + } while (FUNC_B_TYPE(p[2]) != FUNC_B_INVALID_TYPE); + + core_bits = core_plus_logical_bits - logical_CPU_bits; + + return; + +FUNC_4: + cpuid_count(0x4, 0, p); + cores_per_package = FUNC_4_MAX_CORE_NO(p[0]) + 1; + + logical_per_package = count_htt_cores; + logical_per_core = logical_per_package / cores_per_package; + + for (shift = 0; (1 << shift) < logical_per_core; ++shift) + ; + logical_CPU_bits = shift; + + for (shift = 0; (1 << shift) < cores_per_package; ++shift) + ; + core_bits = shift; + + return; +} + +/* Detect AMD topology using CPUID + * Ref: http://support.amd.com/us/Embedded_TechDocs/25481.pdf, last page + */ +static void +detect_amd_topology(int count_htt_cores) +{ + int shift = 0; + if ((cpu_feature & CPUID_HTT) + && (amd_feature2 & AMDID2_CMP)) { + + if (cpu_procinfo2 & AMDID_COREID_SIZE) { + core_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) + >> AMDID_COREID_SIZE_SHIFT; + } else { + core_bits = (cpu_procinfo2 & AMDID_CMP_CORES) + 1; + for (shift = 0; (1 << shift) < core_bits; ++shift) + ; + core_bits = shift; + } + + logical_CPU_bits = count_htt_cores >> core_bits; + for (shift = 0; (1 << shift) < logical_CPU_bits; ++shift) + ; + logical_CPU_bits = shift; + } else { + for (shift = 0; (1 << shift) < count_htt_cores; ++shift) + ; + core_bits = shift; + logical_CPU_bits = 0; + } +} + +/* Calculate + * - logical_CPU_bits + * - core_bits + * With the values above (for AMD or INTEL) we are able to generally + * detect the CPU topology (number of cores for each level): + * Ref: http://wiki.osdev.org/Detecting_CPU_Topology_(80x86) + * Ref: http://www.multicoreinfo.com/research/papers/whitepapers/Intel-detect-topology.pdf + */ +void +detect_cpu_topology(void) +{ + static int topology_detected = 0; + int count = 0; + + if (topology_detected) { + goto OUT; + } + + if ((cpu_feature & CPUID_HTT) == 0) { + core_bits = 
0; + logical_CPU_bits = 0; + goto OUT; + } else { + count = (cpu_procinfo & CPUID_HTT_CORES) + >> CPUID_HTT_CORE_SHIFT; + } + + if (cpu_vendor_id == CPU_VENDOR_INTEL) { + detect_intel_topology(count); + } else if (cpu_vendor_id == CPU_VENDOR_AMD) { + detect_amd_topology(count); + } + +OUT: + if (bootverbose) + kprintf("BITS within APICID: logical_CPU_bits: %d; core_bits: %d\n", + logical_CPU_bits, core_bits); + + topology_detected = 1; +} + +/* Interface functions to calculate chip_ID, + * core_number and logical_number + * Ref: http://wiki.osdev.org/Detecting_CPU_Topology_(80x86) + */ +int +get_chip_ID(int cpuid) +{ + return get_apicid_from_cpuid(cpuid) >> + (logical_CPU_bits + core_bits); +} + +int +get_core_number_within_chip(int cpuid) +{ + return (get_apicid_from_cpuid(cpuid) >> logical_CPU_bits) & + ( (1 << core_bits) -1); +} + +int +get_logical_CPU_number_within_core(int cpuid) +{ + return get_apicid_from_cpuid(cpuid) & + ( (1 << logical_CPU_bits) -1); +} diff --git a/sys/sys/cpu_topology.h b/sys/sys/cpu_topology.h new file mode 100644 index 0000000..4aff0dc --- /dev/null +++ b/sys/sys/cpu_topology.h @@ -0,0 +1,40 @@ +#ifndef _CPU_TOPOLOGY_H_ +#define _CPU_TOPOLOGY_H_ + +#ifdef _KERNEL + +/* CPU TOPOLOGY DATA AND FUNCTIONS */ +struct cpu_node { + struct cpu_node * parent_node; + struct cpu_node * child_node; + uint32_t child_no; + cpumask_t members; + uint8_t type; +}; +typedef struct cpu_node cpu_node_t; + +extern int cpu_topology_levels_number; + +cpumask_t get_cpumask_from_level(int cpuid, + uint8_t level_type); + +cpu_node_t * +get_cpu_node_by_cpuid(int cpuid); + +#define LEVEL_NO 4 + +/* Level type for CPU siblings */ +#define PACKAGE_LEVEL 1 +#define CHIP_LEVEL 2 +#define CORE_LEVEL 3 +#define THREAD_LEVEL 4 + +#define CPU_ISSET(n, p) ((CPUMASK(n) & (p)) != 0) + +#define CPUSET_FOREACH(cpu, mask) \ + for ((cpu) = 0; (cpu) < ncpus; (cpu)++) \ + if (CPU_ISSET(cpu, mask)) + + +#endif /* _KERNEL */ +#endif /* _CPU_TOPOLOGY_H_ */ diff --git a/sys/sys/kernel.h
b/sys/sys/kernel.h index fa5b90d..e2f7ccd 100644 --- a/sys/sys/kernel.h +++ b/sys/sys/kernel.h @@ -148,6 +148,7 @@ enum sysinit_sub_id { SI_BOOT2_BIOS = 0x1d00000, SI_BOOT2_MACHDEP = 0x1d80000, SI_BOOT2_KLD = 0x1e00000, + SI_BOOT2_CPU_TOPOLOGY = 0x1e40000, SI_BOOT2_USCHED = 0x1e80000, SI_BOOT2_PROC0 = 0x1f00000, commit 9bea61140575a5730974980a453d1b7b4057b3e4 Author: Mihai Carabas Date: Wed Aug 22 10:03:12 2012 +0000 vkernel{,64} - CPU topology support * Part of "Add SMT/HT awareness to DragonFly BSD scheduler" GSoC project. * Details at: http://leaf.dragonflybsd.org/mailarchive/kernel/2012-08/msg00009.html Mentored-by: Alex Hornung (alexh@) Sponsored-by: Google Summer of Code 2012 diff --git a/share/man/man7/vkernel.7 b/share/man/man7/vkernel.7 index faae0b3..5fa6cb4 100644 --- a/share/man/man7/vkernel.7 +++ b/share/man/man7/vkernel.7 @@ -53,7 +53,7 @@ .Op Fl I Ar interface Ns Op Ar :address1 Ns Oo Ar :address2 Oc Ns Oo Ar /netmask Oc .Op Fl l Ar cpulock .Op Fl m Ar size -.Op Fl n Ar numcpus +.Op Fl n Ar numcpus Ns Op Ar :lbits Ns Oo Ar :cbits Oc .Op Fl p Ar pidfile .Op Fl r Ar file .Sh DESCRIPTION @@ -190,12 +190,25 @@ Lowercase versions of and .Cm G are allowed. -.It Fl n Ar numcpus -Specify the number of CPUs you wish to emulate. +.It Fl n Ar numcpus Ns Op Ar :lbits Ns Oo Ar :cbits Oc +.Ar numcpus +specifies the number of CPUs you wish to emulate. Up to 16 CPUs are supported. The virtual kernel must be built with .Cd options SMP to use this option and will default to 2 CPUs unless otherwise specified. +.Ar lbits +specifies the number of bits within APICID(=CPUID) needed for representing +the logical ID. +Controls the number of threads/core (0bits - 1 thread, 1bit - 2 threads). +This parameter is optional (mandatory only if +.Ar cbits +is specified). +.Ar cbits +specifies the number of bits within APICID(=CPUID) needed for representing +the core ID. +Controls the number of core/package (0bits - 1 core, 1bit - 2 cores). +This parameter is optional. 
.It Fl p Ar pidfile Specify a pidfile in which to store the process ID. Scripts can use this file to locate the vkernel pid for the purpose of diff --git a/sys/platform/vkernel/i386/mp.c b/sys/platform/vkernel/i386/mp.c index a1e459f..ffafd8c 100644 --- a/sys/platform/vkernel/i386/mp.c +++ b/sys/platform/vkernel/i386/mp.c @@ -68,6 +68,10 @@ static cpumask_t smp_startup_mask = 1; /* which cpus have been started */ int mp_naps; /* # of Applications processors */ static int mp_finish; +/* Local data for detecting CPU TOPOLOGY */ +static int core_bits = 0; +static int logical_CPU_bits = 0; + /* function prototypes XXX these should go elsewhere */ void bootstrap_idle(void); void single_cpu_ipi(int, int, int); @@ -455,3 +459,35 @@ start_all_aps(u_int boot_addr) return(ncpus - 1); } + +/* + * CPU TOPOLOGY DETECTION FUNCTIONS. + */ + +void +detect_cpu_topology(void) +{ + logical_CPU_bits = vkernel_b_arg; + core_bits = vkernel_B_arg; +} + +int +get_chip_ID(int cpuid) +{ + return get_apicid_from_cpuid(cpuid) >> + (logical_CPU_bits + core_bits); +} + +int +get_core_number_within_chip(int cpuid) +{ + return (get_apicid_from_cpuid(cpuid) >> logical_CPU_bits) & + ( (1 << core_bits) -1); +} + +int +get_logical_CPU_number_within_core(int cpuid) +{ + return get_apicid_from_cpuid(cpuid) & + ( (1 << logical_CPU_bits) -1); +} diff --git a/sys/platform/vkernel/include/smp.h b/sys/platform/vkernel/include/smp.h index 0a0b416..2df73db 100644 --- a/sys/platform/vkernel/include/smp.h +++ b/sys/platform/vkernel/include/smp.h @@ -51,6 +51,8 @@ void bootMP (void); /* global data in apic_vector.s */ extern volatile cpumask_t stopped_cpus; extern int optcpus; /* from main() */ +extern int vkernel_b_arg; /* arg from main() */ +extern int vkernel_B_arg; /* arg from main() */ #if 0 extern volatile cpumask_t started_cpus; @@ -165,6 +167,18 @@ int cpu_send_ipiq_passive (int); /* global data in init_smp.c */ extern cpumask_t smp_active_mask; +/* Detect CPU topology bits */ +void 
detect_cpu_topology(void); + +/* Interface functions for IDs calculation */ +int get_chip_ID(int cpuid); +int get_core_number_within_chip(int cpuid); +int get_logical_CPU_number_within_core(int cpuid); + +/* Assume that APICID = CPUID for virtual processors */ +#define get_cpuid_from_apicid(cpuid) cpuid +#define get_apicid_from_cpuid(cpuid) cpuid + #endif /* !LOCORE */ #else /* !SMP */ diff --git a/sys/platform/vkernel/platform/init.c b/sys/platform/vkernel/platform/init.c index 5b649ed..6acae97 100644 --- a/sys/platform/vkernel/platform/init.c +++ b/sys/platform/vkernel/platform/init.c @@ -112,6 +112,8 @@ int optcpus; /* number of cpus - see mp_start() */ int lwp_cpu_lock; /* if/how to lock virtual CPUs to real CPUs */ int real_ncpus; /* number of real CPUs */ int next_cpu; /* next real CPU to lock a virtual CPU to */ +int vkernel_b_arg; /* -b argument - no of logical CPU bits - only SMP */ +int vkernel_B_arg; /* -B argument - no of core bits - only SMP */ int via_feature_xcrypt = 0; /* XXX */ int via_feature_rng = 0; /* XXX */ @@ -149,6 +151,7 @@ main(int ac, char **av) char *suffix; char *endp; char *tmp; + char *tok; int netifFileNum = 0; int diskFileNum = 0; int cdFileNum = 0; @@ -178,6 +181,8 @@ main(int ac, char **av) kernel_mem_readonly = 1; #ifdef SMP optcpus = 2; + vkernel_b_arg = 0; + vkernel_B_arg = 0; #endif lwp_cpu_lock = LCL_NONE; @@ -197,7 +202,7 @@ main(int ac, char **av) if (ac < 2) usage_help(false); - while ((c = getopt(ac, av, "c:hsvl:m:n:r:e:i:p:I:U")) != -1) { + while ((c = getopt(ac, av, "c:hsvl:m:n:r:e:i:p:I:Ub:B:")) != -1) { switch(c) { case 'e': /* @@ -317,19 +322,46 @@ main(int ac, char **av) * This value is set up by mp_start(), don't just * set ncpus here. 
*/ + tok = strtok(optarg, ":"); #ifdef SMP - optcpus = strtol(optarg, NULL, 0); + optcpus = strtol(tok, NULL, 0); if (optcpus < 1 || optcpus > MAXCPU) usage_err("Bad ncpus, valid range is 1-%d", MAXCPU); + + /* :lbits argument (logical CPU bits) */ + tok = strtok(NULL, ":"); + if (tok != NULL) { + vkernel_b_arg = strtol(tok, NULL, 0); + + /* :cbits argument (core bits) */ + tok = strtok(NULL, ":"); + if (tok != NULL) { + vkernel_B_arg = strtol(tok, NULL, 0); + } + + } + #else - if (strtol(optarg, NULL, 0) != 1) { + if (strtol(tok, NULL, 0) != 1) { usage_err("You built a UP vkernel, only 1 cpu!"); } + + /* :logical_CPU_bits argument */ + tok = strtok(NULL, ":"); + if (tok != NULL) { + usage_err("You built a UP vkernel. No CPU topology available"); + + /* :core_bits argument */ + tok = strtok(NULL, ":"); + if (tok != NULL) { + usage_err("You built a UP vkernel. No CPU topology available"); + } + } #endif break; case 'p': - pid_file = optarg; + pid_file = optarg; break; case 'U': kernel_mem_readonly = 0; @@ -1310,7 +1342,8 @@ usage_help(_Bool help) { fprintf(stderr, "Usage: %s [-hsUv] [-c file] [-e name=value:name=value:...]\n" "\t[-i file] [-I interface[:address1[:address2][/netmask]]] [-l cpulock]\n" - "\t[-m size] [-n numcpus] [-p file] [-r file]\n", save_av[0]); + "\t[-m size] [-n numcpus[:lbits[:cbits]]]\n" + "\t[-p file] [-r file]\n", save_av[0]); if (help) fprintf(stderr, "\nArguments:\n" @@ -1321,7 +1354,12 @@ usage_help(_Bool help) "\t-I\tCreate a virtual network device.\n" "\t-l\tSpecify which, if any, real CPUs to lock virtual CPUs to.\n" "\t-m\tSpecify the amount of memory to be used by the kernel in bytes.\n" - "\t-n\tSpecify the number of CPUs you wish to emulate.\n" + "\t-n\tSpecify the number of CPUs and the topology you wish to emulate:\n" + "\t \t- numcpus - number of cpus\n" + "\t \t- :lbits - specify the number of bits within APICID(=CPUID) needed for representing\n" + "\t \tthe logical ID.
Controls the number of threads/core (0bits - 1 thread, 1bit - 2 threads).\n" + "\t \t- :cbits - specify the number of bits within APICID(=CPUID) needed for representing\n" + "\t \tthe core ID. Controls the number of core/package (0bits - 1 core, 1bit - 2 cores).\n" "\t-p\tSpecify a file in which to store the process ID.\n" "\t-r\tSpecify a R/W disk image file to be used by the kernel.\n" "\t-s\tBoot into single-user mode.\n" diff --git a/sys/platform/vkernel64/include/smp.h b/sys/platform/vkernel64/include/smp.h index b84d0a4..1156c5b 100644 --- a/sys/platform/vkernel64/include/smp.h +++ b/sys/platform/vkernel64/include/smp.h @@ -51,6 +51,8 @@ void bootMP (void); /* global data in apic_vector.s */ extern volatile cpumask_t stopped_cpus; extern int optcpus; /* from main() */ +extern int vkernel_b_arg; /* arg from main() */ +extern int vkernel_B_arg; /* arg from main() */ #if 0 extern volatile cpumask_t started_cpus; @@ -165,6 +167,18 @@ int cpu_send_ipiq_passive (int); /* global data in init_smp.c */ extern cpumask_t smp_active_mask; +/* Detect CPU topology bits */ +void detect_cpu_topology(void); + +/* Interface functions for IDs calculation */ +int get_chip_ID(int cpuid); +int get_core_number_within_chip(int cpuid); +int get_logical_CPU_number_within_core(int cpuid); + +/* Assume that APICID = CPUID for virtual processors */ +#define get_cpuid_from_apicid(cpuid) cpuid +#define get_apicid_from_cpuid(cpuid) cpuid + #endif /* !LOCORE */ #else /* !SMP */ diff --git a/sys/platform/vkernel64/platform/init.c b/sys/platform/vkernel64/platform/init.c index 7be297a..a8c984d 100644 --- a/sys/platform/vkernel64/platform/init.c +++ b/sys/platform/vkernel64/platform/init.c @@ -112,6 +112,8 @@ int optcpus; /* number of cpus - see mp_start() */ int lwp_cpu_lock; /* if/how to lock virtual CPUs to real CPUs */ int real_ncpus; /* number of real CPUs */ int next_cpu; /* next real CPU to lock a virtual CPU to */ +int vkernel_b_arg; /* -b argument - no of logical CPU bits - only SMP */ 
+int vkernel_B_arg; /* -B argument - no of core bits - only SMP */ struct privatespace *CPU_prvspace; @@ -146,6 +148,7 @@ main(int ac, char **av) char *suffix; char *endp; char *tmp; + char *tok; int netifFileNum = 0; int diskFileNum = 0; int cdFileNum = 0; @@ -175,6 +178,8 @@ main(int ac, char **av) kernel_mem_readonly = 1; #ifdef SMP optcpus = 2; + vkernel_b_arg = 0; + vkernel_B_arg = 0; #endif lwp_cpu_lock = LCL_NONE; @@ -194,7 +199,7 @@ main(int ac, char **av) if (ac < 2) usage_help(false); - while ((c = getopt(ac, av, "c:hsvl:m:n:r:e:i:p:I:U")) != -1) { + while ((c = getopt(ac, av, "c:hsvl:m:n:r:e:i:p:I:Ub:B:")) != -1) { switch(c) { case 'e': /* @@ -314,17 +319,44 @@ main(int ac, char **av) * This value is set up by mp_start(), don't just * set ncpus here. */ + tok = strtok(optarg, ":"); #ifdef SMP - optcpus = strtol(optarg, NULL, 0); + optcpus = strtol(tok, NULL, 0); if (optcpus < 1 || optcpus > MAXCPU) usage_err("Bad ncpus, valid range is 1-%d", MAXCPU); + + /* :lbits argument */ + tok = strtok(NULL, ":"); + if (tok != NULL) { + vkernel_b_arg = strtol(tok, NULL, 0); + + /* :cbits argument */ + tok = strtok(NULL, ":"); + if (tok != NULL) { + vkernel_B_arg = strtol(tok, NULL, 0); + } + + } + #else - if (strtol(optarg, NULL, 0) != 1) { + if (strtol(tok, NULL, 0) != 1) { usage_err("You built a UP vkernel, only 1 cpu!"); } + + /* :lbits argument */ + tok = strtok(NULL, ":"); + if (tok != NULL) { + usage_err("You built a UP vkernel. No CPU topology available"); + + /* :cbits argument */ + tok = strtok(NULL, ":"); + if (tok != NULL) { + usage_err("You built a UP vkernel. 
No CPU topology available"); + } + } #endif break; case 'p': pid_file = optarg; break; @@ -1271,7 +1303,8 @@ usage_help(_Bool help) { fprintf(stderr, "Usage: %s [-hsUv] [-c file] [-e name=value:name=value:...]\n" "\t[-i file] [-I interface[:address1[:address2][/netmask]]] [-l cpulock]\n" - "\t[-m size] [-n numcpus] [-p file] [-r file]\n", save_av[0]); + "\t[-m size] [-n numcpus[:lbits[:cbits]]]\n" + "\t[-p file] [-r file]\n", save_av[0]); if (help) fprintf(stderr, "\nArguments:\n" @@ -1282,7 +1315,12 @@ usage_help(_Bool help) "\t-I\tCreate a virtual network device.\n" "\t-l\tSpecify which, if any, real CPUs to lock virtual CPUs to.\n" "\t-m\tSpecify the amount of memory to be used by the kernel in bytes.\n" - "\t-n\tSpecify the number of CPUs you wish to emulate.\n" + "\t-n\tSpecify the number of CPUs and the topology you wish to emulate:\n" + "\t \t- numcpus - number of cpus\n" + "\t \t- :lbits - specify the number of bits within APICID(=CPUID) needed for representing\n" + "\t \t the logical ID. Controls the number of threads/core (0bits - 1 thread, 1bit - 2 threads).\n" + "\t \t- :cbits - specify the number of bits within APICID(=CPUID) needed for representing\n" + "\t \t the core ID.
Controls the number of core/package (0bits - 1 core, 1bit - 2 cores).\n" "\t-p\tSpecify a file in which to store the process ID.\n" "\t-r\tSpecify a R/W disk image file to be used by the kernel.\n" "\t-s\tBoot into single-user mode.\n" diff --git a/sys/platform/vkernel64/x86_64/mp.c b/sys/platform/vkernel64/x86_64/mp.c index 7358034..3da0394 100644 --- a/sys/platform/vkernel64/x86_64/mp.c +++ b/sys/platform/vkernel64/x86_64/mp.c @@ -68,6 +68,10 @@ static cpumask_t smp_startup_mask = 1; /* which cpus have been started */ int mp_naps; /* # of Applications processors */ static int mp_finish; +/* Local data for detecting CPU TOPOLOGY */ +static int core_bits = 0; +static int logical_CPU_bits = 0; + /* function prototypes XXX these should go elsewhere */ void bootstrap_idle(void); void single_cpu_ipi(int, int, int); @@ -457,3 +461,36 @@ start_all_aps(u_int boot_addr) return(ncpus - 1); } + +/* + * CPU TOPOLOGY DETECTION FUNCTIONS. + */ + +void +detect_cpu_topology(void) +{ + logical_CPU_bits = vkernel_b_arg; + core_bits = vkernel_B_arg; +} + +int +get_chip_ID(int cpuid) +{ + return get_apicid_from_cpuid(cpuid) >> + (logical_CPU_bits + core_bits); +} + +int +get_core_number_within_chip(int cpuid) +{ + return (get_apicid_from_cpuid(cpuid) >> logical_CPU_bits) & + ( (1 << core_bits) -1); +} + +int +get_logical_CPU_number_within_core(int cpuid) +{ + return get_apicid_from_cpuid(cpuid) & + ( (1 << logical_CPU_bits) -1); +} + commit d6d39bc7221f794c45470c5d10267d321ca16677 Author: Mihai Carabas Date: Wed Aug 22 10:03:45 2012 +0000 usched_bsd4 - Topology-aware scheduling * Part of "Add SMT/HT awareness to DragonFly BSD scheduler" GSoC project. 
* Details at: http://leaf.dragonflybsd.org/mailarchive/kernel/2012-08/msg00009.html Mentored-by: Alex Hornung (alexh@) Sponsored-by: Google Summer of Code 2012 diff --git a/sys/conf/options b/sys/conf/options index 052eb09..2a25ffa 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -601,6 +601,7 @@ KTR_TOKENS opt_ktr.h KTR_TSLEEP opt_ktr.h KTR_USB_MEMORY opt_ktr.h KTR_VERBOSE opt_ktr.h +KTR_USCHED_BSD4 opt_ktr.h # NTFS options NTFS_DEBUG opt_ntfs.h diff --git a/sys/config/LINT b/sys/config/LINT index 81a9b4d..694176a 100644 --- a/sys/config/LINT +++ b/sys/config/LINT @@ -2580,6 +2580,7 @@ options KTR_VERBOSE=1 #options KTR_TOKENS #options KTR_TSLEEP #options KTR_USB_MEMORY +#options KTR_USCHED_BSD4 # ALTQ options ALTQ #alternate queueing diff --git a/sys/config/LINT64 b/sys/config/LINT64 index 62f736b..5ac7a51 100644 --- a/sys/config/LINT64 +++ b/sys/config/LINT64 @@ -2342,6 +2342,7 @@ options KTR_VERBOSE=1 #options KTR_TOKENS #options KTR_TSLEEP #options KTR_USB_MEMORY +#options KTR_USCHED_BSD4 # ALTQ options ALTQ #alternate queueing diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c index 1933d0a..fccc361 100644 --- a/sys/kern/kern_clock.c +++ b/sys/kern/kern_clock.c @@ -244,6 +244,7 @@ int ticks; /* system master ticks at hz */ int clocks_running; /* tsleep/timeout clocks operational */ int64_t nsec_adj; /* ntpd per-tick adjustment in nsec << 32 */ int64_t nsec_acc; /* accumulator */ +int sched_ticks; /* global schedule clock ticks */ /* NTPD time correction fields */ int64_t ntp_tick_permanent; /* per-tick adjustment in nsec << 32 */ @@ -800,6 +801,9 @@ schedclock(systimer_t info, int in_ipi __unused, struct intrframe *frame) } } } + /* Increment the global sched_ticks */ + if (mycpu->gd_cpuid == 0) + ++sched_ticks; } /* diff --git a/sys/kern/usched_bsd4.c b/sys/kern/usched_bsd4.c index 96e557a..52fb546 100644 --- a/sys/kern/usched_bsd4.c +++ b/sys/kern/usched_bsd4.c @@ -35,13 +35,16 @@ #include #include #include -#include -#include - +#include 
#include #include #include +#include + +#include +#include + /* * Priorities. Note that with 32 run queues per scheduler each queue * represents four priority levels. @@ -95,6 +98,8 @@ static void bsd4_yield(struct lwp *lp); #ifdef SMP static void need_user_resched_remote(void *dummy); +static int batchy_looser_pri_test(struct lwp* lp); +static struct lwp *chooseproc_locked_cache_coherent(struct lwp *chklp); #endif static struct lwp *chooseproc_locked(struct lwp *chklp); static void bsd4_remrunqueue_locked(struct lwp *lp); @@ -118,10 +123,14 @@ struct usched usched_bsd4 = { }; struct usched_bsd4_pcpu { - struct thread helper_thread; - short rrcount; - short upri; - struct lwp *uschedcp; + struct thread helper_thread; + short rrcount; + short upri; + struct lwp *uschedcp; + struct lwp *old_uschedcp; +#ifdef SMP + cpu_node_t *cpunode; +#endif }; typedef struct usched_bsd4_pcpu *bsd4_pcpu_t; @@ -152,6 +161,10 @@ static volatile int bsd4_scancpu; #endif static struct spinlock bsd4_spin; static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU]; +static struct sysctl_ctx_list usched_bsd4_sysctl_ctx; +static struct sysctl_oid *usched_bsd4_sysctl_tree; + +/* Debug info exposed through debug.* sysctl */ SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD, &bsd4_runqcount, 0, "Number of run queues"); @@ -163,9 +176,14 @@ static int usched_optimal; SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW, &usched_optimal, 0, "acquire_curproc() was optimal"); #endif -static int usched_debug = -1; -SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0, + +static int usched_bsd4_debug = -1; +SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_bsd4_debug, 0, "Print debug information for this pid"); +static int usched_bsd4_pid_debug = -1; +SYSCTL_INT(_debug, OID_AUTO, pid_debug, CTLFLAG_RW, &usched_bsd4_pid_debug, 0, + "Print KTR debug information for this pid"); + #ifdef SMP static int remote_resched_nonaffinity; static int remote_resched_affinity; @@ -178,15 
+196,116 @@ SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD, &choose_affinity, 0, "chooseproc() was smart"); #endif + +/* Tunning usched_bsd4 - configurable through kern.usched_bsd4.* */ +#ifdef SMP +static int usched_bsd4_smt = 0; +static int usched_bsd4_cache_coherent = 0; +static int usched_bsd4_upri_affinity = 16; /* 32 queues - half-way */ +static int usched_bsd4_queue_checks = 5; +static int usched_bsd4_stick_to_level = 0; +#endif static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10; -SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW, - &usched_bsd4_rrinterval, 0, ""); static int usched_bsd4_decay = 8; -SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW, - &usched_bsd4_decay, 0, "Extra decay when not running"); static int usched_bsd4_batch_time = 10; -SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_batch_time, CTLFLAG_RW, - &usched_bsd4_batch_time, 0, "Minimum batch counter value"); + +/* KTR debug printings */ + +KTR_INFO_MASTER(usched); + +#if !defined(KTR_USCHED_BSD4) +#define KTR_USCHED_BSD4 KTR_ALL +#endif + +KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_urw, 0, + "USCHED_BSD4(bsd4_acquire_curproc in user_reseched_wanted " + "after release: pid %d, cpuid %d, curr_cpuid %d)", + pid_t pid, int cpuid, int curr); +KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_before_loop, 0, + "USCHED_BSD4(bsd4_acquire_curproc before loop: pid %d, cpuid %d, " + "curr_cpuid %d)", + pid_t pid, int cpuid, int curr); +KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_not, 0, + "USCHED_BSD4(bsd4_acquire_curproc couldn't acquire after " + "bsd4_setrunqueue: pid %d, cpuid %d, curr_lp pid %d, curr_cpuid %d)", + pid_t pid, int cpuid, pid_t curr_pid, int curr_cpuid); +KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_switch, 0, + "USCHED_BSD4(bsd4_acquire_curproc after lwkt_switch: pid %d, " + "cpuid %d, curr_cpuid %d)", + pid_t pid, int cpuid, int curr); + +KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_release_curproc, 0, + 
"USCHED_BSD4(bsd4_release_curproc before select: pid %d, " + "cpuid %d, curr_cpuid %d)", + pid_t pid, int cpuid, int curr); + +KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_select_curproc, 0, + "USCHED_BSD4(bsd4_release_curproc before select: pid %d, " + "cpuid %d, old_pid %d, old_cpuid %d, curr_cpuid %d)", + pid_t pid, int cpuid, pid_t old_pid, int old_cpuid, int curr); + +#ifdef SMP +KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_false, 0, + "USCHED_BSD4(batchy_looser_pri_test false: pid %d, " + "cpuid %d, verify_mask %lu)", + pid_t pid, int cpuid, cpumask_t mask); +KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_true, 0, + "USCHED_BSD4(batchy_looser_pri_test true: pid %d, " + "cpuid %d, verify_mask %lu)", + pid_t pid, int cpuid, cpumask_t mask); + +KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_smt, 0, + "USCHED_BSD4(bsd4_setrunqueue free cpus smt: pid %d, cpuid %d, " + "mask %lu, curr_cpuid %d)", + pid_t pid, int cpuid, cpumask_t mask, int curr); +KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_non_smt, 0, + "USCHED_BSD4(bsd4_setrunqueue free cpus check non_smt: pid %d, " + "cpuid %d, mask %lu, curr_cpuid %d)", + pid_t pid, int cpuid, cpumask_t mask, int curr); +KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_rc, 0, + "USCHED_BSD4(bsd4_setrunqueue running cpus check: pid %d, " + "cpuid %d, mask %lu, curr_cpuid %d)", + pid_t pid, int cpuid, cpumask_t mask, int curr); +KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found, 0, + "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, " + "mask %lu, found_cpuid %d, curr_cpuid %d)", + pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr); +KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_not_found, 0, + "USCHED_BSD4(bsd4_setrunqueue not found cpu: pid %d, cpuid %d, " + "try_cpuid %d, curr_cpuid %d)", + pid_t pid, int cpuid, int try_cpuid, int curr); +KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found_best_cpuid, 0, + "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, " + "mask 
%lu, found_cpuid %d, curr_cpuid %d)", + pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr); +#endif + +KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc, 0, + "USCHED_BSD4(chooseproc: pid %d, old_cpuid %d, curr_cpuid %d)", + pid_t pid, int old_cpuid, int curr); +#ifdef SMP +KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc, 0, + "USCHED_BSD4(chooseproc_cc: pid %d, old_cpuid %d, curr_cpuid %d)", + pid_t pid, int old_cpuid, int curr); +KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_not_good, 0, + "USCHED_BSD4(chooseproc_cc not good: pid %d, old_cpumask %lu, " + "sibling_mask %lu, curr_cpumask %lu)", + pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr); +KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_elected, 0, + "USCHED_BSD4(chooseproc_cc elected: pid %d, old_cpumask %lu, " + "sibling_mask %lu, curr_cpumask: %lu)", + pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr); + +KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process, 0, + "USCHED_BSD4(sched_thread %d no process scheduled: pid %d, old_cpuid %d)", + int id, pid_t pid, int cpuid); +KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_process, 0, + "USCHED_BSD4(sched_thread %d process scheduled: pid %d, old_cpuid %d)", + int id, pid_t pid, int cpuid); +KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process_found, 0, + "USCHED_BSD4(sched_thread %d no process found; tmpmask %lu)", + int id, cpumask_t tmpmask); +#endif /* * Initialize the run queues at boot time. 
@@ -248,6 +367,12 @@ bsd4_acquire_curproc(struct lwp *lp) if (user_resched_wanted()) { clear_user_resched(); bsd4_release_curproc(lp); + + KTR_COND_LOG(usched_bsd4_acquire_curproc_urw, + lp->lwp_proc->p_pid == usched_bsd4_pid_debug, + lp->lwp_proc->p_pid, + lp->lwp_thread->td_gd->gd_cpuid, + mycpu->gd_cpuid); } /* @@ -256,6 +381,12 @@ bsd4_acquire_curproc(struct lwp *lp) gd = mycpu; dd = &bsd4_pcpu[gd->gd_cpuid]; + KTR_COND_LOG(usched_bsd4_acquire_curproc_before_loop, + lp->lwp_proc->p_pid == usched_bsd4_pid_debug, + lp->lwp_proc->p_pid, + lp->lwp_thread->td_gd->gd_cpuid, + gd->gd_cpuid); + do { /* * Process any pending events and higher priority threads. @@ -303,14 +434,31 @@ bsd4_acquire_curproc(struct lwp *lp) * chance. */ lwkt_deschedule(lp->lwp_thread); + bsd4_setrunqueue(lp); + + KTR_COND_LOG(usched_bsd4_acquire_curproc_not, + lp->lwp_proc->p_pid == usched_bsd4_pid_debug, + lp->lwp_proc->p_pid, + lp->lwp_thread->td_gd->gd_cpuid, + dd->uschedcp->lwp_proc->p_pid, + gd->gd_cpuid); + + lwkt_switch(); + /* * Reload after a switch or setrunqueue/switch possibly * moved us to another cpu. 
*/ gd = mycpu; dd = &bsd4_pcpu[gd->gd_cpuid]; + + KTR_COND_LOG(usched_bsd4_acquire_curproc_switch, + lp->lwp_proc->p_pid == usched_bsd4_pid_debug, + lp->lwp_proc->p_pid, + lp->lwp_thread->td_gd->gd_cpuid, + gd->gd_cpuid); } } while (dd->uschedcp != lp); @@ -338,6 +486,7 @@ bsd4_acquire_curproc(struct lwp *lp) * * MPSAFE */ + static void bsd4_release_curproc(struct lwp *lp) { @@ -347,9 +496,17 @@ bsd4_release_curproc(struct lwp *lp) if (dd->uschedcp == lp) { crit_enter(); KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0); + + KTR_COND_LOG(usched_bsd4_release_curproc, + lp->lwp_proc->p_pid == usched_bsd4_pid_debug, + lp->lwp_proc->p_pid, + lp->lwp_thread->td_gd->gd_cpuid, + gd->gd_cpuid); + dd->uschedcp = NULL; /* don't let lp be selected */ dd->upri = PRIBASE_NULL; atomic_clear_cpumask(&bsd4_curprocmask, gd->gd_cpumask); + dd->old_uschedcp = lp; /* used only for KTR debug prints */ bsd4_select_curproc(gd); crit_exit(); } @@ -381,7 +538,23 @@ bsd4_select_curproc(globaldata_t gd) crit_enter_gd(gd); spin_lock(&bsd4_spin); - if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) { +#ifdef SMP + if(usched_bsd4_cache_coherent) + nlp = chooseproc_locked_cache_coherent(dd->uschedcp); + else +#endif + nlp = chooseproc_locked(dd->uschedcp); + + if (nlp) { + + KTR_COND_LOG(usched_bsd4_select_curproc, + nlp->lwp_proc->p_pid == usched_bsd4_pid_debug, + nlp->lwp_proc->p_pid, + nlp->lwp_thread->td_gd->gd_cpuid, + dd->old_uschedcp->lwp_proc->p_pid, + dd->old_uschedcp->lwp_thread->td_gd->gd_cpuid, + gd->gd_cpuid); + atomic_set_cpumask(&bsd4_curprocmask, CPUMASK(cpuid)); dd->upri = nlp->lwp_priority; dd->uschedcp = nlp; @@ -393,6 +566,7 @@ bsd4_select_curproc(globaldata_t gd) } else { spin_unlock(&bsd4_spin); } + #if 0 } else if (bsd4_runqcount && (bsd4_rdyprocmask & CPUMASK(cpuid))) { atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid)); @@ -404,8 +578,51 @@ bsd4_select_curproc(globaldata_t gd) #endif crit_exit_gd(gd); } +#ifdef SMP + +/* + * batchy_looser_pri_test() - determine if 
a process is batchy or not + * relative to the other processes running in the system + */ +static int +batchy_looser_pri_test(struct lwp* lp) +{ + cpumask_t mask; + bsd4_pcpu_t other_dd; + int cpu; + + /* Current running processes */ + mask = bsd4_curprocmask & smp_active_mask + & usched_global_cpumask; + + while(mask) { + cpu = BSFCPUMASK(mask); + other_dd = &bsd4_pcpu[cpu]; + if (other_dd->upri - lp->lwp_priority > usched_bsd4_upri_affinity * PPQ) { + + KTR_COND_LOG(usched_batchy_test_false, + lp->lwp_proc->p_pid == usched_bsd4_pid_debug, + lp->lwp_proc->p_pid, + lp->lwp_thread->td_gd->gd_cpuid, + mask); + + return 0; + } + mask &= ~CPUMASK(cpu); + } + + KTR_COND_LOG(usched_batchy_test_true, + lp->lwp_proc->p_pid == usched_bsd4_pid_debug, + lp->lwp_proc->p_pid, + lp->lwp_thread->td_gd->gd_cpuid, + mask); + + return 1; +} +#endif /* + * * BSD4_SETRUNQUEUE * * Place the specified lwp on the user scheduler's run queue. This routine @@ -490,6 +707,7 @@ bsd4_setrunqueue(struct lwp *lp) */ spin_lock(&bsd4_spin); bsd4_setrunqueue_locked(lp); + lp->lwp_setrunqueue_ticks = sched_ticks; #ifdef SMP /* @@ -502,22 +720,113 @@ bsd4_setrunqueue(struct lwp *lp) * process. */ ++bsd4_scancpu; - cpuid = (bsd4_scancpu & 0xFFFF) % ncpus; - mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask & - smp_active_mask & usched_global_cpumask; - while (mask) { - tmpmask = ~(CPUMASK(cpuid) - 1); - if (mask & tmpmask) - cpuid = BSFCPUMASK(mask & tmpmask); - else - cpuid = BSFCPUMASK(mask); - gd = globaldata_find(cpuid); - dd = &bsd4_pcpu[cpuid]; + if(usched_bsd4_smt) { + + /* + * SMT heuristic - Try to schedule on a free physical core. 
If no physical core + * found then choose the one that has an interactive thread + */ + + int best_cpuid = -1; + int min_prio = MAXPRI * MAXPRI; + int sibling; + + cpuid = (bsd4_scancpu & 0xFFFF) % ncpus; + mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask & + smp_active_mask & usched_global_cpumask; + + KTR_COND_LOG(usched_bsd4_setrunqueue_fc_smt, + lp->lwp_proc->p_pid == usched_bsd4_pid_debug, + lp->lwp_proc->p_pid, + lp->lwp_thread->td_gd->gd_cpuid, + mask, + mycpu->gd_cpuid); + + while (mask) { + tmpmask = ~(CPUMASK(cpuid) - 1); + if (mask & tmpmask) + cpuid = BSFCPUMASK(mask & tmpmask); + else + cpuid = BSFCPUMASK(mask); + gd = globaldata_find(cpuid); + dd = &bsd4_pcpu[cpuid]; + + if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) { + if (dd->cpunode->parent_node->members & ~dd->cpunode->members & mask) { + + KTR_COND_LOG(usched_bsd4_setrunqueue_found, + lp->lwp_proc->p_pid == usched_bsd4_pid_debug, + lp->lwp_proc->p_pid, + lp->lwp_thread->td_gd->gd_cpuid, + mask, + cpuid, + mycpu->gd_cpuid); + + goto found; + } else { + sibling = BSFCPUMASK(dd->cpunode->parent_node->members & + ~dd->cpunode->members); + if (min_prio > bsd4_pcpu[sibling].upri) { + min_prio = bsd4_pcpu[sibling].upri; + best_cpuid = cpuid; + } + } + } + mask &= ~CPUMASK(cpuid); + } + + if (best_cpuid != -1) { + cpuid = best_cpuid; + gd = globaldata_find(cpuid); + dd = &bsd4_pcpu[cpuid]; + + KTR_COND_LOG(usched_bsd4_setrunqueue_found_best_cpuid, + lp->lwp_proc->p_pid == usched_bsd4_pid_debug, + lp->lwp_proc->p_pid, + lp->lwp_thread->td_gd->gd_cpuid, + mask, + cpuid, + mycpu->gd_cpuid); - if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) goto found; - mask &= ~CPUMASK(cpuid); + } + } else { + /* Fallback to the original heuristic */ + cpuid = (bsd4_scancpu & 0xFFFF) % ncpus; + mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask & + smp_active_mask & usched_global_cpumask; + + KTR_COND_LOG(usched_bsd4_setrunqueue_fc_non_smt, + lp->lwp_proc->p_pid == 
usched_bsd4_pid_debug, + lp->lwp_proc->p_pid, + lp->lwp_thread->td_gd->gd_cpuid, + mask, + mycpu->gd_cpuid); + + while (mask) { + tmpmask = ~(CPUMASK(cpuid) - 1); + if (mask & tmpmask) + cpuid = BSFCPUMASK(mask & tmpmask); + else + cpuid = BSFCPUMASK(mask); + gd = globaldata_find(cpuid); + dd = &bsd4_pcpu[cpuid]; + + if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) { + + KTR_COND_LOG(usched_bsd4_setrunqueue_found, + lp->lwp_proc->p_pid == usched_bsd4_pid_debug, + lp->lwp_proc->p_pid, + lp->lwp_thread->td_gd->gd_cpuid, + mask, + cpuid, + mycpu->gd_cpuid); + + goto found; + } + mask &= ~CPUMASK(cpuid); + } } /* @@ -526,6 +835,13 @@ bsd4_setrunqueue(struct lwp *lp) mask = bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask & smp_active_mask & usched_global_cpumask; + KTR_COND_LOG(usched_bsd4_setrunqueue_rc, + lp->lwp_proc->p_pid == usched_bsd4_pid_debug, + lp->lwp_proc->p_pid, + lp->lwp_thread->td_gd->gd_cpuid, + mask, + mycpu->gd_cpuid); + while (mask) { tmpmask = ~(CPUMASK(cpuid) - 1); if (mask & tmpmask) @@ -535,8 +851,18 @@ bsd4_setrunqueue(struct lwp *lp) gd = globaldata_find(cpuid); dd = &bsd4_pcpu[cpuid]; - if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) + if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) { + + KTR_COND_LOG(usched_bsd4_setrunqueue_found, + lp->lwp_proc->p_pid == usched_bsd4_pid_debug, + lp->lwp_proc->p_pid, + lp->lwp_thread->td_gd->gd_cpuid, + mask, + cpuid, + mycpu->gd_cpuid); + goto found; + } mask &= ~CPUMASK(cpuid); } @@ -557,12 +883,20 @@ bsd4_setrunqueue(struct lwp *lp) } gd = globaldata_find(cpuid); dd = &bsd4_pcpu[cpuid]; + + KTR_COND_LOG(usched_bsd4_setrunqueue_not_found, + lp->lwp_proc->p_pid == usched_bsd4_pid_debug, + lp->lwp_proc->p_pid, + lp->lwp_thread->td_gd->gd_cpuid, + cpuid, + mycpu->gd_cpuid); + found: if (gd == mycpu) { spin_unlock(&bsd4_spin); if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) { if (dd->uschedcp == NULL) { - lwkt_schedule(&dd->helper_thread); + 
wakeup(&dd->helper_thread); } else { need_user_resched(); } @@ -573,7 +907,7 @@ found: if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) lwkt_send_ipiq(gd, need_user_resched_remote, NULL); else - lwkt_schedule(&dd->helper_thread); + wakeup(&dd->helper_thread); } #else /* @@ -633,7 +967,7 @@ bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp) /* * Called from acquire and from kern_synch's one-second timer (one of the - * callout helper threads) with a critical section held. + * callout helper threads) with a critical section held. * * Decay p_estcpu based on the number of ticks we haven't been running * and our p_nice. As the load increases each process observes a larger @@ -649,7 +983,7 @@ bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp) * MPSAFE */ static -void +void bsd4_recalculate_estcpu(struct lwp *lp) { globaldata_t gd = mycpu; @@ -681,8 +1015,8 @@ bsd4_recalculate_estcpu(struct lwp *lp) } else if (lp->lwp_cpbase != cpbase) { /* * Adjust estcpu if we are in a different tick. Don't waste - * time if we are in the same tick. - * + * time if we are in the same tick. + * * First calculate the number of ticks in the measurement * interval. 
The ttlticks calculation can wind up 0 due to * a bug in the handling of lwp_slptime (as yet not found), @@ -730,7 +1064,7 @@ bsd4_recalculate_estcpu(struct lwp *lp) lp->lwp_batch = 0; } - if (usched_debug == lp->lwp_proc->p_pid) { + if (usched_bsd4_debug == lp->lwp_proc->p_pid) { kprintf("pid %d lwp %p estcpu %3d %3d bat %d cp %d/%d", lp->lwp_proc->p_pid, lp, estcpu, lp->lwp_estcpu, @@ -763,7 +1097,7 @@ bsd4_recalculate_estcpu(struct lwp *lp) (lp->lwp_estcpu * decay_factor + estcpu) / (decay_factor + 1)); - if (usched_debug == lp->lwp_proc->p_pid) + if (usched_bsd4_debug == lp->lwp_proc->p_pid) kprintf(" finalestcpu %d\n", lp->lwp_estcpu); bsd4_resetpriority(lp); lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic; @@ -914,7 +1248,7 @@ bsd4_resetpriority(struct lwp *lp) */ static void -bsd4_yield(struct lwp *lp) +bsd4_yield(struct lwp *lp) { #if 0 /* FUTURE (or something similar) */ @@ -1005,6 +1339,7 @@ chooseproc_locked(struct lwp *chklp) idqbits = bsd4_idqueuebits; cpumask = mycpu->gd_cpumask; + #ifdef SMP again: #endif @@ -1042,7 +1377,7 @@ again: /* * If the passed lwp <chklp> is reasonably close to the selected * lwp <lp>, return NULL (indicating that <chklp> should be kept). - * + * * Note that we must error on the side of <chklp> to avoid bouncing * between threads in the acquire code. */ @@ -1068,6 +1403,12 @@ again: } #endif + KTR_COND_LOG(usched_chooseproc, + lp->lwp_proc->p_pid == usched_bsd4_pid_debug, + lp->lwp_proc->p_pid, + lp->lwp_thread->td_gd->gd_cpuid, + mycpu->gd_cpuid); + TAILQ_REMOVE(q, lp, lwp_procq); --bsd4_runqcount; if (TAILQ_EMPTY(q)) @@ -1078,6 +1419,160 @@ again: } #ifdef SMP +/* + * chooseproc() - with a cache coherence heuristic. Try to pull a process that + * has its home on the current CPU. If the process doesn't have its home here + * and is a batchy one (see batchy_looser_pri_test), we can wait for a + * sched_tick, maybe its home will become free and pull it in. Anyway, + * we can't wait more than one tick.
If that tick expired, we pull in that + * process, no matter what. + */ +static +struct lwp * +chooseproc_locked_cache_coherent(struct lwp *chklp) +{ + struct lwp *lp; + struct rq *q; + u_int32_t *which, *which2; + u_int32_t pri; + u_int32_t checks; + u_int32_t rtqbits; + u_int32_t tsqbits; + u_int32_t idqbits; + cpumask_t cpumask; + + struct lwp * min_level_lwp = NULL; + struct rq *min_q = NULL; + cpumask_t siblings; + cpu_node_t* cpunode = NULL; + u_int32_t min_level = MAXCPU; /* number of levels < MAXCPU */ + u_int32_t *min_which = NULL; + u_int32_t min_pri = 0; + u_int32_t level = 0; + + rtqbits = bsd4_rtqueuebits; + tsqbits = bsd4_queuebits; + idqbits = bsd4_idqueuebits; + cpumask = mycpu->gd_cpumask; + + /* Get the mask corresponding to the sysctl configured level */ + cpunode = bsd4_pcpu[mycpu->gd_cpuid].cpunode; + level = usched_bsd4_stick_to_level; + while (level) { + cpunode = cpunode->parent_node; + level--; + } + /* The cpus which can elect a process */ + siblings = cpunode->members; + +again: + if (rtqbits) { + pri = bsfl(rtqbits); + q = &bsd4_rtqueues[pri]; + which = &bsd4_rtqueuebits; + which2 = &rtqbits; + } else if (tsqbits) { + pri = bsfl(tsqbits); + q = &bsd4_queues[pri]; + which = &bsd4_queuebits; + which2 = &tsqbits; + } else if (idqbits) { + pri = bsfl(idqbits); + q = &bsd4_idqueues[pri]; + which = &bsd4_idqueuebits; + which2 = &idqbits; + } else { + return NULL; + } + lp = TAILQ_FIRST(q); + KASSERT(lp, ("chooseproc: no lwp on busy queue")); + + /* Limit the number of checks/queue to a configurable value to + * minimize the contention (we are in a locked region) + */ + for (checks = 0; checks < usched_bsd4_queue_checks; checks++) { + + if ((lp->lwp_cpumask & cpumask) == 0 || + ((siblings & lp->lwp_thread->td_gd->gd_cpumask) == 0 && + batchy_looser_pri_test(lp) && + (lp->lwp_setrunqueue_ticks == sched_ticks || + lp->lwp_setrunqueue_ticks == (int)(sched_ticks - 1)))) { + + KTR_COND_LOG(usched_chooseproc_cc_not_good, + lp->lwp_proc->p_pid == 
usched_bsd4_pid_debug, + lp->lwp_proc->p_pid, + lp->lwp_thread->td_gd->gd_cpumask, + siblings, + cpumask); + + cpunode = bsd4_pcpu[lp->lwp_thread->td_gd->gd_cpuid].cpunode; + level = 0; + while (cpunode) { + if (cpunode->members & cpumask) { + break; + } + cpunode = cpunode->parent_node; + level++; + } + if (level < min_level) { + min_level_lwp = lp; + min_level = level; + min_q = q; + min_which = which; + min_pri = pri; + } + + lp = TAILQ_NEXT(lp, lwp_procq); + if (lp == NULL) { + *which2 &= ~(1 << pri); + goto again; + } + } else { + KTR_COND_LOG(usched_chooseproc_cc_elected, + lp->lwp_proc->p_pid == usched_bsd4_pid_debug, + lp->lwp_proc->p_pid, + lp->lwp_thread->td_gd->gd_cpumask, + siblings, + cpumask); + + goto found; + } + } + lp = min_level_lwp; + q = min_q; + which = min_which; + pri = min_pri; + KASSERT(lp, ("chooseproc: at least the first lp was good")); + +found: + + /* + * If the passed lwp <chklp> is reasonably close to the selected + * lwp <lp>, return NULL (indicating that <chklp> should be kept). + * + * Note that we must error on the side of <chklp> to avoid bouncing + * between threads in the acquire code.
+ */ + if (chklp) { + if (chklp->lwp_priority < lp->lwp_priority + PPQ) + return(NULL); + } + + KTR_COND_LOG(usched_chooseproc_cc, + lp->lwp_proc->p_pid == usched_bsd4_pid_debug, + lp->lwp_proc->p_pid, + lp->lwp_thread->td_gd->gd_cpuid, + mycpu->gd_cpuid); + + TAILQ_REMOVE(q, lp, lwp_procq); + --bsd4_runqcount; + if (TAILQ_EMPTY(q)) + *which &= ~(1 << pri); + KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!")); + atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ); + return lp; +} + static void @@ -1087,7 +1582,7 @@ need_user_resched_remote(void *dummy) bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; need_user_resched(); - lwkt_schedule(&dd->helper_thread); + wakeup(&dd->helper_thread); } #endif @@ -1236,14 +1731,18 @@ sched_thread(void *dummy) */ lwkt_setpri_self(TDPRI_USER_SCHEDULER); + tsleep(&dd->helper_thread, PINTERLOCKED, "sched_thread_sleep", 0); + for (;;) { +//again: /* * We use the LWKT deschedule-interlock trick to avoid racing * bsd4_rdyprocmask. This means we cannot block through to the * manual lwkt_switch() call we make below. 
*/ crit_enter_gd(gd); - lwkt_deschedule_self(gd->gd_curthread); + //lwkt_deschedule_self(gd->gd_curthread); + tsleep_interlock(&dd->helper_thread, 0); spin_lock(&bsd4_spin); atomic_set_cpumask(&bsd4_rdyprocmask, mask); @@ -1256,6 +1755,13 @@ sched_thread(void *dummy) */ KKASSERT(dd->uschedcp == NULL); if ((nlp = chooseproc_locked(NULL)) != NULL) { + + KTR_COND_LOG(usched_sched_thread_no_process, + nlp->lwp_proc->p_pid == usched_bsd4_pid_debug, + gd->gd_cpuid, + nlp->lwp_proc->p_pid, + nlp->lwp_thread->td_gd->gd_cpuid); + atomic_set_cpumask(&bsd4_curprocmask, mask); dd->upri = nlp->lwp_priority; dd->uschedcp = nlp; @@ -1269,6 +1775,13 @@ sched_thread(void *dummy) } } else if (bsd4_runqcount) { if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) { + + KTR_COND_LOG(usched_sched_thread_process, + nlp->lwp_proc->p_pid == usched_bsd4_pid_debug, + gd->gd_cpuid, + nlp->lwp_proc->p_pid, + nlp->lwp_thread->td_gd->gd_cpuid); + dd->upri = nlp->lwp_priority; dd->uschedcp = nlp; spin_unlock(&bsd4_spin); @@ -1288,18 +1801,22 @@ sched_thread(void *dummy) * to priority test does not leave other unscheduled * cpus idle when the runqueue is not empty. */ - tmpmask = ~bsd4_curprocmask & bsd4_rdyprocmask & - smp_active_mask; + tmpmask = ~bsd4_curprocmask & + bsd4_rdyprocmask & smp_active_mask; if (tmpmask) { tmpid = BSFCPUMASK(tmpmask); tmpdd = &bsd4_pcpu[tmpid]; atomic_clear_cpumask(&bsd4_rdyprocmask, - CPUMASK(tmpid)); + CPUMASK(tmpid)); spin_unlock(&bsd4_spin); - lwkt_schedule(&tmpdd->helper_thread); + wakeup(&tmpdd->helper_thread); } else { spin_unlock(&bsd4_spin); } + + KTR_LOG(usched_sched_thread_no_process_found, + gd->gd_cpuid, + tmpmask); } } else { /* @@ -1314,10 +1831,29 @@ sched_thread(void *dummy) * for us if interrupts and such are pending. 
*/ crit_exit_gd(gd); - lwkt_switch(); + tsleep(&dd->helper_thread, PINTERLOCKED, "sched_thread_sleep", 0); +// lwkt_switch(); } } +/* sysctl stick_to_level parameter */ +static int +sysctl_usched_bsd4_stick_to_level(SYSCTL_HANDLER_ARGS) +{ + int error, new_val; + + new_val = usched_bsd4_stick_to_level; + + error = sysctl_handle_int(oidp, &new_val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (new_val > cpu_topology_levels_number - 1 || + new_val < 0) + return (EINVAL); + usched_bsd4_stick_to_level = new_val; + return (0); +} + /* * Setup our scheduler helpers. Note that curprocmask bit 0 has already * been cleared by rqinit() and we should not mess with it further. @@ -1325,38 +1861,184 @@ sched_thread(void *dummy) static void sched_thread_cpu_init(void) { - int i; + int i; + int cpuid; + int smt_not_supported = 0; + int cache_coherent_not_supported = 0; + if (bootverbose) + kprintf("Start scheduler helpers on cpus:\n"); - if (bootverbose) - kprintf("start scheduler helpers on cpus:"); + sysctl_ctx_init(&usched_bsd4_sysctl_ctx); + usched_bsd4_sysctl_tree = SYSCTL_ADD_NODE(&usched_bsd4_sysctl_ctx, + SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO, + "usched_bsd4", CTLFLAG_RD, 0, ""); - for (i = 0; i < ncpus; ++i) { - bsd4_pcpu_t dd = &bsd4_pcpu[i]; - cpumask_t mask = CPUMASK(i); + for (i = 0; i < ncpus; ++i) { + bsd4_pcpu_t dd = &bsd4_pcpu[i]; + cpumask_t mask = CPUMASK(i); - if ((mask & smp_active_mask) == 0) - continue; + if ((mask & smp_active_mask) == 0) + continue; - if (bootverbose) - kprintf(" %d", i); + dd->cpunode = get_cpu_node_by_cpuid(i); - lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread, - TDF_NOSTART, i, "usched %d", i); + if (dd->cpunode == NULL) { + smt_not_supported = 1; + cache_coherent_not_supported = 1; + if (bootverbose) + kprintf ("\tcpu%d - WARNING: No CPU NODE found for cpu\n", i); - /* - * Allow user scheduling on the target cpu. cpu #0 has already - * been enabled in rqinit(). 
- */ - if (i) - atomic_clear_cpumask(&bsd4_curprocmask, mask); - atomic_set_cpumask(&bsd4_rdyprocmask, mask); - dd->upri = PRIBASE_NULL; - } - if (bootverbose) - kprintf("\n"); + } else { + + switch (dd->cpunode->type) { + case THREAD_LEVEL: + if (bootverbose) + kprintf ("\tcpu%d - HyperThreading available. " + "Core siblings: ", i); + break; + case CORE_LEVEL: + smt_not_supported = 1; + + if (bootverbose) + kprintf ("\tcpu%d - No HT available, multi-core/physical " + "cpu. Physical siblings: ", i); + break; + case CHIP_LEVEL: + smt_not_supported = 1; + + if (bootverbose) + kprintf ("\tcpu%d - No HT available, single-core/physical cpu. " + "Package Siblings: ", i); + break; + default: + if (bootverbose) + kprintf ("\tcpu%d - Unknown cpunode->type. Siblings: ", i); + break; + } + + if (bootverbose) { + if (dd->cpunode->parent_node != NULL) { + CPUSET_FOREACH(cpuid, dd->cpunode->parent_node->members) + kprintf("cpu%d ", cpuid); + kprintf("\n"); + } else { + kprintf(" no siblings\n"); + } + } + } + + lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread, + 0, i, "usched %d", i); + + /* + * Allow user scheduling on the target cpu. cpu #0 has already + * been enabled in rqinit(). 
+ */ + if (i) + atomic_clear_cpumask(&bsd4_curprocmask, mask); + atomic_set_cpumask(&bsd4_rdyprocmask, mask); + dd->upri = PRIBASE_NULL; + + } + + /* usched_bsd4 sysctl configurable parameters */ + + SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, + SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), + OID_AUTO, "rrinterval", CTLFLAG_RW, + &usched_bsd4_rrinterval, 0, ""); + SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, + SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), + OID_AUTO, "decay", CTLFLAG_RW, + &usched_bsd4_decay, 0, "Extra decay when not running"); + SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, + SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), + OID_AUTO, "batch_time", CTLFLAG_RW, + &usched_bsd4_batch_time, 0, "Minimum batch counter value"); + + /* Add enable/disable option for SMT scheduling if supported */ + if (smt_not_supported) { + usched_bsd4_smt = 0; + SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx, + SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), + OID_AUTO, "smt", CTLFLAG_RD, + "NOT SUPPORTED", 0, "SMT NOT SUPPORTED"); + } else { + usched_bsd4_smt = 1; + SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, + SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), + OID_AUTO, "smt", CTLFLAG_RW, + &usched_bsd4_smt, 0, "Enable/Disable SMT scheduling"); + + } + + /* Add enable/disable option for cache coherent scheduling if supported */ + if (cache_coherent_not_supported) { +#ifdef SMP + usched_bsd4_cache_coherent = 0; + SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx, + SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), + OID_AUTO, "cache_coherent", CTLFLAG_RD, + "NOT SUPPORTED", 0, "Cache coherence NOT SUPPORTED"); +#endif + } else { +#ifdef SMP + usched_bsd4_cache_coherent = 1; + SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, + SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), + OID_AUTO, "cache_coherent", CTLFLAG_RW, + &usched_bsd4_cache_coherent, 0, + "Enable/Disable cache coherent scheduling"); +#endif + + SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, + SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), + OID_AUTO, "upri_affinity", CTLFLAG_RW, + 
&usched_bsd4_upri_affinity, 1, + "Number of PPQs in user priority check"); + + SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, + SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), + OID_AUTO, "queue_checks", CTLFLAG_RW, + &usched_bsd4_queue_checks, 5, + "Number of LWP to check from a queue before giving up"); + + SYSCTL_ADD_PROC(&usched_bsd4_sysctl_ctx, + SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), + OID_AUTO, "stick_to_level", CTLTYPE_INT | CTLFLAG_RW, + NULL, sizeof usched_bsd4_stick_to_level, + sysctl_usched_bsd4_stick_to_level, "I", + "Stick a process to this level. See sysctl" + "paremter hw.cpu_topology.level_description"); + } } SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND, sched_thread_cpu_init, NULL) +#else /* No SMP options - just add the configurable parameters to sysctl */ +static void +sched_sysctl_tree_init(void) +{ + sysctl_ctx_init(&usched_bsd4_sysctl_ctx); + usched_bsd4_sysctl_tree = SYSCTL_ADD_NODE(&usched_bsd4_sysctl_ctx, + SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO, + "usched_bsd4", CTLFLAG_RD, 0, ""); + + /* usched_bsd4 sysctl configurable parameters */ + SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, + SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), + OID_AUTO, "rrinterval", CTLFLAG_RW, + &usched_bsd4_rrinterval, 0, ""); + SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, + SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), + OID_AUTO, "decay", CTLFLAG_RW, + &usched_bsd4_decay, 0, "Extra decay when not running"); + SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, + SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), + OID_AUTO, "batch_time", CTLFLAG_RW, + &usched_bsd4_batch_time, 0, "Minimum batch counter value"); +} +SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND, + sched_sysctl_tree_init, NULL) #endif diff --git a/sys/sys/proc.h b/sys/sys/proc.h index ef1857e..4478db3 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -199,6 +199,7 @@ struct lwp { sysclock_t lwp_cpbase; /* Measurement base */ fixpt_t lwp_pctcpu; /* %cpu for this process */ u_int lwp_slptime; /* Time since last blocked. 
*/ + u_int lwp_setrunqueue_ticks; /* Tick count - lwp set on runqueue */ int lwp_traceflag; /* Kernel trace points. */ diff --git a/sys/sys/usched.h b/sys/sys/usched.h index 18bf7be..be4ad93 100644 --- a/sys/sys/usched.h +++ b/sys/sys/usched.h @@ -84,6 +84,7 @@ union usched_data { extern struct usched usched_bsd4; extern struct usched usched_dummy; extern cpumask_t usched_mastermask; +extern int sched_ticks; /* From sys/kern/kern_clock.c */ int usched_ctl(struct usched *, int); struct usched *usched_init(void);