diff --git a/SOURCES/numad-0.5git-m-option.patch b/SOURCES/numad-0.5git-m-option.patch new file mode 100644 index 00000000..8fee75f9 --- /dev/null +++ b/SOURCES/numad-0.5git-m-option.patch @@ -0,0 +1,21 @@ +Resolves: #1506477 + +commit cf6c2c029edc9c288122bcd603a72eb7f6d042d2 +Author: Jan Synacek +Date: Mon Oct 30 11:37:45 2017 +0100 + + recognize -m option correctly + +diff --git a/numad.c b/numad.c +index 4c85486..0721af4 100644 +--- a/numad.c ++++ b/numad.c +@@ -2395,7 +2395,7 @@ int main(int argc, char *argv[]) { + int x_flag = 0; + int tmp_int = 0; + long list_pid = 0; +- while ((opt = getopt(argc, argv, "C:dD:hH:i:K:l:p:r:R:S:t:u:vVw:x:")) != -1) { ++ while ((opt = getopt(argc, argv, "C:dD:hH:i:K:l:m:p:r:R:S:t:u:vVw:x:")) != -1) { + switch (opt) { + case 'C': + C_flag = 1; diff --git a/SOURCES/numad-0.5git-pthread.patch b/SOURCES/numad-0.5git-pthread.patch new file mode 100644 index 00000000..bb365300 --- /dev/null +++ b/SOURCES/numad-0.5git-pthread.patch @@ -0,0 +1,17 @@ +Remove linked libraries from Makefile. They break rebuilds from srpms on some +architectures. The linker flags are supplied from the spec. 
+ +Author: Jan Synacek +RH-Bugzilla: #825153 + +--- Makefile.orig 2012-09-11 08:29:18.965821127 +0200 ++++ Makefile 2012-09-11 08:29:29.391803358 +0200 +@@ -31,7 +31,7 @@ + + all: numad + +-numad: numad.o -lpthread -lrt ++numad: numad.o + + AR ?= ar + RANLIB ?= ranlib diff --git a/SOURCES/numad-0.5git-update-20140225.patch b/SOURCES/numad-0.5git-update-20140225.patch new file mode 100755 index 00000000..532e776b --- /dev/null +++ b/SOURCES/numad-0.5git-update-20140225.patch @@ -0,0 +1,2255 @@ +--- numad-0.5git/numad.c 2012-12-03 15:40:40.000000000 +0100 ++++ new-rhel7/numad.c 2014-02-27 10:02:58.000000000 +0100 +@@ -19,7 +19,7 @@ Inc., 59 Temple Place, Suite 330, Boston + */ + + +-// Compile with: gcc -O -std=gnu99 -Wall -pthread -o numad numad.c -lrt ++// Compile with: gcc -std=gnu99 -g -Wall -pthread -o numad numad.c -lrt -lm + + + #define _GNU_SOURCE +@@ -54,7 +54,7 @@ Inc., 59 Temple Place, Suite 330, Boston + #include + + +-#define VERSION_STRING "20121130" ++#define VERSION_STRING "20140225" + + + #define VAR_RUN_FILE "/var/run/numad.pid" +@@ -86,15 +86,11 @@ char *cpuset_dir_list[] = { + #define MAX_INTERVAL 15 + #define CPU_THRESHOLD 50 + #define MEMORY_THRESHOLD 300 ++#define THP_SCAN_SLEEP_MS 1000 + #define TARGET_UTILIZATION_PERCENT 85 +-#define IMPROVEMENT_THRESHOLD_PERCENT 5 ++#define DEFAULT_HTT_PERCENT 20 + + +-#define ELIM_NEW_LINE(s) \ +- if (s[strlen(s) - 1] == '\n') { \ +- s[strlen(s) - 1] = '\0'; \ +- } +- + #define CONVERT_DIGITS_TO_NUM(p, n) \ + n = *p++ - '0'; \ + while (isdigit(*p)) { \ +@@ -105,19 +101,36 @@ char *cpuset_dir_list[] = { + + int num_cpus = 0; + int num_nodes = 0; ++int threads_per_core = 0; + int page_size_in_bytes = 0; + int huge_page_size_in_bytes = 0; ++int thp_scan_sleep_ms = THP_SCAN_SLEEP_MS; + + int min_interval = MIN_INTERVAL; + int max_interval = MAX_INTERVAL; ++int htt_percent = DEFAULT_HTT_PERCENT; + int target_utilization = TARGET_UTILIZATION_PERCENT; + int scan_all_processes = 1; + int keep_interleaved_memory = 
0; ++int use_inactive_file_cache = 1; + + pthread_mutex_t pid_list_mutex; + pthread_mutex_t node_info_mutex; ++long sum_CPUs_total = 0; + int requested_mbs = 0; + int requested_cpus = 0; ++int got_sighup = 0; ++int got_sigterm = 0; ++int got_sigquit = 0; ++ ++ ++void sig_handler(int signum) { ++ switch (signum) { ++ case SIGHUP: got_sighup = 1; break; ++ case SIGTERM: got_sigterm = 1; break; ++ case SIGQUIT: got_sigquit = 1; break; ++ } ++} + + + +@@ -161,7 +174,9 @@ void open_log_file() { + + void close_log_file() { + if (log_fs != NULL) { +- fclose(log_fs); ++ if (log_fs != stderr) { ++ fclose(log_fs); ++ } + log_fs = NULL; + } + } +@@ -233,7 +248,6 @@ void send_msg(long dst_pid, long cmd, lo + } + + +- + typedef struct id_list { + // Use CPU_SET(3) cpuset bitmasks, + // but bundle size and pointer together +@@ -242,16 +256,22 @@ typedef struct id_list { + size_t bytes; + } id_list_t, *id_list_p; + +-#define INIT_ID_LIST(list_p) \ ++#define INIT_ID_LIST(list_p, num_elements) \ + list_p = malloc(sizeof(id_list_t)); \ + if (list_p == NULL) { numad_log(LOG_CRIT, "INIT_ID_LIST malloc failed\n"); exit(EXIT_FAILURE); } \ +- list_p->set_p = CPU_ALLOC(num_cpus); \ ++ list_p->set_p = CPU_ALLOC(num_elements); \ + if (list_p->set_p == NULL) { numad_log(LOG_CRIT, "CPU_ALLOC failed\n"); exit(EXIT_FAILURE); } \ +- list_p->bytes = CPU_ALLOC_SIZE(num_cpus); ++ list_p->bytes = CPU_ALLOC_SIZE(num_elements); ++ ++#define CLEAR_CPU_LIST(list_p) \ ++ if (list_p == NULL) { \ ++ INIT_ID_LIST(list_p, num_cpus); \ ++ } \ ++ CPU_ZERO_S(list_p->bytes, list_p->set_p) + +-#define CLEAR_LIST(list_p) \ ++#define CLEAR_NODE_LIST(list_p) \ + if (list_p == NULL) { \ +- INIT_ID_LIST(list_p); \ ++ INIT_ID_LIST(list_p, num_nodes); \ + } \ + CPU_ZERO_S(list_p->bytes, list_p->set_p) + +@@ -262,6 +282,9 @@ typedef struct id_list { + list_p = NULL; \ + } + ++#define COPY_LIST(orig_list_p, copy_list_p) \ ++ memcpy(copy_list_p->set_p, orig_list_p->set_p, orig_list_p->bytes) ++ + #define 
NUM_IDS_IN_LIST(list_p) CPU_COUNT_S(list_p->bytes, list_p->set_p) + #define ADD_ID_TO_LIST(k, list_p) CPU_SET_S(k, list_p->bytes, list_p->set_p) + #define CLR_ID_IN_LIST(k, list_p) CPU_CLR_S(k, list_p->bytes, list_p->set_p) +@@ -272,6 +295,25 @@ typedef struct id_list { + #define OR_LISTS( or_list_p, list_1_p, list_2_p) CPU_OR_S( or_list_p->bytes, or_list_p->set_p, list_1_p->set_p, list_2_p->set_p) + #define XOR_LISTS(xor_list_p, list_1_p, list_2_p) CPU_XOR_S(xor_list_p->bytes, xor_list_p->set_p, list_1_p->set_p, list_2_p->set_p) + ++int negate_list(id_list_p list_p) { ++ if (list_p == NULL) { ++ numad_log(LOG_CRIT, "Cannot negate a NULL list\n"); ++ exit(EXIT_FAILURE); ++ } ++ if (num_cpus < 1) { ++ numad_log(LOG_CRIT, "No CPUs to negate in list!\n"); ++ exit(EXIT_FAILURE); ++ } ++ for (int ix = 0; (ix < num_cpus); ix++) { ++ if (ID_IS_IN_LIST(ix, list_p)) { ++ CLR_ID_IN_LIST(ix, list_p); ++ } else { ++ ADD_ID_TO_LIST(ix, list_p); ++ } ++ } ++ return NUM_IDS_IN_LIST(list_p); ++} ++ + int add_ids_to_list_from_str(id_list_p list_p, char *s) { + if (list_p == NULL) { + numad_log(LOG_CRIT, "Cannot add to NULL list\n"); +@@ -340,6 +382,25 @@ terminate_string: + return (p - str_p); + } + ++id_list_p all_cpus_list_p = NULL; ++id_list_p all_nodes_list_p = NULL; ++char *reserved_cpu_str = NULL; ++id_list_p reserved_cpu_mask_list_p = NULL; ++uint64_t node_info_time_stamp = 0; ++ ++ ++int read_one_line(char *buf, int buf_size, char *fname) { ++ int fd = open(fname, O_RDONLY, 0); ++ if (fd < 0) { ++ return fd; ++ } ++ int bytes = read(fd, buf, buf_size); ++ if (buf[bytes - 1] == '\n') { ++ buf[bytes - 1] = '\0'; ++ } ++ close(fd); ++ return bytes; ++} + + + typedef struct node_data { +@@ -355,6 +416,16 @@ typedef struct node_data { + + node_data_p node = NULL; + ++uint64_t min_node_CPUs_free = MAXINT; ++uint64_t min_node_MBs_free = MAXINT; ++uint64_t max_node_CPUs_free = 0; ++uint64_t max_node_MBs_free = 0; ++uint64_t avg_node_CPUs_free = 0; ++uint64_t avg_node_MBs_free = 0; 
++double stddev_node_CPUs_free = 0.0; ++double stddev_node_MBs_free = 0.0; ++ ++ + // RING_BUF_SIZE must be a power of two + #define RING_BUF_SIZE 8 + +@@ -366,14 +437,14 @@ typedef struct process_data { + uint64_t data_time_stamp; // hundredths of seconds + uint64_t bind_time_stamp; + uint64_t num_threads; ++ uint64_t MBs_size; + uint64_t MBs_used; + uint64_t cpu_util; + uint64_t CPUs_used; // scaled * ONE_HUNDRED + uint64_t CPUs_used_ring_buf[RING_BUF_SIZE]; + int ring_buf_ix; +- int dup_bind_count; + char *comm; +- char *cpuset_name; ++ id_list_p node_list_p; + } process_data_t, *process_data_p; + + +@@ -454,12 +525,15 @@ int process_hash_update(process_data_p n + } + p->CPUs_used = max_CPUs_used; + } ++// FIXME: seems like this comm check should not be necessary every update ++// But it does happen only for candidates that cross the memory threshold... + if ((!p->comm) || (strcmp(p->comm, newp->comm))) { + if (p->comm) { + free(p->comm); + } + p->comm = strdup(newp->comm); + } ++ p->MBs_size = newp->MBs_size; + p->MBs_used = newp->MBs_used; + p->cpu_util = newp->cpu_util; + p->num_threads = newp->num_threads; +@@ -468,6 +542,11 @@ int process_hash_update(process_data_p n + return new_hash_table_entry; + } + ++void process_hash_clear_all_bind_time_stamps() { ++ for (int ix = 0; (ix < process_hash_table_size); ix++) { ++ process_hash_table[ix].bind_time_stamp = 0; ++ } ++} + + int process_hash_rehash(int old_ix) { + // Given the index of a table entry that would otherwise be orphaned by +@@ -489,7 +568,7 @@ int process_hash_remove(int pid) { + // remove the target + process_data_p dp = &process_hash_table[ix]; + if (dp->comm) { free(dp->comm); } +- if (dp->cpuset_name) { free(dp->cpuset_name); } ++ FREE_LIST(dp->node_list_p); + memset(dp, 0, sizeof(process_data_t)); + // bubble up the collision chain and rehash if neeeded + for (;;) { +@@ -543,15 +622,29 @@ void process_hash_table_dump() { + process_data_p p = &process_hash_table[ix]; + if (p->pid) { + 
numad_log(LOG_DEBUG, +- "ix: %d PID: %d %s Thds: %d CPU %ld MBs: %ld Data TS: %ld Bind TS: %ld\n", ++ "ix: %d PID: %d %s Thds: %d CPU %ld MBs: %ld/%ld Data TS: %ld Bind TS: %ld\n", + ix, p->pid, ((p->comm != NULL) ? p->comm : "(Null)"), p->num_threads, +- p->CPUs_used, p->MBs_used, p->data_time_stamp, p->bind_time_stamp); ++ p->CPUs_used, p->MBs_used, p->MBs_size, p->data_time_stamp, p->bind_time_stamp); ++ // FIXME: make this dump every field ++ } ++ } ++} ++ ++void remove_obsolete_cpuset_if_no_tasks(int pid) { ++ // PID parameter has already been checked via kill(0) and seems dead ++ char buf[BUF_SIZE]; ++ char fname[FNAME_SIZE]; ++ snprintf(fname, FNAME_SIZE, "%s/numad.%d/tasks", cpuset_dir, pid); ++ if ((access(fname, F_OK) == 0) && (read_one_line(buf, BUF_SIZE, fname) <= 1)) { ++ snprintf(fname, FNAME_SIZE, "%s/numad.%d", cpuset_dir, pid); ++ numad_log(LOG_NOTICE, "Removing obsolete cpuset: %s\n", fname); ++ if (rmdir(fname) < 0) { ++ numad_log(LOG_ERR, "bad cpuset rmdir\n"); + } + } + } + + void process_hash_table_cleanup(uint64_t update_time) { +- int cpusets_removed = 0; + int num_hash_entries_used = 0; + for (int ix = 0; (ix < process_hash_table_size); ix++) { + process_data_p p = &process_hash_table[ix]; +@@ -562,40 +655,56 @@ void process_hash_table_cleanup(uint64_t + p->data_time_stamp = 0; + p->CPUs_used = 0; + // Check for dead pids and remove them... +- char fname[FNAME_SIZE]; +- snprintf(fname, FNAME_SIZE, "/proc/%d", p->pid); +- if (access(fname, F_OK) < 0) { ++ if ((kill(p->pid, 0) == -1) && (errno == ESRCH)) { + // Seems dead. Forget this pid -- after first checking + // and removing obsolete numad.PID cpuset directories. 
+- snprintf(fname, FNAME_SIZE, "%s/numad.%d", cpuset_dir, p->pid); +- if (access(fname, F_OK) == 0) { +- numad_log(LOG_NOTICE, "Removing obsolete cpuset: %s\n", fname); +- int rc = rmdir(fname); +- if (rc >= 0) { +- cpusets_removed += 1; +- } else { +- numad_log(LOG_ERR, "bad cpuset rmdir\n"); +- // exit(EXIT_FAILURE); +- } +- } ++ remove_obsolete_cpuset_if_no_tasks(p->pid); + process_hash_remove(p->pid); + num_hash_entries_used -= 1; + } + } + } + } +- if (cpusets_removed > 0) { +- // Expire all the duplicate bind counts so things will be re-evaluated sooner. +- for (int ix = 0; (ix < process_hash_table_size); ix++) { +- process_hash_table[ix].dup_bind_count = 0; +- } +- } + // Keep hash table approximately half empty + if ((num_hash_entries_used * 7) / 4 > process_hash_table_size) { + process_hash_table_expand(); + } + } + ++static int name_starts_with_numad(const struct dirent *dptr) { ++ return (strncmp(dptr->d_name, "numad.", 6) == 0); ++} ++ ++void *clean_obsolete_cpusets(void *arg) { ++ // int arg_value = *(int *)arg; ++ for (;;) { ++ // Loop here forever (slowly) cleaning obsolete cpusets ++ sleep(571); // Arbitrary number a little less than ten minutes ++ struct dirent **namelist; ++ int files = scandir(cpuset_dir, &namelist, name_starts_with_numad, NULL); ++ if (files < 0) { ++ numad_log(LOG_ERR, "Troubled scanning for obsolete cpusets\n"); ++ continue; ++ } ++ for (int ix = 0; (ix < files); ix++) { ++ char *p = &(namelist[ix]->d_name[6]); ++ if (isdigit(*p)) { ++ int pid; ++ CONVERT_DIGITS_TO_NUM(p, pid); ++ // If it seems like a valid PID -- that is NOT in the hash ++ // table -- and the process appears to be dead, then try to ++ // delete the cpuset directory. (Dead PIDs we know about in ++ // the hash table will be cleaned separately.) 
++ if ((pid > 10) && (process_hash_lookup(pid) < 0) ++ && (kill(pid, 0) == -1) && (errno == ESRCH)) { ++ remove_obsolete_cpuset_if_no_tasks(pid); ++ } ++ } ++ free(namelist[ix]); ++ } ++ free(namelist); ++ } ++} + + + typedef struct pid_list { +@@ -610,9 +719,7 @@ pid_list_p insert_pid_into_pid_list(pid_ + if (process_hash_table != NULL) { + int hash_ix = process_hash_lookup(pid); + if ((hash_ix >= 0) && (list_ptr == include_pid_list)) { +- // Clear dup_bind_count and interleaved flag, +- // in case user wants it to be re-evaluated soon +- process_hash_table[hash_ix].dup_bind_count = 0; ++ // Clear interleaved flag, in case user wants it to be re-evaluated + process_hash_table[hash_ix].flags &= ~PROCESS_FLAG_INTERLEAVED; + } + } +@@ -678,17 +785,22 @@ void print_version_and_exit(char *prog_n + + void print_usage_and_exit(char *prog_name) { + fprintf(stderr, "Usage: %s ...\n", prog_name); ++ fprintf(stderr, "-C 1 to count inactive file cache as available memory (default 1)\n"); ++ fprintf(stderr, "-C 0 to count inactive file cache memory as unavailable (default 1)\n"); + fprintf(stderr, "-d for debug logging (same effect as '-l 7')\n"); + fprintf(stderr, "-D to specify cgroup mount point\n"); + fprintf(stderr, "-h to print this usage info\n"); ++ fprintf(stderr, "-H to set THP scan_sleep_ms (default 1000)\n"); + fprintf(stderr, "-i [:] to specify interval seconds\n"); +- fprintf(stderr, "-K 1 to keep interleaved memory spread across nodes\n"); +- fprintf(stderr, "-K 0 to merge interleaved memory to local NUMA nodes\n"); +- fprintf(stderr, "-l to specify logging level (usually 5, 6, or 7)\n"); ++ fprintf(stderr, "-K 1 to keep interleaved memory spread across nodes (default 0)\n"); ++ fprintf(stderr, "-K 0 to merge interleaved memory to local NUMA nodes (default 0)\n"); ++ fprintf(stderr, "-l to specify logging level (usually 5, 6, or 7 -- default 5)\n"); + fprintf(stderr, "-p to add PID to inclusion pid list\n"); + fprintf(stderr, "-r to remove PID from explicit pid 
lists\n"); +- fprintf(stderr, "-S 1 to scan all processes\n"); +- fprintf(stderr, "-S 0 to scan only explicit PID list processes\n"); ++ fprintf(stderr, "-R to reserve some CPUs for non-numad use\n"); ++ fprintf(stderr, "-S 1 to scan all processes (default 1)\n"); ++ fprintf(stderr, "-S 0 to scan only explicit PID list processes (default 1)\n"); ++ fprintf(stderr, "-t to specify thread / logical CPU percent (default 20)\n"); + fprintf(stderr, "-u to specify target utilization percent (default 85)\n"); + fprintf(stderr, "-v for verbose (same effect as '-l 6')\n"); + fprintf(stderr, "-V to show version info\n"); +@@ -698,6 +810,32 @@ void print_usage_and_exit(char *prog_nam + } + + ++void set_thp_scan_sleep_ms(int new_ms) { ++ if (new_ms < 1) { ++ // 0 means do not change the system default ++ return; ++ } ++ char *thp_scan_fname = "/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs"; ++ int fd = open(thp_scan_fname, O_RDWR, 0); ++ if (fd >= 0) { ++ char buf[BUF_SIZE]; ++ int bytes = read(fd, buf, BUF_SIZE); ++ if (bytes > 0) { ++ int cur_ms; ++ char *p = buf; ++ CONVERT_DIGITS_TO_NUM(p, cur_ms); ++ if (cur_ms != new_ms) { ++ lseek(fd, 0, SEEK_SET); ++ numad_log(LOG_NOTICE, "Changing THP scan time in %s from %d to %d ms.\n", thp_scan_fname, cur_ms, new_ms); ++ sprintf(buf, "%d\n", new_ms); ++ write(fd, buf, strlen(buf)); ++ } ++ } ++ close(fd); ++ } ++} ++ ++ + void check_prereqs(char *prog_name) { + // Verify cpusets are available on this system. + char **dir = &cpuset_dir_list[0]; +@@ -730,30 +868,8 @@ void check_prereqs(char *prog_name) { + fprintf(stderr, "\n"); + exit(EXIT_FAILURE); + } +- // Check on THP scan sleep time. 
+- char *thp_scan_fname = "/sys/kernel/mm/redhat_transparent_hugepage/khugepaged/scan_sleep_millisecs"; +- int fd = open(thp_scan_fname, O_RDONLY, 0); +- if (fd >= 0) { +- int ms; +- char buf[BUF_SIZE]; +- int bytes = read(fd, buf, BUF_SIZE); +- close(fd); +- if (bytes > 0) { +- char *p = buf; +- CONVERT_DIGITS_TO_NUM(p, ms); +- if (ms > 150) { +- fprintf(stderr, "\n"); +- numad_log(LOG_NOTICE, "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms); +- fprintf(stderr, "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms); +- fprintf(stderr, "Consider increasing the frequency of THP scanning,\n"); +- fprintf(stderr, "by echoing a smaller number (e.g. 100) to %s\n", thp_scan_fname); +- fprintf(stderr, "to more aggressively (re)construct THPs. For example:\n"); +- fprintf(stderr, "# echo 100 > /sys/kernel/mm/redhat_transparent_hugepage/khugepaged/scan_sleep_millisecs\n"); +- fprintf(stderr, "\n"); +- } +- } +- } +- // FIXME: ?? check for enabled ksmd, and recommend disabling ksm? ++ // Adjust kernel tunable to scan for THP more frequently... 
++ set_thp_scan_sleep_ms(thp_scan_sleep_ms); + } + + +@@ -831,6 +947,43 @@ fail_numad_run_file: + } + + ++int count_set_bits_in_hex_list_file(char *fname) { ++ int sum = 0; ++ int fd = open(fname, O_RDONLY, 0); ++ if (fd >= 0) { ++ char buf[BUF_SIZE]; ++ int bytes = read(fd, buf, BUF_SIZE); ++ close(fd); ++ for (int ix = 0; (ix < bytes); ix++) { ++ char c = tolower(buf[ix]); ++ switch (c) { ++ case '0' : sum += 0; break; ++ case '1' : sum += 1; break; ++ case '2' : sum += 1; break; ++ case '3' : sum += 2; break; ++ case '4' : sum += 1; break; ++ case '5' : sum += 2; break; ++ case '6' : sum += 2; break; ++ case '7' : sum += 3; break; ++ case '8' : sum += 1; break; ++ case '9' : sum += 2; break; ++ case 'a' : sum += 2; break; ++ case 'b' : sum += 3; break; ++ case 'c' : sum += 2; break; ++ case 'd' : sum += 3; break; ++ case 'e' : sum += 3; break; ++ case 'f' : sum += 4; break; ++ case ' ' : sum += 0; break; ++ case ',' : sum += 0; break; ++ case '\n' : sum += 0; break; ++ default : numad_log(LOG_CRIT, "Unexpected character in list\n"); exit(EXIT_FAILURE); ++ } ++ } ++ } ++ return sum; ++} ++ ++ + int get_num_cpus() { + int n1 = sysconf(_SC_NPROCESSORS_CONF); + int n2 = sysconf(_SC_NPROCESSORS_ONLN); +@@ -916,129 +1069,244 @@ static int name_starts_with_digit(const + } + + +-int bind_process_and_migrate_memory(int pid, char *cpuset_name, id_list_p node_list_p, id_list_p cpu_list_p) { +- // Check basic parameter validity. 
+- if (pid <= 0) { ++int write_to_cpuset_file(char *fname, char *s) { ++ int fd = open(fname, O_WRONLY | O_TRUNC, 0); ++ if (fd == -1) { ++ numad_log(LOG_CRIT, "Could not open %s -- errno: %d\n", fname, errno); ++ return -1; ++ } ++ numad_log(LOG_DEBUG, "Writing %s to: %s\n", s, fname); ++ if (write(fd, s, strlen(s)) <= 0) { ++ numad_log(LOG_CRIT, "Could not write %s to %s -- errno: %d\n", s, fname, errno); ++ return -1; ++ } ++ close(fd); ++ return 0; ++} ++ ++int configure_cpuset(char *cpuset_name, char *node_list_str, char *cpu_list_str) { ++ int rc = 0; ++ char fname[FNAME_SIZE]; ++ // Write "1" out to cpuset.memory_migrate file ++ snprintf(fname, FNAME_SIZE, "%s/cpuset.memory_migrate", cpuset_name); ++ rc += write_to_cpuset_file(fname, "1"); ++ // For memory binding, write node IDs out to cpuset.mems file ++ snprintf(fname, FNAME_SIZE, "%s/cpuset.mems", cpuset_name); ++ rc += write_to_cpuset_file(fname, node_list_str); ++ // For CPU binding, write CPU IDs out to cpuset.cpus file ++ snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", cpuset_name); ++ rc += write_to_cpuset_file(fname, cpu_list_str); ++ return rc; ++} ++ ++int bind_process_and_migrate_memory(process_data_p p) { ++ char buf[BUF_SIZE]; ++ char fname[FNAME_SIZE]; ++ char pid_cpuset_name[FNAME_SIZE]; ++ uint64_t t0 = get_time_stamp(); ++ // Parameter p is a pointer to an element in the hash table ++ if ((!p) || (p->pid < 1)) { + numad_log(LOG_CRIT, "Bad PID to bind\n"); + exit(EXIT_FAILURE); + } +- if ((cpuset_name == NULL) || (strlen(cpuset_name) == 0)) { +- numad_log(LOG_CRIT, "Bad cpuset name to bind\n"); ++ if (!p->node_list_p) { ++ numad_log(LOG_CRIT, "Cannot bind to unspecified node(s)\n"); + exit(EXIT_FAILURE); + } +- int nodes; +- if ((node_list_p == NULL) || ((nodes = NUM_IDS_IN_LIST(node_list_p)) == 0)) { +- numad_log(LOG_CRIT, "Cannot bind to unspecified node\n"); +- exit(EXIT_FAILURE); ++ // Get cpuset name for this PID, or make a new cpuset if necessary ++ snprintf(fname, FNAME_SIZE, 
"/proc/%d/cpuset", p->pid); ++ if (read_one_line(buf, BUF_SIZE, fname) <= 0) { ++ numad_log(LOG_WARNING, "Could not get cpuset of PID %d.\n", p->pid); ++ return 0; // Assume the process terminated + } +- // Cpu_list_p is optional and may be NULL... +- // Generate CPU id list from the specified node list if necessary +- if (cpu_list_p == NULL) { +- static id_list_p tmp_cpu_list_p; +- CLEAR_LIST(tmp_cpu_list_p); +- int node_id = 0; +- while (nodes) { +- if (ID_IS_IN_LIST(node_id, node_list_p)) { +- OR_LISTS(tmp_cpu_list_p, tmp_cpu_list_p, node[node_id].cpu_list_p); +- nodes -= 1; +- } +- node_id += 1; +- } +- cpu_list_p = tmp_cpu_list_p; +- } +- // Make the cpuset directory if necessary +- char cpuset_name_buf[FNAME_SIZE]; +- snprintf(cpuset_name_buf, FNAME_SIZE, "%s%s", cpuset_dir, cpuset_name); +- char *p = &cpuset_name_buf[strlen(cpuset_dir)]; +- if (!strcmp(p, "/")) { +- // Make a cpuset directory for this process +- snprintf(cpuset_name_buf, FNAME_SIZE, "%s/numad.%d", cpuset_dir, pid); +- numad_log(LOG_NOTICE, "Making new cpuset: %s\n", cpuset_name_buf); +- int rc = mkdir(cpuset_name_buf, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); +- if (rc == -1) { ++ if (!strcmp(buf, "/")) { ++ // Default cpuset name, so make a new cpuset directory for this PID ++ snprintf(pid_cpuset_name, FNAME_SIZE, "%s/numad.%d", cpuset_dir, p->pid); ++ numad_log(LOG_NOTICE, "Making new cpuset: %s\n", pid_cpuset_name); ++ if (mkdir(pid_cpuset_name, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH) < 0) { + numad_log(LOG_CRIT, "Bad cpuset mkdir -- errno: %d\n", errno); + return 0; + } ++ // Temporarily enable all CPUs for a new cpuset... 
++ char all_cpus_list_buf[BUF_SIZE]; ++ str_from_id_list(all_cpus_list_buf, BUF_SIZE, all_cpus_list_p); ++ // Write CPU IDs out to cpuset.cpus file for CPU binding of main PID ++ snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", pid_cpuset_name); ++ if (write_to_cpuset_file(fname, all_cpus_list_buf) < 0) { ++ numad_log(LOG_CRIT, "Could not configure cpuset.cpus: %s\n", pid_cpuset_name); ++ return 0; // Assume the process terminated ++ } ++ } else { ++ // Save the existing nondefault cpuset name for this PID ++ snprintf(pid_cpuset_name, FNAME_SIZE, "%s%s", cpuset_dir, buf); + } +- cpuset_name = cpuset_name_buf; +- // Now that we have a cpuset for pid and a populated cpulist, +- // start the actual binding and migration. +- uint64_t t0 = get_time_stamp(); +- ++ // Configure the main PID cpuset with desired nodes and memory migrate ++ // flag. Defer the CPU binding for the main PID until after the PID is ++ // actually written to the task file and the memory has been moved. ++ char node_list_buf[BUF_SIZE]; ++ str_from_id_list(node_list_buf, BUF_SIZE, p->node_list_p); + // Write "1" out to cpuset.memory_migrate file +- char fname[FNAME_SIZE]; +- snprintf(fname, FNAME_SIZE, "%s/cpuset.memory_migrate", cpuset_name); ++ snprintf(fname, FNAME_SIZE, "%s/cpuset.memory_migrate", pid_cpuset_name); ++ if (write_to_cpuset_file(fname, "1") < 0) { ++ numad_log(LOG_CRIT, "Could not configure cpuset: %s\n", pid_cpuset_name); ++ return 0; // Assume the process terminated ++ } ++ // For memory binding, write node IDs out to cpuset.mems file ++ snprintf(fname, FNAME_SIZE, "%s/cpuset.mems", pid_cpuset_name); ++ if (write_to_cpuset_file(fname, node_list_buf) < 0) { ++ numad_log(LOG_CRIT, "Could not configure cpuset: %s\n", pid_cpuset_name); ++ return 0; // Assume the process terminated ++ } ++ // Open the main PID cpuset tasks file and ++ // bind the main PID in the main cpuset now. 
++ snprintf(fname, FNAME_SIZE, "%s/tasks", pid_cpuset_name); + int fd = open(fname, O_WRONLY | O_TRUNC, 0); +- if (fd == -1) { +- numad_log(LOG_CRIT, "Could not open cpuset.memory_migrate -- errno: %d\n", errno); +- return 0; ++ if (fd < 0) { ++ numad_log(LOG_CRIT, "Could not open %s -- errno: %d\n", fname, errno); ++ return 0; // Assume the process terminated + } +- write(fd, "1", 1); +- close(fd); +- +- // Write node IDs out to cpuset.mems file +- char node_list_buf[BUF_SIZE]; +- snprintf(fname, FNAME_SIZE, "%s/cpuset.mems", cpuset_name); +- fd = open(fname, O_WRONLY | O_TRUNC, 0); +- if (fd == -1) { +- numad_log(LOG_CRIT, "Could not open cpuset.mems -- errno: %d\n", errno); +- return 0; ++ numad_log(LOG_NOTICE, "Including PID: %d in cpuset: %s\n", p->pid, pid_cpuset_name); ++ char pid_str[FNAME_SIZE]; ++ snprintf(pid_str, FNAME_SIZE, "%d", p->pid); ++ if (write(fd, pid_str, strlen(pid_str)) <= 0) { ++ numad_log(LOG_CRIT, "Could not write %s to cpuset: %s -- errno: %d\n", pid_str, pid_cpuset_name, errno); ++ close(fd); ++ return 0; // Assume the process terminated + } +- int len = str_from_id_list(node_list_buf, BUF_SIZE, node_list_p); +- write(fd, node_list_buf, len); +- close(fd); +- +- // Write CPU IDs out to cpuset.cpus file +- char cpu_list_buf[BUF_SIZE]; +- snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", cpuset_name); +- fd = open(fname, O_WRONLY | O_TRUNC, 0); +- if (fd == -1) { +- numad_log(LOG_CRIT, "Could not open cpuset.cpus -- errno: %d\n", errno); +- return 0; ++ // Generate CPU binding list derived from node bind list. 
++ static id_list_p cpu_bind_list_p; ++ CLEAR_CPU_LIST(cpu_bind_list_p); ++ int nodes = NUM_IDS_IN_LIST(p->node_list_p); ++ int node_id = 0; ++ while (nodes) { ++ if (ID_IS_IN_LIST(node_id, p->node_list_p)) { ++ OR_LISTS(cpu_bind_list_p, cpu_bind_list_p, node[node_id].cpu_list_p); ++ nodes -= 1; ++ } ++ node_id += 1; + } +- len = str_from_id_list(cpu_list_buf, BUF_SIZE, cpu_list_p); +- write(fd, cpu_list_buf, len); +- close(fd); +- +- // Copy pid tasks one at a time to tasks file +- snprintf(fname, FNAME_SIZE, "%s/tasks", cpuset_name); +- fd = open(fname, O_WRONLY | O_TRUNC, 0); +- if (fd == -1) { +- numad_log(LOG_CRIT, "Could not open tasks -- errno: %d\n", errno); +- return 0; ++ char cpu_bind_list_buf[BUF_SIZE]; ++ str_from_id_list(cpu_bind_list_buf, BUF_SIZE, cpu_bind_list_p); ++ // Write CPU IDs out to cpuset.cpus file for CPU binding of main PID ++ snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", pid_cpuset_name); ++ if (write_to_cpuset_file(fname, cpu_bind_list_buf) < 0) { ++ numad_log(LOG_CRIT, "Could not configure cpuset: %s\n", pid_cpuset_name); ++ return 0; // Assume the process terminated + } +- snprintf(fname, FNAME_SIZE, "/proc/%d/task", pid); ++ // Leave fd open in case process is multithreaded and we need to write more ++ // (sub) task IDs there. In case multithreaded, make sure all the subtasks ++ // for this PID are in a cpuset. If not already in cpuset, put them in the ++ // main cpuset. Start by getting the name list of all tasks for this PID. 
+ struct dirent **namelist; +- int files = scandir(fname, &namelist, name_starts_with_digit, NULL); +- if (files < 0) { +- numad_log(LOG_WARNING, "Could not scandir task list\n"); ++ snprintf(fname, FNAME_SIZE, "/proc/%d/task", p->pid); ++ int num_tasks = scandir(fname, &namelist, name_starts_with_digit, NULL); ++ if (num_tasks <= 0) { ++ numad_log(LOG_WARNING, "Could not scandir task list for PID: %d\n", p->pid); ++ close(fd); + return 0; // Assume the process terminated + } +- for (int ix = 0; (ix < files); ix++) { +- // copy pid tasks, one at a time +- numad_log(LOG_NOTICE, "Including task: %s\n", namelist[ix]->d_name); +- write(fd, namelist[ix]->d_name, strlen(namelist[ix]->d_name)); +- free(namelist[ix]); ++ if (num_tasks == 1) { ++ // This is the normal nonthreaded case. No sub tasks -- only the ++ // single main PID task, which is already bound above... ++ free(namelist[0]); ++ } else { ++ // Multithreaded so check all of the multiple subtasks. Avoid redundant ++ // subtask cpuset configuration by keeping a list of unique cpusets as ++ // we check each subtask. If the subtasks have only default cpuset ++ // names, bind those subtasks into the main cpuset with the main PID ++ // instead of adding them to the list. (cpuset_list is static so we ++ // can reuse the allocated array of pointers.) ++ int num_names = 0; ++ static char **cpuset_list; ++ static int cpuset_list_size; ++ for (int ix = 0; (ix < num_tasks); ix++) { ++ // Check the cpuset name for each task ++ if (!strcmp(namelist[ix]->d_name, pid_str)) { ++ // This is the main PID task, which is already bound above. Skip it here. ++ free(namelist[ix]); ++ continue; ++ } ++ snprintf(fname, FNAME_SIZE, "/proc/%d/task/%s/cpuset", p->pid, namelist[ix]->d_name); ++ if (read_one_line(buf, BUF_SIZE, fname) <= 0) { ++ numad_log(LOG_WARNING, "Could not open %s. 
Assuming thread completed.\n", fname); ++ free(namelist[ix]); ++ continue; ++ } ++ if (strcmp(buf, "/")) { ++ // Subtask already has a nondefault cpuset name. Add this ++ // subtask cpuset name to the list of unique cpuset names. Do ++ // sequential search comparisons first to verify uniqueness. ++ snprintf(fname, FNAME_SIZE, "%s%s", cpuset_dir, buf); ++ int iy = 0; ++ while (iy < num_names) { ++ if (!strcmp(fname, cpuset_list[iy])) { ++ break; // because we already have this cpuset name in the list ++ } ++ iy += 1; ++ } ++ if (iy == num_names) { ++ // We got to the end of the cpulist, so this is a new cpuset name not yet in the list ++ if (num_names == cpuset_list_size) { ++ if (cpuset_list_size == 0) { ++ cpuset_list_size = 10; ++ } else { ++ cpuset_list_size *= 2; ++ } ++ cpuset_list = realloc(cpuset_list, (cpuset_list_size * sizeof(char *))); ++ if (cpuset_list == NULL) { ++ numad_log(LOG_CRIT, "realloc failed\n"); ++ exit(EXIT_FAILURE); ++ } ++ } ++ // Configure this subtask cpuset and, if successful, save a ++ // copy of the name in the unique cpuset list. ++ if (configure_cpuset(fname, node_list_buf, cpu_bind_list_buf) < 0) { ++ numad_log(LOG_WARNING, "Could not configure cpuset %s. Assuming thread completed.\n", fname); ++ free(namelist[ix]); ++ continue; ++ } else { ++ cpuset_list[num_names++] = strdup(fname); ++ } ++ } ++ } else { ++ // This task ID has the default cpuset name. Just add this task ID to the main PID cpuset. ++ numad_log(LOG_NOTICE, "Including task: %s in cpuset: %s\n", namelist[ix]->d_name, pid_cpuset_name); ++ if (write(fd, namelist[ix]->d_name, strlen(namelist[ix]->d_name)) <= 0) { ++ numad_log(LOG_WARNING, "Could not write to cpuset: %s -- errno: %d\n", pid_cpuset_name, errno); ++ free(namelist[ix]); ++ continue; // Assuming thread completed. ++ } ++ } ++ free(namelist[ix]); ++ } ++ // Done with subtask unique cpuset names for this PID. Free them. 
++ for (int ix = 0; (ix < num_names); ix++) { ++ free(cpuset_list[ix]); ++ } + } + free(namelist); + close(fd); +- +- uint64_t t1 = get_time_stamp(); + // Check pid still active +- snprintf(fname, FNAME_SIZE, "/proc/%d", pid); ++ snprintf(fname, FNAME_SIZE, "/proc/%d", p->pid); + if (access(fname, F_OK) < 0) { +- numad_log(LOG_WARNING, "Could not migrate pid\n"); +- return 0; // Assume the process terminated ++ numad_log(LOG_WARNING, "Could not migrate pid %d\n", p->pid); ++ return 0; ++ } else { ++ uint64_t t1 = get_time_stamp(); ++ p->bind_time_stamp = t1; ++ numad_log(LOG_NOTICE, "PID %d moved to node(s) %s in %d.%d seconds\n", p->pid, node_list_buf, (t1-t0)/100, (t1-t0)%100); ++ return 1; + } +- numad_log(LOG_NOTICE, "PID %d moved to node(s) %s in %d.%d seconds\n", pid, node_list_buf, (t1-t0)/100, (t1-t0)%100); +- return 1; + } + + + void show_nodes() { +- time_t ts = time(NULL); +- fprintf(log_fs, "%s", ctime(&ts)); +- fprintf(log_fs, "Nodes: %d\n", num_nodes); ++ fprintf(log_fs, "\n"); ++ numad_log(LOG_INFO, "Nodes: %d\n", num_nodes); ++ fprintf(log_fs, "Min CPUs free: %ld, Max CPUs: %ld, Avg CPUs: %ld, StdDev: %lg\n", ++ min_node_CPUs_free, max_node_CPUs_free, avg_node_CPUs_free, stddev_node_CPUs_free); ++ fprintf(log_fs, "Min MBs free: %ld, Max MBs: %ld, Avg MBs: %ld, StdDev: %lg\n", ++ min_node_MBs_free, max_node_MBs_free, avg_node_MBs_free, stddev_node_MBs_free); + for (int ix = 0; (ix < num_nodes); ix++) { + fprintf(log_fs, "Node %d: MBs_total %ld, MBs_free %6ld, CPUs_total %ld, CPUs_free %4ld, Distance: ", + ix, node[ix].MBs_total, node[ix].MBs_free, node[ix].CPUs_total, node[ix].CPUs_free); +@@ -1049,7 +1317,6 @@ void show_nodes() { + str_from_id_list(buf, BUF_SIZE, node[ix].cpu_list_p); + fprintf(log_fs, " CPUs: %s\n", buf); + } +- fprintf(log_fs, "\n"); + fflush(log_fs); + } + +@@ -1065,7 +1332,7 @@ int cur_cpu_data_buf = 0; + + void update_cpu_data() { + // Parse idle percents from CPU stats in /proc/stat cpu lines +- static FILE *fs = NULL; ++ 
static FILE *fs; + if (fs != NULL) { + rewind(fs); + } else { +@@ -1107,7 +1374,8 @@ void update_cpu_data() { + while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; } // skip nice + while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; } // skip system + while (!isdigit(*p)) { p++; } +- uint64_t idle = *p++ - '0'; while (isdigit(*p)) { idle *= 10; idle += (*p++ - '0'); } ++ uint64_t idle; ++ CONVERT_DIGITS_TO_NUM(p, idle); + cpu_data_buf[new].idle[cpu_id] = idle; + } + } +@@ -1129,10 +1397,6 @@ int node_and_digits(const struct dirent + } + + +-id_list_p all_cpus_list_p = NULL; +-id_list_p all_nodes_list_p = NULL; +-uint64_t node_info_time_stamp = 0; +- + + int update_nodes() { + char fname[FNAME_SIZE]; +@@ -1141,6 +1405,7 @@ int update_nodes() { + uint64_t time_stamp = get_time_stamp(); + #define STATIC_NODE_INFO_DELAY (600 * ONE_HUNDRED) + if ((num_nodes == 0) || (node_info_time_stamp + STATIC_NODE_INFO_DELAY < time_stamp)) { ++ node_info_time_stamp = time_stamp; + // Count directory names of the form: /sys/devices/system/node/node + struct dirent **namelist; + int num_files = scandir ("/sys/devices/system/node", &namelist, node_and_digits, NULL); +@@ -1167,8 +1432,15 @@ int update_nodes() { + } + num_nodes = num_files; + } +- CLEAR_LIST(all_cpus_list_p); +- CLEAR_LIST(all_nodes_list_p); ++ sum_CPUs_total = 0; ++ CLEAR_CPU_LIST(all_cpus_list_p); ++ CLEAR_NODE_LIST(all_nodes_list_p); ++ // Figure out how many threads per core there are (for later discounting of hyper-threads) ++ threads_per_core = count_set_bits_in_hex_list_file("/sys/devices/system/cpu/cpu0/topology/thread_siblings"); ++ if (threads_per_core < 1) { ++ numad_log(LOG_CRIT, "Could not count threads per core\n"); ++ exit(EXIT_FAILURE); ++ } + // For each "node" filename present, save in node[ix].node_id + // Note that the node id might not necessarily match the node ix. + // Also populate the cpu lists and distance vectors for this node. 
+@@ -1185,10 +1457,22 @@ int update_nodes() { + int fd = open(fname, O_RDONLY, 0); + if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) { + // get cpulist from the cpulist string +- CLEAR_LIST(node[node_ix].cpu_list_p); ++ CLEAR_CPU_LIST(node[node_ix].cpu_list_p); + int n = add_ids_to_list_from_str(node[node_ix].cpu_list_p, buf); ++ if (reserved_cpu_str != NULL) { ++ AND_LISTS(node[node_ix].cpu_list_p, node[node_ix].cpu_list_p, reserved_cpu_mask_list_p); ++ n = NUM_IDS_IN_LIST(node[node_ix].cpu_list_p); ++ } + OR_LISTS(all_cpus_list_p, all_cpus_list_p, node[node_ix].cpu_list_p); +- node[node_ix].CPUs_total = n * ONE_HUNDRED; ++ // Calculate total CPUs, but possibly discount hyper-threads ++ if ((threads_per_core == 1) || (htt_percent >= 100)) { ++ node[node_ix].CPUs_total = n * ONE_HUNDRED; ++ } else { ++ n /= threads_per_core; ++ node[node_ix].CPUs_total = n * ONE_HUNDRED; ++ node[node_ix].CPUs_total += n * (threads_per_core - 1) * htt_percent; ++ } ++ sum_CPUs_total += node[node_ix].CPUs_total; + close(fd); + } else { + numad_log(LOG_CRIT, "Could not get node cpu list\n"); +@@ -1220,14 +1504,28 @@ int update_nodes() { + } + free(namelist); + } +- // Second, get the dynamic free memory and available CPU capacity ++ // Second, update the dynamic free memory and available CPU capacity ++ while (cpu_data_buf[cur_cpu_data_buf].time_stamp + 7 >= time_stamp) { ++ // Make sure at least 7/100 of a second has passed. ++ // Otherwise sleep for 1/10 second. 
++ struct timespec ts = { 0, 100000000 }; ++ nanosleep(&ts, &ts); ++ time_stamp = get_time_stamp(); ++ } + update_cpu_data(); ++ max_node_MBs_free = 0; ++ max_node_CPUs_free = 0; ++ min_node_MBs_free = MAXINT; ++ min_node_CPUs_free = MAXINT; ++ uint64_t sum_of_node_MBs_free = 0; ++ uint64_t sum_of_node_CPUs_free = 0; + for (int node_ix = 0; (node_ix < num_nodes); node_ix++) { + int node_id = node[node_ix].node_id; + // Get available memory info from node/meminfo file + snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/meminfo", node_id); + int fd = open(fname, O_RDONLY, 0); + if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) { ++ close(fd); + uint64_t KB; + char *p = strstr(buf, "MemTotal:"); + if (p != NULL) { +@@ -1238,7 +1536,7 @@ int update_nodes() { + } + while (!isdigit(*p)) { p++; } + CONVERT_DIGITS_TO_NUM(p, KB); +- node[node_ix].MBs_total = KB / KILOBYTE; ++ node[node_ix].MBs_total = (KB / KILOBYTE); + p = strstr(p, "MemFree:"); + if (p != NULL) { + p += 8; +@@ -1248,8 +1546,27 @@ int update_nodes() { + } + while (!isdigit(*p)) { p++; } + CONVERT_DIGITS_TO_NUM(p, KB); +- node[node_ix].MBs_free = KB / KILOBYTE; +- close(fd); ++ node[node_ix].MBs_free = (KB / KILOBYTE); ++ if (use_inactive_file_cache) { ++ // Add inactive file cache quantity to "free" memory ++ p = strstr(p, "Inactive(file):"); ++ if (p != NULL) { ++ p += 15; ++ } else { ++ numad_log(LOG_CRIT, "Could not get node Inactive(file)\n"); ++ exit(EXIT_FAILURE); ++ } ++ while (!isdigit(*p)) { p++; } ++ CONVERT_DIGITS_TO_NUM(p, KB); ++ node[node_ix].MBs_free += (KB / KILOBYTE); ++ } ++ sum_of_node_MBs_free += node[node_ix].MBs_free; ++ if (min_node_MBs_free > node[node_ix].MBs_free) { ++ min_node_MBs_free = node[node_ix].MBs_free; ++ } ++ if (max_node_MBs_free < node[node_ix].MBs_free) { ++ max_node_MBs_free = node[node_ix].MBs_free; ++ } + } else { + numad_log(LOG_CRIT, "Could not get node meminfo\n"); + exit(EXIT_FAILURE); +@@ -1260,7 +1577,8 @@ int update_nodes() { + if 
(cpu_data_buf[old_cpu_data_buf].time_stamp > 0) { + uint64_t idle_ticks = 0; + int cpu = 0; +- int num_cpus_to_process = node[node_ix].CPUs_total / ONE_HUNDRED; ++ int num_lcpus = NUM_IDS_IN_LIST(node[node_ix].cpu_list_p); ++ int num_cpus_to_process = num_lcpus; + while (num_cpus_to_process) { + if (ID_IS_IN_LIST(cpu, node[node_ix].cpu_list_p)) { + idle_ticks += cpu_data_buf[cur_cpu_data_buf].idle[cpu] +@@ -1274,15 +1592,45 @@ int update_nodes() { + // printf("Node: %d CPUs: %ld time diff %ld Idle ticks %ld\n", node_id, node[node_ix].CPUs_total, time_diff, idle_ticks); + // assert(time_diff > 0); + node[node_ix].CPUs_free = (idle_ticks * ONE_HUNDRED) / time_diff; ++ // Possibly discount hyper-threads ++ if ((threads_per_core > 1) && (htt_percent < 100)) { ++ uint64_t htt_discount = (num_lcpus - (num_lcpus / threads_per_core)) * (100 - htt_percent); ++ if (node[node_ix].CPUs_free > htt_discount) { ++ node[node_ix].CPUs_free -= htt_discount; ++ } else { ++ node[node_ix].CPUs_free = 0; ++ } ++ } + if (node[node_ix].CPUs_free > node[node_ix].CPUs_total) { + node[node_ix].CPUs_free = node[node_ix].CPUs_total; + } ++ sum_of_node_CPUs_free += node[node_ix].CPUs_free; ++ if (min_node_CPUs_free > node[node_ix].CPUs_free) { ++ min_node_CPUs_free = node[node_ix].CPUs_free; ++ } ++ if (max_node_CPUs_free < node[node_ix].CPUs_free) { ++ max_node_CPUs_free = node[node_ix].CPUs_free; ++ } + node[node_ix].magnitude = node[node_ix].CPUs_free * node[node_ix].MBs_free; + } else { + node[node_ix].CPUs_free = 0; + node[node_ix].magnitude = 0; + } + } ++ avg_node_MBs_free = sum_of_node_MBs_free / num_nodes; ++ avg_node_CPUs_free = sum_of_node_CPUs_free / num_nodes; ++ double MBs_variance_sum = 0.0; ++ double CPUs_variance_sum = 0.0; ++ for (int node_ix = 0; (node_ix < num_nodes); node_ix++) { ++ double MBs_diff = (double)node[node_ix].MBs_free - (double)avg_node_MBs_free; ++ double CPUs_diff = (double)node[node_ix].CPUs_free - (double)avg_node_CPUs_free; ++ MBs_variance_sum += MBs_diff 
* MBs_diff; ++ CPUs_variance_sum += CPUs_diff * CPUs_diff; ++ } ++ double MBs_variance = MBs_variance_sum / (num_nodes); ++ double CPUs_variance = CPUs_variance_sum / (num_nodes); ++ stddev_node_MBs_free = sqrt(MBs_variance); ++ stddev_node_CPUs_free = sqrt(CPUs_variance); + if (log_level >= LOG_INFO) { + show_nodes(); + } +@@ -1316,7 +1664,7 @@ typedef struct stat_data { + int64_t num_threads; // 19 + int64_t itrealvalue; + uint64_t starttime; +- uint64_t vsize; ++ uint64_t vsize; // 22 + int64_t rss; // 23 + uint64_t rsslim; + uint64_t startcode; +@@ -1361,10 +1709,11 @@ process_data_p get_stat_data_for_pid(int + return NULL; + } + close(fd); ++ uint64_t val; + char *p = buf; + static process_data_t data; + // Get PID from field 0 +- uint64_t val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); } ++ CONVERT_DIGITS_TO_NUM(p, val); + data.pid = val; + // Copy comm from field 1 + while (*p == ' ') { p++; } +@@ -1373,23 +1722,27 @@ process_data_p get_stat_data_for_pid(int + // Skip fields 2 through 12 + for (int ix = 0; (ix < 11); ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } } + // Get utime from field 13 for cpu_util +- val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); } ++ CONVERT_DIGITS_TO_NUM(p, val); + data.cpu_util = val; + // Get stime from field 14 to add on to cpu_util (which already has utime) + while (*p == ' ') { p++; } +- val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); } ++ CONVERT_DIGITS_TO_NUM(p, val); + data.cpu_util += val; + // Skip fields 15 through 18 + while (*p == ' ') { p++; } + for (int ix = 0; (ix < 4); ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } } + // Get num_threads from field 19 +- val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); } ++ CONVERT_DIGITS_TO_NUM(p, val); + data.num_threads = val; +- // Skip fields 20 through 22 ++ // Skip fields 20 through 21 + while (*p == ' ') { p++; } +- for (int ix = 0; (ix < 3); ix++) 
{ while (*p != ' ') { p++; } while (*p == ' ') { p++; } } ++ for (int ix = 0; (ix < 2); ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } } ++ // Get vsize from field 22 to compute MBs_size ++ CONVERT_DIGITS_TO_NUM(p, val); ++ data.MBs_size = val / MEGABYTE; + // Get rss from field 23 to compute MBs_used +- val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); } ++ while (*p == ' ') { p++; } ++ CONVERT_DIGITS_TO_NUM(p, val); + data.MBs_used = (val * page_size_in_bytes) / MEGABYTE; + // Return pointer to data + return &data; +@@ -1471,20 +1824,79 @@ int update_processes() { + } + + ++int initialize_mem_node_list(process_data_p p) { ++ // Parameter p is a pointer to an element in the hash table ++ if ((!p) || (p->pid < 1)) { ++ numad_log(LOG_CRIT, "Cannot initialize mem node lists with bad PID\n"); ++ exit(EXIT_FAILURE); ++ } ++ int n = 0; ++ char fname[FNAME_SIZE]; ++ char buf[BIG_BUF_SIZE]; ++ CLEAR_NODE_LIST(p->node_list_p); ++ snprintf(fname, FNAME_SIZE, "/proc/%d/status", p->pid); ++ int fd = open(fname, O_RDONLY, 0); ++ if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) { ++ close(fd); ++ char *list_str_p = strstr(buf, "Mems_allowed_list:"); ++ if (!list_str_p) { ++ numad_log(LOG_CRIT, "Could not get node Mems_allowed_list\n"); ++ exit(EXIT_FAILURE); ++ } ++ list_str_p += 18; ++ while (!isdigit(*list_str_p)) { list_str_p++; } ++ n = add_ids_to_list_from_str(p->node_list_p, list_str_p); ++ } else { ++ numad_log(LOG_WARNING, "Tried to research PID %d, but it apparently went away.\n", p->pid); ++ return 0; // Assume the process terminated ++ } ++ if (n < num_nodes) { ++ // If process already bound to a subset of nodes when we discover it, ++ // set initial bind_time_stamp to 30 minutes ago... 
++ p->bind_time_stamp = get_time_stamp() - (1800 * ONE_HUNDRED); ++ } ++ return n; ++} ++ ++ + +-id_list_p pick_numa_nodes(int pid, int cpus, int mbs) { +- char buf[BUF_SIZE]; +- char buf2[BUF_SIZE]; ++ ++uint64_t combined_value_of_weighted_resources(int ix, int mbs, int cpus, uint64_t MBs_free, uint64_t CPUs_free) { ++ int64_t needed_mem; ++ int64_t needed_cpu; ++ int64_t excess_mem; ++ int64_t excess_cpu; ++ if (MBs_free > mbs) { ++ needed_mem = mbs; ++ excess_mem = MBs_free - mbs; ++ } else { ++ needed_mem = MBs_free; ++ excess_mem = 0; ++ } ++ if (CPUs_free > cpus) { ++ needed_cpu = cpus; ++ excess_cpu = CPUs_free - cpus; ++ } else { ++ needed_cpu = CPUs_free; ++ excess_cpu = 0; ++ } ++ // Weight the available resources, and then calculate magnitude as ++ // product of available CPUs and available MBs. ++ int64_t memfactor = (needed_mem * 10 + excess_mem * 3); ++ int64_t cpufactor = (needed_cpu * 8 + excess_cpu * 1); ++ numad_log(LOG_DEBUG, " Node[%d]: mem: %ld cpu: %ld\n", ix, memfactor, cpufactor); ++ return (memfactor * cpufactor); ++} ++ ++ ++id_list_p pick_numa_nodes(int pid, int cpus, int mbs, int assume_enough_cpus) { + if (log_level >= LOG_DEBUG) { + numad_log(LOG_DEBUG, "PICK NODES FOR: PID: %d, CPUs %d, MBs %d\n", pid, cpus, mbs); + } +- int num_existing_mems = 0; +- static id_list_p existing_mems_list_p; +- CLEAR_LIST(existing_mems_list_p); +- uint64_t time_stamp = get_time_stamp(); ++ char buf[BUF_SIZE]; ++ uint64_t process_CPUs = 0; + static node_data_p tmp_node; + static uint64_t *process_MBs; +- static uint64_t *saved_magnitude_for_node; + static int process_MBs_num_nodes; + // See if dynamic structures need to grow. 
+ if (process_MBs_num_nodes < num_nodes + 1) { +@@ -1492,121 +1904,25 @@ id_list_p pick_numa_nodes(int pid, int c + // The "+1 node" is for accumulating interleaved memory + process_MBs = realloc(process_MBs, process_MBs_num_nodes * sizeof(uint64_t)); + tmp_node = realloc(tmp_node, num_nodes * sizeof(node_data_t) ); +- saved_magnitude_for_node = realloc(saved_magnitude_for_node, num_nodes * sizeof(uint64_t)); +- if ((process_MBs == NULL) || (tmp_node == NULL) || (saved_magnitude_for_node == NULL)) { ++ if ((process_MBs == NULL) || (tmp_node == NULL)) { + numad_log(LOG_CRIT, "process_MBs realloc failed\n"); + exit(EXIT_FAILURE); + } + } ++ + // For existing processes, get miscellaneous process specific details + int pid_ix; + process_data_p p = NULL; + if ((pid > 0) && ((pid_ix = process_hash_lookup(pid)) >= 0)) { + p = &process_hash_table[pid_ix]; +- // Quick rejection if this process has interleaved memory, but recheck it once an hour... +-#define MIN_DELAY_FOR_INTERLEAVE (3600 * ONE_HUNDRED) +- if (((p->flags & PROCESS_FLAG_INTERLEAVED) > 0) +- && (p->bind_time_stamp + MIN_DELAY_FOR_INTERLEAVE > time_stamp)) { +- if (log_level >= LOG_DEBUG) { +- numad_log(LOG_DEBUG, "Skipping evaluation because of interleaved memory.\n"); +- } +- return NULL; +- } +- // Get cpuset name for this process, and existing mems binding, if any. +- char fname[FNAME_SIZE]; +- snprintf(fname, FNAME_SIZE, "/proc/%d/cpuset", pid); +- FILE *fs = fopen(fname, "r"); +- if (!fs) { +- numad_log(LOG_WARNING, "Tried to research PID %d cpuset, but it apparently went away.\n", p->pid); +- return NULL; // Assume the process terminated? +- } +- if (!fgets(buf, BUF_SIZE, fs)) { +- numad_log(LOG_WARNING, "Tried to research PID %d cpuset, but it apparently went away.\n", p->pid); +- fclose(fs); +- return NULL; // Assume the process terminated? 
+- } +- fclose(fs); +- ELIM_NEW_LINE(buf); +- if ((!p->cpuset_name) || (strcmp(p->cpuset_name, buf))) { +- if (p->cpuset_name != NULL) { +- free(p->cpuset_name); +- } +- p->cpuset_name = strdup(buf); +- } +- if (log_level >= LOG_DEBUG) { +- numad_log(LOG_DEBUG, "CPUSET_NAME: %s\n", p->cpuset_name); +- } +- snprintf(fname, FNAME_SIZE, "%s%s/cpuset.mems", cpuset_dir, p->cpuset_name); +- fs = fopen(fname, "r"); +- if ((fs) && (fgets(buf, BUF_SIZE, fs))) { +- fclose(fs); +- num_existing_mems = add_ids_to_list_from_str(existing_mems_list_p, buf); +- if (log_level >= LOG_DEBUG) { +- str_from_id_list(buf, BUF_SIZE, existing_mems_list_p); +- numad_log(LOG_DEBUG, "EXISTING CPUSET NODE LIST: %s\n", buf); +- } +- } +- // If this process was just recently bound, enforce a minimum delay +- // period between repeated attempts to potentially move the memory. +- // FIXME: ?? might this retard appropriate process expansion too much? +-#define MIN_DELAY_FOR_REEVALUATION (30 * ONE_HUNDRED) +- if (p->bind_time_stamp + MIN_DELAY_FOR_REEVALUATION > time_stamp) { +- // Skip re-evaluation because we just did it recently. +- if (log_level >= LOG_DEBUG) { +- numad_log(LOG_DEBUG, "Skipping evaluation because done too recently.\n"); +- } +- return NULL; +- } +- // Look for short cut because of duplicate bindings. If we have bound +- // this process to the same nodes multiple times already, and the load +- // on those nodes still seems acceptable, skip the rest of this and +- // just return NULL to indicate no change needed. FIXME: should figure +- // out what can change that would make a rebinding desirable (e.g. (1) +- // some process gets sub-optimal allocation on busy machine which +- // subsequently becomes less busy leaving disadvantaged process. (2) +- // node load imbalance, (3) any process split across nodes which should +- // fit within a single node.) For now, just expire the dup_bid_count +- // occasionally, which is a reasonably good mitigation. 
+- // So, check to see if we should decay the dup_bind_count... +-#define DUP_BIND_TIME_OUT (300 * ONE_HUNDRED) +- if ((p->dup_bind_count > 0) && (p->bind_time_stamp + DUP_BIND_TIME_OUT < time_stamp)) { +- p->dup_bind_count -= 1; +- } +- // Now, look for short cut because of duplicate bindings +- if (p->dup_bind_count > 0) { +- int node_id = 0; +- int nodes_have_cpu = 1; +- int nodes_have_ram = 1; +- int n = num_existing_mems; +- int min_resource_pct = 100 - target_utilization; +- if (min_resource_pct < 5) { +- min_resource_pct = 5; +- } +- while (n) { +- if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) { +- nodes_have_cpu &= ((100 * node[node_id].CPUs_free / node[node_id].CPUs_total) >= (min_resource_pct)); +- nodes_have_ram &= ((100 * node[node_id].MBs_free / node[node_id].MBs_total) >= (min_resource_pct)); +- n -= 1; +- } +- node_id += 1; +- } +- if ((nodes_have_cpu) && (nodes_have_ram)) { +- if (log_level >= LOG_DEBUG) { +- numad_log(LOG_DEBUG, "Skipping evaluation because of repeat binding\n"); +- } +- return NULL; +- } +- if (log_level >= LOG_DEBUG) { +- numad_log(LOG_DEBUG, "Evaluated for skipping by repeat binding, but CPUS: %d, RAM: %d\n", nodes_have_cpu, nodes_have_ram); +- } +- } +- // Fourth, add up per-node memory in use by this process. This scanning +- // is expensive and should be minimized. Also, old kernels dismantle +- // transparent huge pages while producing the numa_maps memory +- // information! ++ // Correct current CPUs amount for utilization factor inflation ++ process_CPUs = (cpus * target_utilization) / 100; ++ // Add up per-node memory in use by this process. ++ // This scanning is expensive and should be minimized. 
+ memset(process_MBs, 0, process_MBs_num_nodes * sizeof(uint64_t)); ++ char fname[FNAME_SIZE]; + snprintf(fname, FNAME_SIZE, "/proc/%d/numa_maps", pid); +- fs = fopen(fname, "r"); ++ FILE *fs = fopen(fname, "r"); + if (!fs) { + numad_log(LOG_WARNING, "Tried to research PID %d numamaps, but it apparently went away.\n", p->pid); + return NULL; // Assume the process terminated +@@ -1645,84 +1961,103 @@ id_list_p pick_numa_nodes(int pid, int c + fclose(fs); + for (int ix = 0; (ix <= num_nodes); ix++) { + process_MBs[ix] /= MEGABYTE; +- if (log_level >= LOG_DEBUG) { +- numad_log(LOG_DEBUG, "PROCESS_MBs[%d]: %ld\n", ix, process_MBs[ix]); ++ if (p->bind_time_stamp) { ++ if ((process_MBs[ix]) && (!ID_IS_IN_LIST(ix, p->node_list_p))) { ++ // FIXME: If process previously bound, but memory appears ++ // to exist where it should not, this might identify ++ // processes for which the kernel does not move all the ++ // memory for whatever reason.... Must check for ++ // significant amount before doing anything about it, ++ // however, since memory for libraries, etc, can get moved ++ // around. ++ } ++ } else { ++ // If process has not yet been bound, set node list to existing nodes with memory ++ if (process_MBs[ix]) { ++ ADD_ID_TO_LIST(ix, p->node_list_p); ++ } else { ++ CLR_ID_IN_LIST(ix, p->node_list_p); ++ } ++ } ++ if ((log_level >= LOG_DEBUG) && (process_MBs[ix] > 0)) { ++ if (ix == num_nodes) { ++ numad_log(LOG_DEBUG, "Interleaved MBs: %ld\n", process_MBs[ix]); ++ } else { ++ numad_log(LOG_DEBUG, "PROCESS_MBs[%d]: %ld\n", ix, process_MBs[ix]); ++ } + } + } + if ((process_has_interleaved_memory) && (keep_interleaved_memory)) { + // Mark this process as having interleaved memory so we do not +- // merge the interleaved memory. Time stamp it as done. ++ // merge the interleaved memory. Time stamp it as done and return. 
+ p->flags |= PROCESS_FLAG_INTERLEAVED; + p->bind_time_stamp = get_time_stamp(); + if (log_level >= LOG_DEBUG) { +- numad_log(LOG_DEBUG, "Skipping evaluation because of interleaved memory.\n"); ++ numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because of interleaved memory.\n", p->pid); + } + return NULL; + } + } // end of existing PID conditional ++ + // Make a copy of node available resources array. Add in info specific to + // this process to equalize available resource quantities wrt locations of +- // resources already in use by this process. Inflate the value of already +- // assigned memory by approximately 3/2, because moving memory is +- // expensive. Average the amount of CPUs_free across the existing nodes +- // used, because the threads are free to move around in that domain. After +- // calculating combined magnitude of available resources, bias the values +- // towards existing locations for this process. +- int target_using_all_nodes = 0; +- uint64_t node_CPUs_free_for_this_process = 0; ++ // resources already in use by this process. After calculating weighted ++ // magnitude of available resources, bias the values towards existing ++ // locations for this process. + memcpy(tmp_node, node, num_nodes * sizeof(node_data_t) ); +- if (num_existing_mems > 0) { +- node_CPUs_free_for_this_process = cpus; // ?? Correct for utilization target inflation? 
+- int node_id = 0; +- int n = num_existing_mems; +- while (n) { +- if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) { +- node_CPUs_free_for_this_process += tmp_node[node_id].CPUs_free; +- n -= 1; +- } +- node_id += 1; +- } +- // Divide to get average CPUs_free for the nodes in use by process +- node_CPUs_free_for_this_process /= num_existing_mems; +- } + for (int ix = 0; (ix < num_nodes); ix++) { +- if (pid > 0) { +- tmp_node[ix].MBs_free += ((process_MBs[ix] * 12) / 8); +- } +- if ((num_existing_mems > 0) && (ID_IS_IN_LIST(ix, existing_mems_list_p))) { +- tmp_node[ix].CPUs_free = node_CPUs_free_for_this_process; ++ // Add back (biased) memory already used by this process on this node ++ tmp_node[ix].MBs_free += ((process_MBs[ix] * 8) / 8); // FIXME: apply bias here? ++ if (tmp_node[ix].MBs_free > tmp_node[ix].MBs_total) { ++ tmp_node[ix].MBs_free = tmp_node[ix].MBs_total; ++ } ++ // Add back CPU in proportion to amount of memory already used on this ++ // node Making assumption here that CPU execution threads are actually ++ // running on the same nodes where memory is assigned... FIXME: should ++ // we perhaps do this only if process already explicitly bound? 
++ uint64_t prorated_CPU = (process_CPUs * process_MBs[ix]) / mbs; ++ if ((log_level >= LOG_DEBUG) && (prorated_CPU > 0)) { ++ numad_log(LOG_DEBUG, "PROCESS_CPUs[%d]: %ld\n", ix, prorated_CPU); + } ++ tmp_node[ix].CPUs_free += prorated_CPU; + if (tmp_node[ix].CPUs_free > tmp_node[ix].CPUs_total) { + tmp_node[ix].CPUs_free = tmp_node[ix].CPUs_total; + } +- if (log_level >= LOG_DEBUG) { +- numad_log(LOG_DEBUG, "PROCESS_CPUs[%d]: %ld\n", ix, tmp_node[ix].CPUs_free); ++ if (tmp_node[ix].CPUs_free < 1) { ++ // enforce 1/100th CPU minimum ++ tmp_node[ix].CPUs_free = 1; + } +- // Calculate magnitude as product of available CPUs and available MBs +- tmp_node[ix].magnitude = tmp_node[ix].CPUs_free * tmp_node[ix].MBs_free; ++ // numad_log(LOG_DEBUG, "Raw Node[%d]: mem: %ld cpu: %ld\n", ix, tmp_node[ix].MBs_free, tmp_node[ix].CPUs_free); ++ tmp_node[ix].magnitude = combined_value_of_weighted_resources(ix, mbs, cpus, tmp_node[ix].MBs_free, tmp_node[ix].CPUs_free); + // Bias combined magnitude towards already assigned nodes +- if (ID_IS_IN_LIST(ix, existing_mems_list_p)) { +- tmp_node[ix].magnitude *= 9; +- tmp_node[ix].magnitude /= 8; ++ if ((pid > 0) && (ID_IS_IN_LIST(ix, p->node_list_p))) { ++ tmp_node[ix].magnitude *= 17; ++ tmp_node[ix].magnitude /= 16; + } +- // Save the current magnitudes +- saved_magnitude_for_node[ix] = tmp_node[ix].magnitude; + } +- // OK, figure out where to get resources for this request. +- static id_list_p target_node_list_p; +- CLEAR_LIST(target_node_list_p); ++ ++ // Figure out where to get resources for this request. + int prev_node_used = -1; +- // Continue to allocate more resources until request are met. +- // OK if not not quite all the CPU request is met. +- // FIXME: ?? Is half of the utilization margin a good amount of CPU flexing? 
+- int cpu_flex = ((100 - target_utilization) * tmp_node[0].CPUs_total) / 200; +- if (pid <= 0) { +- // If trying to find resources for pre-placement advice request, do not +- // underestimate the amount of CPUs needed. Instead, err on the side +- // of providing too many resources. So, no flexing here... +- cpu_flex = 0; ++ static id_list_p target_node_list_p; ++ CLEAR_NODE_LIST(target_node_list_p); ++ // Establish a CPU flex fudge factor, on the presumption it is OK if not ++ // quite all the CPU request is met. However, if trying to find resources ++ // for pre-placement advice request, do not underestimate the amount of ++ // CPUs needed. Instead, err on the side of providing too many resources. ++ int cpu_flex = 0; ++ if ((pid > 0) && (target_utilization < 100)) { ++ // FIXME: Is half of the utilization margin a good amount of CPU flexing? ++ cpu_flex = ((100 - target_utilization) * tmp_node[0].CPUs_total) / 200; ++ } ++ // Figure out minimum number of nodes required ++ int mem_req_nodes = ceil((double)mbs / (double)node[0].MBs_total); ++ int cpu_req_nodes = ceil((double)(cpus - cpu_flex) / (double)node[0].CPUs_total); ++ int min_req_nodes = mem_req_nodes; ++ if (min_req_nodes < cpu_req_nodes) { ++ min_req_nodes = cpu_req_nodes; + } +- while ((mbs > 0) || (cpus > cpu_flex)) { ++ // Continue to allocate more resources until request are met. ++ while ((min_req_nodes > 0) || (mbs > 0) || ((cpus > cpu_flex) && (!assume_enough_cpus))) { + if (log_level >= LOG_DEBUG) { + numad_log(LOG_DEBUG, "MBs: %d, CPUs: %d\n", mbs, cpus); + } +@@ -1760,22 +2095,18 @@ id_list_p pick_numa_nodes(int pid, int c + // last one we used. This is not going to make progress... So + // just punt and use everything. 
+ OR_LISTS(target_node_list_p, target_node_list_p, all_nodes_list_p); +- target_using_all_nodes = 1; + break; + } + prev_node_used = tmp_node[0].node_id; + ADD_ID_TO_LIST(tmp_node[0].node_id, target_node_list_p); +- if (log_level >= LOG_DEBUG) { +- str_from_id_list(buf, BUF_SIZE, existing_mems_list_p); +- str_from_id_list(buf2, BUF_SIZE, target_node_list_p); +- numad_log(LOG_DEBUG, "Existing nodes: %s Target nodes: %s\n", buf, buf2); +- } ++ min_req_nodes -= 1; + if (EQUAL_LISTS(target_node_list_p, all_nodes_list_p)) { + // Apparently we must use all resource nodes... +- target_using_all_nodes = 1; + break; + } +-#define MBS_MARGIN 10 ++ // "Consume" the resources on this node ++#define CPUS_MARGIN 0 ++#define MBS_MARGIN 100 + if (tmp_node[0].MBs_free >= (mbs + MBS_MARGIN)) { + tmp_node[0].MBs_free -= mbs; + mbs = 0; +@@ -1783,7 +2114,6 @@ id_list_p pick_numa_nodes(int pid, int c + mbs -= (tmp_node[0].MBs_free - MBS_MARGIN); + tmp_node[0].MBs_free = MBS_MARGIN; + } +-#define CPUS_MARGIN 0 + if (tmp_node[0].CPUs_free >= (cpus + CPUS_MARGIN)) { + tmp_node[0].CPUs_free -= cpus; + cpus = 0; +@@ -1791,126 +2121,52 @@ id_list_p pick_numa_nodes(int pid, int c + cpus -= (tmp_node[0].CPUs_free - CPUS_MARGIN); + tmp_node[0].CPUs_free = CPUS_MARGIN; + } +- tmp_node[0].magnitude = tmp_node[0].CPUs_free * tmp_node[0].MBs_free; ++ tmp_node[0].magnitude = combined_value_of_weighted_resources(0, mbs, cpus, tmp_node[0].MBs_free, tmp_node[0].CPUs_free); + } +- // If this existing process is already located where we want it, and almost +- // all memory is already moved to those nodes, then return NULL indicating +- // no need to change binding this time. +- if ((pid > 0) && (EQUAL_LISTS(target_node_list_p, existing_mems_list_p))) { +- // May not need to change binding. However, if there is any significant +- // memory still on non-target nodes, advise the bind anyway because +- // there are some scenarios when the kernel will not move it all the +- // first time. 
+- if (!target_using_all_nodes) { +- p->dup_bind_count += 1; +- for (int ix = 0; (ix < num_nodes); ix++) { +- if ((process_MBs[ix] > 10) && (!ID_IS_IN_LIST(ix, target_node_list_p))) { +- goto try_memory_move_again; +- } +- } +- // We will accept these memory locations. Stamp it as done. +- p->bind_time_stamp = get_time_stamp(); +- } +- // Skip rebinding either because practically all memory is in the +- // target nodes, or because we are stuck using all the nodes. ++ ++ // If this existing process is already located where we want it, then just ++ // return NULL indicating no need to change binding this time. ++ if ((pid > 0) && (p->bind_time_stamp) && (EQUAL_LISTS(target_node_list_p, p->node_list_p))) { + if (log_level >= LOG_DEBUG) { +- numad_log(LOG_DEBUG, "Skipping evaluation because memory is reasonably situated.\n"); ++ numad_log(LOG_DEBUG, "Process %d already bound to target nodes.\n", p->pid); + } ++ p->bind_time_stamp = get_time_stamp(); + return NULL; +- } else { +- // Either a non-existing process, or a new binding for an existing process. +- if (p != NULL) { +- // Must be a new binding for an existing process, so reset dup_bind_count. +- p->dup_bind_count = 0; +- } +- } +- // See if this proposed move will make a significant difference. +- // If not, return null instead of advising the move. +- uint64_t target_magnitude = 0; +- uint64_t existing_magnitude = 0; +- int num_target_nodes = NUM_IDS_IN_LIST(target_node_list_p); +- int num_existing_nodes = NUM_IDS_IN_LIST(existing_mems_list_p); +- /* FIXME: this expansion seems to cause excessive growth +- * So calculate the improvement before hastily expanding nodes. 
+- if (num_target_nodes > num_existing_nodes) { goto try_memory_move_again; } +- */ +- int node_id = 0; +- int n = num_existing_nodes + num_target_nodes; +- while (n) { +- if (ID_IS_IN_LIST(node_id, target_node_list_p)) { +- target_magnitude += saved_magnitude_for_node[node_id]; +- n -= 1; +- } +- if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) { +- existing_magnitude += saved_magnitude_for_node[node_id]; +- n -= 1; +- } +- node_id += 1; +- } +- if (existing_magnitude > 0) { +- uint64_t magnitude_change = ((target_magnitude - existing_magnitude) * 100) / existing_magnitude; +- if (magnitude_change < 0) { +- magnitude_change = -(magnitude_change); +- } +- if (magnitude_change <= IMPROVEMENT_THRESHOLD_PERCENT) { +- // Not significant enough percentage change to do rebind +- if (log_level >= LOG_DEBUG) { +- str_from_id_list(buf, BUF_SIZE, existing_mems_list_p); +- str_from_id_list(buf2, BUF_SIZE, target_node_list_p); +- numad_log(LOG_DEBUG, "Moving pid %d from nodes (%s) to nodes (%s) skipped as insignificant improvement: %ld percent.\n", +- pid, buf, buf2, magnitude_change); +- } +- // We decided this is almost good enough. Stamp it as done. +- p->bind_time_stamp = get_time_stamp(); +- return NULL; +- } + } +- if ((pid <= 0) && (num_target_nodes <= 0)) { +- // Always provide at least one node for pre-placement advice ++ // Must always provide at least one node for pre-placement advice ++ // FIXME: verify this can happen only if no resources requested... 
++ if ((pid <= 0) && (NUM_IDS_IN_LIST(target_node_list_p) <= 0)) { + ADD_ID_TO_LIST(node[0].node_id, target_node_list_p); + } +-try_memory_move_again: +- str_from_id_list(buf, BUF_SIZE, existing_mems_list_p); ++ // Log advice, and return target node list ++ if ((pid > 0) && (p->bind_time_stamp)) { ++ str_from_id_list(buf, BUF_SIZE, p->node_list_p); ++ } else { ++ str_from_id_list(buf, BUF_SIZE, all_nodes_list_p); ++ } ++ char buf2[BUF_SIZE]; + str_from_id_list(buf2, BUF_SIZE, target_node_list_p); + char *cmd_name = "(unknown)"; + if ((p) && (p->comm)) { + cmd_name = p->comm; + } + numad_log(LOG_NOTICE, "Advising pid %d %s move from nodes (%s) to nodes (%s)\n", pid, cmd_name, buf, buf2); ++ ++ if (pid > 0) { ++ // FIXME: Consider moving this out to caller?? ++ COPY_LIST(target_node_list_p, p->node_list_p); ++ } + return target_node_list_p; + } + + + +-void show_processes(process_data_p *ptr, int nprocs) { +- time_t ts = time(NULL); +- fprintf(log_fs, "%s", ctime(&ts)); +- fprintf(log_fs, "Candidates: %d\n", nprocs); +- for (int ix = 0; (ix < nprocs); ix++) { +- process_data_p p = ptr[ix]; +- char buf[BUF_SIZE]; +- snprintf(buf, BUF_SIZE, "%s%s/cpuset.mems", cpuset_dir, p->cpuset_name); +- FILE *fs = fopen(buf, "r"); +- buf[0] = '\0'; +- if (fs) { +- if (fgets(buf, BUF_SIZE, fs)) { +- ELIM_NEW_LINE(buf); +- } +- fclose(fs); +- } +- fprintf(log_fs, "%ld: PID %d: %s, Threads %2ld, MBs_used %6ld, CPUs_used %4ld, Magnitude %6ld, Nodes: %s\n", +- p->data_time_stamp, p->pid, p->comm, p->num_threads, p->MBs_used, p->CPUs_used, p->MBs_used * p->CPUs_used, buf); +- } +- fprintf(log_fs, "\n"); +- fflush(log_fs); +-} +- +- + + int manage_loads() { ++ uint64_t time_stamp = get_time_stamp(); + // Use temporary index to access and sort hash table entries +- static process_data_p *pindex; + static int pindex_size; ++ static process_data_p *pindex; + if (pindex_size < process_hash_table_size) { + pindex_size = process_hash_table_size; + pindex = realloc(pindex, pindex_size * 
sizeof(process_data_p)); +@@ -1923,34 +2179,69 @@ int manage_loads() { + return min_interval / 2; + } + memset(pindex, 0, pindex_size * sizeof(process_data_p)); +- // Copy live candidate pointers to the index for sorting, etc ++ // Copy live candidate pointers to the index for sorting ++ // if they meet the threshold for memory usage and CPU usage. + int nprocs = 0; ++ long sum_CPUs_used = 0; + for (int ix = 0; (ix < process_hash_table_size); ix++) { + process_data_p p = &process_hash_table[ix]; +- if (p->pid) { ++ if ((p->pid) && (p->CPUs_used * p->MBs_used > CPU_THRESHOLD * MEMORY_THRESHOLD)) { + pindex[nprocs++] = p; ++ sum_CPUs_used += p->CPUs_used; ++ // Initialize node list, if not already done for this process. ++ if (p->node_list_p == NULL) { ++ initialize_mem_node_list(p); ++ } + } + } +- // Sort index by amount of CPU used * amount of memory used. Not expecting +- // a long list here. Use a simple sort -- however, sort into bins, +- // treating values within 10% as aquivalent. Within bins, order by +- // bind_time_stamp so oldest bound will be higher priority to evaluate. ++ // Order candidate considerations using timestamps and magnitude: amount of ++ // CPU used * amount of memory used. Not expecting a long list here. Use ++ // a simplistic sort -- however move all not yet bound to front of list and ++ // order by decreasing magnitude. Previously bound processes follow in ++ // bins of increasing magnitude treating values within 20% as aquivalent. ++ // Within bins, order by bind_time_stamp so oldest bound will be higher ++ // priority to evaluate. Start by moving all unbound to beginning. 
++ int num_unbound = 0; + for (int ij = 0; (ij < nprocs); ij++) { ++ if (pindex[ij]->bind_time_stamp == 0) { ++ process_data_p tmp = pindex[num_unbound]; ++ pindex[num_unbound++] = pindex[ij]; ++ pindex[ij] = tmp; ++ } ++ } ++ // Sort all unbound so biggest magnitude comes first ++ for (int ij = 0; (ij < num_unbound); ij++) { ++ int best = ij; ++ for (int ik = ij + 1; (ik < num_unbound); ik++) { ++ uint64_t ik_mag = (pindex[ ik]->CPUs_used * pindex[ ik]->MBs_size); ++ uint64_t best_mag = (pindex[best]->CPUs_used * pindex[best]->MBs_size); ++ if (ik_mag <= best_mag) continue; ++ best = ik; ++ } ++ if (best != ij) { ++ process_data_p tmp = pindex[ij]; ++ pindex[ij] = pindex[best]; ++ pindex[best] = tmp; ++ } ++ } ++ // Sort the remaining candidates into bins of increasting magnitude, and by ++ // timestamp within bins. ++ for (int ij = num_unbound; (ij < nprocs); ij++) { + int best = ij; + for (int ik = ij + 1; (ik < nprocs); ik++) { +- uint64_t ik_mag = (pindex[ ik]->CPUs_used * pindex[ ik]->MBs_used); +- uint64_t best_mag = (pindex[best]->CPUs_used * pindex[best]->MBs_used); ++ uint64_t ik_mag = (pindex[ ik]->CPUs_used * pindex[ ik]->MBs_size); ++ uint64_t best_mag = (pindex[best]->CPUs_used * pindex[best]->MBs_size); + uint64_t min_mag = ik_mag; + uint64_t diff_mag = best_mag - ik_mag; + if (diff_mag < 0) { + diff_mag = -(diff_mag); + min_mag = best_mag; + } +- if ((diff_mag > 0) && (min_mag / diff_mag < 10)) { +- // difference > 10 percent. Use strict ordering +- if (ik_mag <= best_mag) continue; ++ if ((diff_mag > 0) && (min_mag / diff_mag < 5)) { ++ // difference > 20 percent. Use magnitude ordering ++ if (ik_mag >= best_mag) continue; + } else { +- // difference within 10 percent. Sort these by bind_time_stamp. ++ // difference within 20 percent. Sort these by bind_time_stamp. 
+ if (pindex[ik]->bind_time_stamp > pindex[best]->bind_time_stamp) continue; + } + best = ik; +@@ -1961,23 +2252,69 @@ int manage_loads() { + pindex[best] = tmp; + } + } ++ // Show the candidate processes in the log file + if ((log_level >= LOG_INFO) && (nprocs > 0)) { +- show_processes(pindex, nprocs); ++ numad_log(LOG_INFO, "Candidates: %d\n", nprocs); ++ for (int ix = 0; (ix < nprocs); ix++) { ++ process_data_p p = pindex[ix]; ++ char buf[BUF_SIZE]; ++ str_from_id_list(buf, BUF_SIZE, p->node_list_p); ++ fprintf(log_fs, "%ld: PID %d: %s, Threads %2ld, MBs_size %6ld, MBs_used %6ld, CPUs_used %4ld, Magnitude %6ld, Nodes: %s\n", ++ p->data_time_stamp, p->pid, p->comm, p->num_threads, p->MBs_size, p->MBs_used, p->CPUs_used, p->MBs_used * p->CPUs_used, buf); ++ } ++ fflush(log_fs); + } +- // Estimate desired size and make resource requests for each significant process ++ // Estimate desired size (+ margin capacity) and ++ // make resource requests for each candidate process + for (int ix = 0; (ix < nprocs); ix++) { + process_data_p p = pindex[ix]; +- if (p->CPUs_used * p->MBs_used < CPU_THRESHOLD * MEMORY_THRESHOLD) { +- break; // No more significant processes worth worrying about... ++ // If this process was recently bound, enforce a three-minute minimum ++ // delay between repeated attempts to potentially move the process. ++ // FIXME: make this delay contingent on node resource equity? Or, ++ // maybe change in running averages? Perhaps detect change in averages, ++ // or look at stddev? What is a good range for the delay? Discrete or ++ // continuous? ++#define MIN_DELAY_FOR_REEVALUATION (180 * ONE_HUNDRED) ++ if (p->bind_time_stamp + MIN_DELAY_FOR_REEVALUATION > time_stamp) { ++ // Skip re-evaluation because we just did it recently. ++ if (log_level >= LOG_DEBUG) { ++ numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because done too recently.\n", p->pid); ++ } ++ continue; ++ } ++ // If this process has interleaved memory, recheck it only every 30 minutes... 
++#define MIN_DELAY_FOR_INTERLEAVE (1800 * ONE_HUNDRED) ++ if (((p->flags & PROCESS_FLAG_INTERLEAVED) > 0) ++ && (p->bind_time_stamp + MIN_DELAY_FOR_INTERLEAVE > time_stamp)) { ++ if (log_level >= LOG_DEBUG) { ++ numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because of interleaved memory.\n", p->pid); ++ } ++ continue; + } +- int mb_request = (p->MBs_used * 100) / target_utilization; +- int cpu_request = (p->CPUs_used * 100) / target_utilization; +- // Do not give a process more CPUs than it has threads! +- // FIXME: For guest VMs, should limit max to VCPU threads. Will +- // need to do something more intelligent with guest IO threads +- // when eventually considering devices and IRQs. ++ // Expand resources needed estimate using target_utilization factor. ++ // Start with the CPUs actually used (capped by number of threads) for ++ // CPUs required, but use the process virtual memory size for MBs ++ // requirement, (We previously used the RSS for MBs needed, but that ++ // caused problems with processes that had quickly expanding memory ++ // usage which also needed to cross NUMA boundaries. The downside of ++ // this choice is we might not pack processes as tightly as possible ++ // anymore. Hopefully this will be a relatively rare occurence in ++ // practice. KVM guests should not be significantly over-provisioned ++ // with memory they will never use!) ++ int mem_target_utilization = target_utilization; ++ int cpu_target_utilization = target_utilization; ++ // Cap memory utilization at 100 percent (but allow CPUs to oversubscribe) ++ if (mem_target_utilization > 100) { ++ mem_target_utilization = 100; ++ } ++ int mb_request = (p->MBs_size * 100) / mem_target_utilization; ++ int cpu_request = (p->CPUs_used * 100) / cpu_target_utilization; ++ // But do not give a process more CPUs than it has threads! 
+ int thread_limit = p->num_threads; +- // If process looks like a KVM guest, try to limit to number of vCPU threads ++ // If process looks like a KVM guest, try to limit thread count to the ++ // number of vCPU threads. FIXME: Will need to do something more ++ // intelligent than this with guest IO threads when eventually ++ // considering devices and IRQs. + if ((p->comm) && (p->comm[0] == '(') && (p->comm[1] == 'q') && (strcmp(p->comm, "(qemu-kvm)") == 0)) { + int kvm_vcpu_threads = get_num_kvm_vcpu_threads(p->pid); + if (thread_limit > kvm_vcpu_threads) { +@@ -1988,18 +2325,18 @@ int manage_loads() { + if (cpu_request > thread_limit) { + cpu_request = thread_limit; + } ++ // OK, now pick NUMA nodes for this process and bind it! + pthread_mutex_lock(&node_info_mutex); +- id_list_p node_list_p = pick_numa_nodes(p->pid, cpu_request, mb_request); +- // FIXME: ?? copy node_list_p to shorten mutex region? +- if ((node_list_p != NULL) && (bind_process_and_migrate_memory(p->pid, p->cpuset_name, node_list_p, NULL))) { +- // Shorten interval if actively moving processes ++ int assume_enough_cpus = (sum_CPUs_used <= sum_CPUs_total); ++ id_list_p node_list_p = pick_numa_nodes(p->pid, cpu_request, mb_request, assume_enough_cpus); ++ if ((node_list_p != NULL) && (bind_process_and_migrate_memory(p))) { + pthread_mutex_unlock(&node_info_mutex); +- p->bind_time_stamp = get_time_stamp(); ++ // Return minimum interval when actively moving processes + return min_interval; + } + pthread_mutex_unlock(&node_info_mutex); + } +- // Return maximum interval if no process movement ++ // Return maximum interval when no process movement + return max_interval; + } + +@@ -2013,6 +2350,18 @@ void *set_dynamic_options(void *arg) { + msg_t msg; + recv_msg(&msg); + switch (msg.body.cmd) { ++ case 'C': ++ use_inactive_file_cache = (msg.body.arg1 != 0); ++ if (use_inactive_file_cache) { ++ numad_log(LOG_NOTICE, "Counting inactive file cache as available\n"); ++ } else { ++ numad_log(LOG_NOTICE, 
"Counting inactive file cache as unavailable\n"); ++ } ++ break; ++ case 'H': ++ thp_scan_sleep_ms = msg.body.arg1; ++ set_thp_scan_sleep_ms(thp_scan_sleep_ms); ++ break; + case 'i': + min_interval = msg.body.arg1; + max_interval = msg.body.arg2; +@@ -2055,6 +2404,11 @@ void *set_dynamic_options(void *arg) { + numad_log(LOG_NOTICE, "Scanning only explicit PID list processes\n"); + } + break; ++ case 't': ++ numad_log(LOG_NOTICE, "Changing logical CPU thread percent to %d\n", msg.body.arg1); ++ htt_percent = msg.body.arg1; ++ node_info_time_stamp = 0; // to force rescan of nodes/cpus soon ++ break; + case 'u': + numad_log(LOG_NOTICE, "Changing target utilization to %d\n", msg.body.arg1); + target_utilization = msg.body.arg1; +@@ -2064,7 +2418,7 @@ void *set_dynamic_options(void *arg) { + msg.body.arg1, msg.body.arg2); + pthread_mutex_lock(&node_info_mutex); + update_nodes(); +- id_list_p node_list_p = pick_numa_nodes(-1, msg.body.arg1, msg.body.arg2); ++ id_list_p node_list_p = pick_numa_nodes(-1, msg.body.arg1, msg.body.arg2, 0); + str_from_id_list(buf, BUF_SIZE, node_list_p); + pthread_mutex_unlock(&node_info_mutex); + send_msg(msg.body.src_pid, 'w', 0, 0, buf); +@@ -2134,20 +2488,28 @@ void parse_two_arg_values(char *p, int * + + int main(int argc, char *argv[]) { + int opt; ++ int C_flag = 0; + int d_flag = 0; ++ int H_flag = 0; + int i_flag = 0; + int K_flag = 0; + int l_flag = 0; + int p_flag = 0; + int r_flag = 0; + int S_flag = 0; ++ int t_flag = 0; + int u_flag = 0; + int v_flag = 0; + int w_flag = 0; + int x_flag = 0; ++ int tmp_int = 0; + long list_pid = 0; +- while ((opt = getopt(argc, argv, "dD:hi:K:l:p:r:S:u:vVw:x:")) != -1) { ++ while ((opt = getopt(argc, argv, "C:dD:hH:i:K:l:p:r:R:S:t:u:vVw:x:")) != -1) { + switch (opt) { ++ case 'C': ++ C_flag = 1; ++ use_inactive_file_cache = (atoi(optarg) != 0); ++ break; + case 'd': + d_flag = 1; + log_level = LOG_DEBUG; +@@ -2158,6 +2520,17 @@ int main(int argc, char *argv[]) { + case 'h': + 
print_usage_and_exit(argv[0]); + break; ++ case 'H': ++ tmp_int = atoi(optarg); ++ if ((tmp_int == 0) || ((tmp_int > 9) && (tmp_int < 1000001))) { ++ // 0 means do not change the system default value ++ H_flag = 1; ++ thp_scan_sleep_ms = tmp_int; ++ } else { ++ fprintf(stderr, "THP scan_sleep_ms must be > 9 and < 1000001\n"); ++ exit(EXIT_FAILURE); ++ } ++ break; + case 'i': + i_flag = 1; + parse_two_arg_values(optarg, &min_interval, &max_interval, 1, 0); +@@ -2183,13 +2556,26 @@ int main(int argc, char *argv[]) { + include_pid_list = remove_pid_from_pid_list(include_pid_list, list_pid); + exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, list_pid); + break; ++ case 'R': ++ reserved_cpu_str = strdup(optarg); ++ break; + case 'S': + S_flag = 1; + scan_all_processes = (atoi(optarg) != 0); + break; ++ case 't': ++ tmp_int = atoi(optarg); ++ if ((tmp_int >= 0) && (tmp_int <= 100)) { ++ t_flag = 1; ++ htt_percent = tmp_int; ++ } ++ break; + case 'u': +- u_flag = 1; +- target_utilization = atoi(optarg); ++ tmp_int = atoi(optarg); ++ if ((tmp_int >= 10) && (tmp_int <= 130)) { ++ u_flag = 1; ++ target_utilization = tmp_int; ++ } + break; + case 'v': + v_flag = 1; +@@ -2234,6 +2620,12 @@ int main(int argc, char *argv[]) { + // Daemon is already running. So send dynamic options to persistant + // thread to handle requests, get the response (if any), and finish. 
+ msg_t msg; ++ if (C_flag) { ++ send_msg(daemon_pid, 'C', use_inactive_file_cache, 0, ""); ++ } ++ if (H_flag) { ++ send_msg(daemon_pid, 'H', thp_scan_sleep_ms, 0, ""); ++ } + if (i_flag) { + send_msg(daemon_pid, 'i', min_interval, max_interval, ""); + } +@@ -2252,6 +2644,9 @@ int main(int argc, char *argv[]) { + if (S_flag) { + send_msg(daemon_pid, 'S', scan_all_processes, 0, ""); + } ++ if (t_flag) { ++ send_msg(daemon_pid, 't', htt_percent, 0, ""); ++ } + if (u_flag) { + send_msg(daemon_pid, 'u', target_utilization, 0, ""); + } +@@ -2263,14 +2658,30 @@ int main(int argc, char *argv[]) { + if (x_flag) { + send_msg(daemon_pid, 'x', list_pid, 0, ""); + } +- } else if (w_flag) { +- // Get pre-placement NUMA advice without starting daemon ++ close_log_file(); ++ exit(EXIT_SUCCESS); ++ } ++ // No numad daemon running yet. ++ // First, make note of any reserved CPUs.... ++ if (reserved_cpu_str != NULL) { ++ CLEAR_CPU_LIST(reserved_cpu_mask_list_p); ++ int n = add_ids_to_list_from_str(reserved_cpu_mask_list_p, reserved_cpu_str); + char buf[BUF_SIZE]; ++ str_from_id_list(buf, BUF_SIZE, reserved_cpu_mask_list_p); ++ numad_log(LOG_NOTICE, "Reserving %d CPUs (%s) for non-numad use\n", n, buf); ++ // turn reserved list into a negated mask for later ANDing use... ++ negate_list(reserved_cpu_mask_list_p); ++ } ++ // If it is a "-w" pre-placement request, handle that without starting ++ // the daemon. Otherwise start the numad daemon. 
++ if (w_flag) { ++ // Get pre-placement NUMA advice without starting daemon + update_nodes(); + sleep(2); + update_nodes(); + numad_log(LOG_NOTICE, "Getting NUMA pre-placement advice for %d CPUs and %d MBs\n", requested_cpus, requested_mbs); +- id_list_p node_list_p = pick_numa_nodes(-1, requested_cpus, requested_mbs); ++ id_list_p node_list_p = pick_numa_nodes(-1, requested_cpus, requested_mbs, 0); ++ char buf[BUF_SIZE]; + str_from_id_list(buf, BUF_SIZE, node_list_p); + fprintf(stdout, "%s\n", buf); + close_log_file(); +@@ -2278,6 +2689,7 @@ int main(int argc, char *argv[]) { + } else if (max_interval > 0) { + // Start the numad daemon... + check_prereqs(argv[0]); ++#if (!NO_DAEMON) + // Daemonize self... + daemon_pid = fork(); + if (daemon_pid < 0) { numad_log(LOG_CRIT, "fork() failed\n"); exit(EXIT_FAILURE); } +@@ -2298,9 +2710,21 @@ int main(int argc, char *argv[]) { + if (log_fs != stderr) { + fclose(stderr); + } ++#endif ++ // Set up signal handlers ++ struct sigaction sa; ++ memset(&sa, 0, sizeof(sa)); ++ sa.sa_handler = sig_handler; ++ if (sigaction(SIGHUP, &sa, NULL) ++ || sigaction(SIGTERM, &sa, NULL) ++ || sigaction(SIGQUIT, &sa, NULL)) { ++ numad_log(LOG_CRIT, "sigaction does not work?\n"); ++ exit(EXIT_FAILURE); ++ } + // Allocate initial process hash table + process_hash_table_expand(); +- // Spawn thread to handle messages from subsequent invocation requests ++ // Spawn a thread to handle messages from subsequent invocation requests ++ // and also a lazy background thread to clean up obsolete cpusets. 
+ pthread_mutex_init(&pid_list_mutex, NULL); + pthread_mutex_init(&node_info_mutex, NULL); + pthread_attr_t attr; +@@ -2310,7 +2734,11 @@ int main(int argc, char *argv[]) { + } + pthread_t tid; + if (pthread_create(&tid, &attr, &set_dynamic_options, &tid) != 0) { +- numad_log(LOG_CRIT, "pthread_create failure\n"); ++ numad_log(LOG_CRIT, "pthread_create failure: setting thread\n"); ++ exit(EXIT_FAILURE); ++ } ++ if (pthread_create(&tid, &attr, &clean_obsolete_cpusets, &tid) != 0) { ++ numad_log(LOG_CRIT, "pthread_create failure: cleaning thread\n"); + exit(EXIT_FAILURE); + } + // Loop here forwever... +@@ -2324,14 +2752,20 @@ int main(int argc, char *argv[]) { + interval = manage_loads(); + } + sleep(interval); ++ if (got_sigterm | got_sigquit) { ++ shut_down_numad(); ++ } ++ if (got_sighup) { ++ got_sighup = 0; ++ close_log_file(); ++ open_log_file(); ++ } + } + if (pthread_attr_destroy(&attr) != 0) { + numad_log(LOG_WARNING, "pthread_attr_destroy failure\n"); + } + pthread_mutex_destroy(&pid_list_mutex); + pthread_mutex_destroy(&node_info_mutex); +- } else { +- shut_down_numad(); + } + exit(EXIT_SUCCESS); + } +--- numad-0.5git/numad.8 2012-12-03 15:40:40.000000000 +0100 ++++ new-rhel7/numad.8 2014-02-27 10:03:07.000000000 +0100 +@@ -8,9 +8,15 @@ management for efficient use of CPUs and + numad [\fI\-dhvV\fP] + .br + .LP ++numad [\fI\-C 0|1\fP] ++.br ++.LP + numad [\fI\-D non-standard-cgroup-mount-point\fP] + .br + .LP ++numad [\fI\-H THP_hugepage_scan_sleep_ms\fP] ++.br ++.LP + numad [\fI\-i [min_interval:]max_interval\fP] + .br + .LP +@@ -26,9 +32,15 @@ numad [\fI\-p PID\fP] + numad [\fI\-r PID\fP] + .br + .LP ++numad [\fI\-R reserved-CPU-list\fP] ++.br ++.LP + numad [\fI\-S 0|1\fP] + .br + .LP ++numad [\fI\-t logical_CPU_percent\fP] ++.br ++.LP + numad [\fI\-u target_utilization\fP] + .br + .LP +@@ -37,7 +49,6 @@ numad [\fI\-w NCPUS[:MB]\fP] + .LP + numad [\fI\-x PID\fP] + .br +- + .SH "DESCRIPTION" + .LP + Numad is a system daemon that monitors NUMA topology 
and resource usage. It +@@ -54,6 +65,13 @@ accesses will likely remain unpredictabl + performance. + .SH "OPTIONS" + .LP ++.TP ++\fB\-C\fR <\fI0|1\fP> ++This option controls whether or not numad treats inactive file cache as ++available memory. By default, numad assumes it can count inactive file cache as ++"free" memory when considering resources to match with processes. Specify ++\fI\-C 0\fP if numad should instead consider inactive file cache as a consumed ++resource. + .TP + \fB\-d\fR + Debug output in log, sets the log level to LOG_DEBUG. Same effect as \fI\-l 7\fP. +@@ -65,6 +83,16 @@ numad. This is not normally necessary. + \fB\-h\fR + Display usage help information and then exit. + .TP ++\fB\-H\fR <\fITHP_scan_sleep_ms\fP> ++Set the desired transparent hugepage scan interval in ms. The ++/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs tunable is ++usually set to 10000ms by the operating system. The default is changed by ++numad to be 1000ms since it is helpful for the hugepage daemon to be more ++aggressive when memory moves between nodes. If you don't like numad's choice ++of 1000ms, you can make the hugepage daemon more or less aggressive by ++specifying an alternate value with this option. Setting this value to 100ms ++might improve some workloads which use many transparent hugepages. ++.TP + \fB\-i\fR <\fI[min_interval:]max_interval\fP> + Sets the time interval that numad waits between system scans, in seconds to + <\fImax_interval\fP>. Default <\fImax_interval\fP> is 15 seconds, default +@@ -85,7 +113,9 @@ large in-memory database), you might get + .TP + \fB\-l\fR <\fIlog_level\fP> + Sets the log level to <\fIlog_level\fP>. Reasonable choices are 5, 6, or 7. +-The default value is 5. ++The default value is 5. Note that CPU values are scaled by a factor of 100 ++internally and in the numad log files. Unfortunately, you don't actually have ++that many CPUs.
+ .TP + \fB\-p\fR <\fIPID\fP> + Add PID to explicit inclusion list of processes to consider for managing, if +@@ -102,6 +132,12 @@ processes. After daemon start, only one + process lists per subsequent numad invocation. Use with \-S and \-p and \-x to + precisely control the scope of processes numad can manage. + .TP ++\fB\-R\fR <\fICPU_LIST\fP> ++Specify a list of CPUs that numad should assume are reserved for non-numad use. ++No processes will be bound to the specified CPUs by numad. This option is ++effective only when starting numad. You cannot change reserved CPUs ++dynamically while numad is already running. ++.TP + \fB\-S\fR <\fI0|1\fP> + This option controls whether numad scans all system processes or only the + processes on the explicit inclusion PID list. The default is to scan all +@@ -114,10 +150,19 @@ exclusion list). Starting numad as + will limit scanning, and thus also automatic NUMA management, to only those + three explicitly specified processes. + .TP ++\fB\-t\fR <\fIlogical_CPU_percent\fP> ++Determine the resource value of logical CPUs. Hardware threads typically share ++most core resources, and so add only a fraction of CPU power for many ++workloads. By default numad considers logical CPUs to be only 20 percent of a ++dedicated core. ++.TP + \fB\-u\fR <\fItarget_utilization\fP> + Set the desired maximum consumption percentage of a node. Default is 85%. + Decrease the target value to maintain more available resource margin on each + node. Increase the target value to more exhaustively consume node resources. ++It is possible to specify values up to 130 percent, to oversubscribe CPUs in ++the nodes, but memory utilization is capped at 100%. Use oversubscription ++values carefully. + .TP + \fB\-v\fR + Verbose output in log, sets the log level to LOG_INFO. Same effect as \fI\-l 6\fP. +@@ -159,18 +204,21 @@ numad can manage. + None. 
+ .SH "EXAMPLES" + .LP +-Numad is normally run as a system daemon and should be managed by the ++Numad can be run as a system daemon and can be managed by the + standard init mechanisms of the host. + .LP + If interactive (manual) control is desired, you can start the daemon manually by typing: + .LP + /usr/bin/numad + .LP +-Subsequent numad invocations while the daemon is running can be used to dynamically change run-time options. ++Subsequent numad invocations while the daemon is running can be used to dynamically change most run-time options. ++.LP ++You can terminate numad from running by typing: ++.LP ++/usr/bin/numad -i0 + .SH "AUTHORS" + .LP + Bill Gray + .SH "SEE ALSO" + .LP + numactl(8) +- diff --git a/SOURCES/numad-0.5git-version.patch b/SOURCES/numad-0.5git-version.patch new file mode 100644 index 00000000..27d36d7d --- /dev/null +++ b/SOURCES/numad-0.5git-version.patch @@ -0,0 +1,2639 @@ +diff -rup numad-0.5git/numad.8 numad-0.5git-new/numad.8 +--- numad-0.5git/numad.8 2012-12-03 15:40:40.000000000 +0100 ++++ numad-0.5git-new/numad.8 2016-08-30 08:45:19.000000000 +0200 +@@ -1,45 +1,56 @@ + .TH "numad" "8" "1.0.0" "Bill Gray" "Administration" +-.SH "numad" +-.LP ++.SH "NAME" ++.LP + numad \- A user\-level daemon that provides placement advice and process + management for efficient use of CPUs and memory on systems with NUMA topology. 
+-.SH "SYNTAX" +-.LP ++.SH "SYNOPSIS" ++.LP + numad [\fI\-dhvV\fP] +-.br +-.LP +-numad [\fI\-D non-standard-cgroup-mount-point\fP] +-.br +-.LP ++.br ++.LP ++numad [\fI\-C 0|1\fP] ++.br ++.LP ++numad [\fI\-H THP_hugepage_scan_sleep_ms\fP] ++.br ++.LP + numad [\fI\-i [min_interval:]max_interval\fP] +-.br +-.LP ++.br ++.LP + numad [\fI\-K 0|1\fP] +-.br +-.LP ++.br ++.LP + numad [\fI\-l log_level\fP] +-.br +-.LP ++.br ++.LP ++numad [\fI\-m target_memory_locality\fP] ++.br ++.LP + numad [\fI\-p PID\fP] +-.br +-.LP ++.br ++.LP + numad [\fI\-r PID\fP] +-.br +-.LP ++.br ++.LP ++numad [\fI\-R reserved-CPU-list\fP] ++.br ++.LP + numad [\fI\-S 0|1\fP] +-.br +-.LP ++.br ++.LP ++numad [\fI\-t logical_CPU_percent\fP] ++.br ++.LP + numad [\fI\-u target_utilization\fP] +-.br +-.LP ++.br ++.LP + numad [\fI\-w NCPUS[:MB]\fP] +-.br +-.LP ++.br ++.LP + numad [\fI\-x PID\fP] +-.br +- ++.br + .SH "DESCRIPTION" +-.LP ++.LP + Numad is a system daemon that monitors NUMA topology and resource usage. It + will attempt to locate processes for efficient NUMA locality and affinity, + dynamically adjusting to changing system conditions. Numad also provides +@@ -53,25 +64,42 @@ large in-memory database application, fo + accesses will likely remain unpredictable -- numad will probably not improve + performance. + .SH "OPTIONS" +-.LP +-.TP ++.LP ++.TP ++\fB\-C\fR <\fI0|1\fP> ++This option controls whether or not numad treats inactive file cache as ++available memory. By default, numad assumes it can count inactive file cache as ++"free" memory when considering resources to match with processes. Specify ++\fI\-C 0\fP if numad should instead consider inactive file cache as a consumed ++resource. ++.TP + \fB\-d\fR + Debug output in log, sets the log level to LOG_DEBUG. Same effect as \fI\-l 7\fP. + .TP +-\fB\-D\fR <\fInon-standard-cgroup-mount-point\fP> +-This option can be used to communicate a non-standard cgroup mount point to +-numad. This is not normally necessary. 
+-.TP + \fB\-h\fR + Display usage help information and then exit. +-.TP ++.TP ++\fB\-H\fR <\fITHP_scan_sleep_ms\fP> ++Set the desired transparent hugepage scan interval in ms. The ++.na ++/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs ++.ad ++tunable is usually set to 10000ms by the operating system. The default is ++changed by numad to be 1000ms since it is helpful for the hugepage daemon to be ++more aggressive when memory moves between nodes. Specifying (\fI\-H 0\fP) will ++cause numad to retain the system default value. You can also make the hugepage ++daemon more or less aggressive by specifying an alternate value with this ++option. For example, setting this value to 100ms (\fI\-H 100\fP) might improve ++the performance of workloads which use many transparent hugepages. ++.TP + \fB\-i\fR <\fI[min_interval:]max_interval\fP> + Sets the time interval that numad waits between system scans, in seconds to + <\fImax_interval\fP>. Default <\fImax_interval\fP> is 15 seconds, default + <\fImin_interval\fP> is 5 seconds. Setting a <\fImax_interval\fP> of zero will + cause the daemon to exit. (This is the normal mechanism to terminate the + daemon.) A bigger <\fImax_interval\fP> will decrease numad overhead but also +-decrease responsiveness to changing loads. ++decrease responsiveness to changing loads. The default numad max_interval can ++be changed in the numad.conf file. + .TP + \fB\-K\fR <\fI0|1\fP> + This option controls whether numad keeps interleaved memory spread across NUMA +@@ -82,10 +110,24 @@ a large, single-instance application tha + the workload will have continuous unpredictable memory access patterns (e.g. a + large in-memory database), you might get better results by specifying \fI\-K + 1\fP to instruct numad to keep interleaved memory distributed. +-.TP ++.TP + \fB\-l\fR <\fIlog_level\fP> + Sets the log level to <\fIlog_level\fP>. Reasonable choices are 5, 6, or 7. +-The default value is 5. ++The default value is 5.
Note that CPU values are scaled by a factor of 100 ++internally and in the numad log files. Unfortunately, you don't actually have ++that many CPUs. ++.TP ++\fB\-m\fR <\fItarget_memory_locality\fP> ++Set the desired memory locality threshold to stop moving process memory. Numad ++might stop retrying to coalesce process memory when more than this percentage ++of the process's memory is already localized in the target node(s). The ++default is 90%. Numad will frequently localize more than the localization ++threshold percent, but it will not necessarily do so. Decrease the threshold ++to allow numad to leave more process memory distributed on various nodes. ++Increase the threshold to instruct numad to try to localize more memory. ++Acceptable values are between 50 and 100 percent. Note that setting the target ++memory locality to 100% might cause numad to continually retry to move memory ++that the kernel will never successfully move. + .TP + \fB\-p\fR <\fIPID\fP> + Add PID to explicit inclusion list of processes to consider for managing, if +@@ -102,6 +144,12 @@ processes. After daemon start, only one + process lists per subsequent numad invocation. Use with \-S and \-p and \-x to + precisely control the scope of processes numad can manage. + .TP ++\fB\-R\fR <\fICPU_LIST\fP> ++Specify a list of CPUs that numad should assume are reserved for non-numad use. ++No processes will be bound to the specified CPUs by numad. This option is ++effective only when starting numad. You cannot change reserved CPUs ++dynamically while numad is already running. ++.TP + \fB\-S\fR <\fI0|1\fP> + This option controls whether numad scans all system processes or only the + processes on the explicit inclusion PID list. The default is to scan all +@@ -113,18 +161,30 @@ exclusion list). Starting numad as + .br + will limit scanning, and thus also automatic NUMA management, to only those + three explicitly specified processes.
+-.TP ++.TP ++\fB\-t\fR <\fIlogical_CPU_percent\fP> ++Specify the resource value of logical CPUs. Hardware threads typically share ++most core resources, and so logical CPUs add only a fraction of CPU power for ++many workloads. By default numad considers logical CPUs to be only 20 percent ++of a dedicated hardware core. ++.TP + \fB\-u\fR <\fItarget_utilization\fP> + Set the desired maximum consumption percentage of a node. Default is 85%. + Decrease the target value to maintain more available resource margin on each + node. Increase the target value to more exhaustively consume node resources. +-.TP ++If you have sized your workloads to precisely fit inside a NUMA node, ++specifying (\fI\-u 100\fP) might improve system performance by telling numad to ++go ahead and consume all the resources in each node. It is possible to specify ++values up to 130 percent to oversubscribe CPUs in the nodes, but memory ++utilization is always capped at 100%. Use oversubscription values very ++carefully. ++.TP + \fB\-v\fR + Verbose output in log, sets the log level to LOG_INFO. Same effect as \fI\-l 6\fP. +-.TP ++.TP + \fB\-V\fR + Display version information and exit. +-.TP ++.TP + \fB\-w\fR <\fINCPUS[:MB]\fP> + Queries numad for the best NUMA nodes to bind an entity that needs + <\fINCPUS\fP>. The amount of memory (in MBs) is optional, but should normally +@@ -145,32 +205,37 @@ Add PID to explicit exclusion list of pr + Multiple \fI\-x PID\fP options can be specified at daemon start, but after + daemon start, only one PID can be added to the exclusion list per subsequent + numad invocation. Use with \-S to precisely control the scope of processes +-numad can manage. ++numad can manage. + .SH "FILES" +-.LP +-\fI/usr/bin/numad\fP +-.br +-\fI/var/log/numad.log\fP +-.br +-\fI/var/run/numad.pid\fP ++.LP ++\fI/usr/bin/numad\fP ++.br ++\fI/etc/numad.conf\fP ++.br ++\fI/var/log/numad.log\fP ++.br ++\fI/var/run/numad.pid\fP + .SH "ENVIRONMENT VARIABLES" +-.LP +-.TP ++.LP ++.TP + None. 
+ .SH "EXAMPLES" +-.LP +-Numad is normally run as a system daemon and should be managed by the ++.LP ++Numad can be run as a system daemon and can be managed by the + standard init mechanisms of the host. +-.LP ++.LP + If interactive (manual) control is desired, you can start the daemon manually by typing: +-.LP ++.LP + /usr/bin/numad + .LP +-Subsequent numad invocations while the daemon is running can be used to dynamically change run-time options. ++Subsequent numad invocations while the daemon is running can be used to dynamically change most run-time options. ++.LP ++You can terminate numad from running by typing: ++.LP ++/usr/bin/numad -i0 + .SH "AUTHORS" +-.LP ++.LP + Bill Gray + .SH "SEE ALSO" +-.LP ++.LP + numactl(8) +- +diff -rup numad-0.5git/numad.c numad-0.5git-new/numad.c +--- numad-0.5git/numad.c 2012-12-03 15:40:40.000000000 +0100 ++++ numad-0.5git-new/numad.c 2016-08-30 08:45:19.000000000 +0200 +@@ -19,7 +19,7 @@ Inc., 59 Temple Place, Suite 330, Boston + */ + + +-// Compile with: gcc -O -std=gnu99 -Wall -pthread -o numad numad.c -lrt ++// Compile with: gcc -std=gnu99 -g -Wall -pthread -o numad numad.c -lrt -lm + + + #define _GNU_SOURCE +@@ -40,6 +40,10 @@ Inc., 59 Temple Place, Suite 330, Boston + #include + #include + #include ++#include ++#include ++#include ++ + #include + #include + #include +@@ -49,26 +53,16 @@ Inc., 59 Temple Place, Suite 330, Boston + #include + #include + #include +-#include +-#include +-#include ++ ++#include + + +-#define VERSION_STRING "20121130" ++#define VERSION_STRING "20150602" + + + #define VAR_RUN_FILE "/var/run/numad.pid" + #define VAR_LOG_FILE "/var/log/numad.log" + +-char *cpuset_dir = NULL; +-char *cpuset_dir_list[] = { +- NULL, +- "/sys/fs/cgroup/cpuset", +- "/cgroup/cpuset", +- NULL +-}; +- +- + #define KILOBYTE (1024) + #define MEGABYTE (1024 * 1024) + +@@ -86,14 +80,11 @@ char *cpuset_dir_list[] = { + #define MAX_INTERVAL 15 + #define CPU_THRESHOLD 50 + #define MEMORY_THRESHOLD 300 +-#define 
TARGET_UTILIZATION_PERCENT 85 +-#define IMPROVEMENT_THRESHOLD_PERCENT 5 +- ++#define DEFAULT_HTT_PERCENT 20 ++#define DEFAULT_THP_SCAN_SLEEP_MS 1000 ++#define DEFAULT_UTILIZATION_PERCENT 85 ++#define DEFAULT_MEMLOCALITY_PERCENT 90 + +-#define ELIM_NEW_LINE(s) \ +- if (s[strlen(s) - 1] == '\n') { \ +- s[strlen(s) - 1] = '\0'; \ +- } + + #define CONVERT_DIGITS_TO_NUM(p, n) \ + n = *p++ - '0'; \ +@@ -105,19 +96,36 @@ char *cpuset_dir_list[] = { + + int num_cpus = 0; + int num_nodes = 0; +-int page_size_in_bytes = 0; +-int huge_page_size_in_bytes = 0; ++int threads_per_core = 0; ++uint64_t page_size_in_bytes = 0; ++uint64_t huge_page_size_in_bytes = 0; + + int min_interval = MIN_INTERVAL; + int max_interval = MAX_INTERVAL; +-int target_utilization = TARGET_UTILIZATION_PERCENT; ++int htt_percent = DEFAULT_HTT_PERCENT; ++int thp_scan_sleep_ms = DEFAULT_THP_SCAN_SLEEP_MS; ++int target_utilization = DEFAULT_UTILIZATION_PERCENT; ++int target_memlocality = DEFAULT_MEMLOCALITY_PERCENT; + int scan_all_processes = 1; + int keep_interleaved_memory = 0; ++int use_inactive_file_cache = 1; + + pthread_mutex_t pid_list_mutex; + pthread_mutex_t node_info_mutex; ++long sum_CPUs_total = 0; + int requested_mbs = 0; + int requested_cpus = 0; ++int got_sighup = 0; ++int got_sigterm = 0; ++int got_sigquit = 0; ++ ++void sig_handler(int signum) { ++ switch (signum) { ++ case SIGHUP: got_sighup = 1; break; ++ case SIGTERM: got_sigterm = 1; break; ++ case SIGQUIT: got_sigquit = 1; break; ++ } ++} + + + +@@ -139,7 +147,7 @@ void numad_log(int level, const char *fm + } + char buf[BUF_SIZE]; + time_t ts = time(NULL); +- sprintf(buf, ctime(&ts)); ++ strncpy(buf, ctime(&ts), sizeof(buf)); + char *p = &buf[strlen(buf) - 1]; + *p++ = ':'; + *p++ = ' '; +@@ -155,13 +163,16 @@ void open_log_file() { + log_fs = fopen(VAR_LOG_FILE, "a"); + if (log_fs == NULL) { + log_fs = stderr; +- numad_log(LOG_ERR, "Cannot open numad log file -- using stderr\n"); ++ numad_log(LOG_ERR, "Cannot open numad log file 
(errno: %d) -- using stderr\n", errno); + } + } + ++ + void close_log_file() { + if (log_fs != NULL) { +- fclose(log_fs); ++ if (log_fs != stderr) { ++ fclose(log_fs); ++ } + log_fs = NULL; + } + } +@@ -235,23 +246,32 @@ void send_msg(long dst_pid, long cmd, lo + + + typedef struct id_list { +- // Use CPU_SET(3) cpuset bitmasks, ++ // Use CPU_SET(3) bitmasks, + // but bundle size and pointer together + // and genericize for both CPU and Node IDs + cpu_set_t *set_p; + size_t bytes; + } id_list_t, *id_list_p; + +-#define INIT_ID_LIST(list_p) \ ++#define ID_LIST_SET_P(list_p) (list_p->set_p) ++#define ID_LIST_BYTES(list_p) (list_p->bytes) ++ ++#define INIT_ID_LIST(list_p, num_elements) \ + list_p = malloc(sizeof(id_list_t)); \ + if (list_p == NULL) { numad_log(LOG_CRIT, "INIT_ID_LIST malloc failed\n"); exit(EXIT_FAILURE); } \ +- list_p->set_p = CPU_ALLOC(num_cpus); \ ++ list_p->set_p = CPU_ALLOC(num_elements); \ + if (list_p->set_p == NULL) { numad_log(LOG_CRIT, "CPU_ALLOC failed\n"); exit(EXIT_FAILURE); } \ +- list_p->bytes = CPU_ALLOC_SIZE(num_cpus); ++ list_p->bytes = CPU_ALLOC_SIZE(num_elements); + +-#define CLEAR_LIST(list_p) \ ++#define CLEAR_CPU_LIST(list_p) \ + if (list_p == NULL) { \ +- INIT_ID_LIST(list_p); \ ++ INIT_ID_LIST(list_p, num_cpus); \ ++ } \ ++ CPU_ZERO_S(list_p->bytes, list_p->set_p) ++ ++#define CLEAR_NODE_LIST(list_p) \ ++ if (list_p == NULL) { \ ++ INIT_ID_LIST(list_p, num_nodes); \ + } \ + CPU_ZERO_S(list_p->bytes, list_p->set_p) + +@@ -262,6 +282,9 @@ typedef struct id_list { + list_p = NULL; \ + } + ++#define COPY_LIST(orig_list_p, copy_list_p) \ ++ memcpy(copy_list_p->set_p, orig_list_p->set_p, orig_list_p->bytes) ++ + #define NUM_IDS_IN_LIST(list_p) CPU_COUNT_S(list_p->bytes, list_p->set_p) + #define ADD_ID_TO_LIST(k, list_p) CPU_SET_S(k, list_p->bytes, list_p->set_p) + #define CLR_ID_IN_LIST(k, list_p) CPU_CLR_S(k, list_p->bytes, list_p->set_p) +@@ -272,6 +295,25 @@ typedef struct id_list { + #define OR_LISTS( or_list_p, list_1_p, 
list_2_p) CPU_OR_S( or_list_p->bytes, or_list_p->set_p, list_1_p->set_p, list_2_p->set_p) + #define XOR_LISTS(xor_list_p, list_1_p, list_2_p) CPU_XOR_S(xor_list_p->bytes, xor_list_p->set_p, list_1_p->set_p, list_2_p->set_p) + ++int negate_cpu_list(id_list_p list_p) { ++ if (list_p == NULL) { ++ numad_log(LOG_CRIT, "Cannot negate a NULL list\n"); ++ exit(EXIT_FAILURE); ++ } ++ if (num_cpus < 1) { ++ numad_log(LOG_CRIT, "No CPUs to negate in list!\n"); ++ exit(EXIT_FAILURE); ++ } ++ for (int ix = 0; (ix < num_cpus); ix++) { ++ if (ID_IS_IN_LIST(ix, list_p)) { ++ CLR_ID_IN_LIST(ix, list_p); ++ } else { ++ ADD_ID_TO_LIST(ix, list_p); ++ } ++ } ++ return NUM_IDS_IN_LIST(list_p); ++} ++ + int add_ids_to_list_from_str(id_list_p list_p, char *s) { + if (list_p == NULL) { + numad_log(LOG_CRIT, "Cannot add to NULL list\n"); +@@ -352,9 +394,21 @@ typedef struct node_data { + uint8_t *distance; + id_list_p cpu_list_p; + } node_data_t, *node_data_p; +- + node_data_p node = NULL; + ++int min_node_CPUs_free_ix = -1; ++int min_node_MBs_free_ix = -1; ++long min_node_CPUs_free = MAXINT; ++long min_node_MBs_free = MAXINT; ++long max_node_CPUs_free = 0; ++long max_node_MBs_free = 0; ++long avg_node_CPUs_free = 0; ++long avg_node_MBs_free = 0; ++double stddev_node_CPUs_free = 0.0; ++double stddev_node_MBs_free = 0.0; ++ ++ ++ + // RING_BUF_SIZE must be a power of two + #define RING_BUF_SIZE 8 + +@@ -366,14 +420,15 @@ typedef struct process_data { + uint64_t data_time_stamp; // hundredths of seconds + uint64_t bind_time_stamp; + uint64_t num_threads; ++ uint64_t MBs_size; + uint64_t MBs_used; + uint64_t cpu_util; + uint64_t CPUs_used; // scaled * ONE_HUNDRED + uint64_t CPUs_used_ring_buf[RING_BUF_SIZE]; + int ring_buf_ix; +- int dup_bind_count; + char *comm; +- char *cpuset_name; ++ id_list_p node_list_p; ++ uint64_t *process_MBs; + } process_data_t, *process_data_p; + + +@@ -433,7 +488,8 @@ int process_hash_insert(int pid) { + } + + int process_hash_update(process_data_p newp) { +- // 
This updates hash table stats for processes we are monitoring ++ // This updates hash table stats for processes we are monitoring. Only the ++ // scalar resource consumption stats need to be updated here. + int new_hash_table_entry = 1; + int ix = process_hash_insert(newp->pid); + if (ix >= 0) { +@@ -460,6 +516,7 @@ int process_hash_update(process_data_p n + } + p->comm = strdup(newp->comm); + } ++ p->MBs_size = newp->MBs_size; + p->MBs_used = newp->MBs_used; + p->cpu_util = newp->cpu_util; + p->num_threads = newp->num_threads; +@@ -468,6 +525,11 @@ int process_hash_update(process_data_p n + return new_hash_table_entry; + } + ++void process_hash_clear_all_bind_time_stamps() { ++ for (int ix = 0; (ix < process_hash_table_size); ix++) { ++ process_hash_table[ix].bind_time_stamp = 0; ++ } ++} + + int process_hash_rehash(int old_ix) { + // Given the index of a table entry that would otherwise be orphaned by +@@ -489,7 +551,8 @@ int process_hash_remove(int pid) { + // remove the target + process_data_p dp = &process_hash_table[ix]; + if (dp->comm) { free(dp->comm); } +- if (dp->cpuset_name) { free(dp->cpuset_name); } ++ if (dp->process_MBs) { free(dp->process_MBs); } ++ FREE_LIST(dp->node_list_p); + memset(dp, 0, sizeof(process_data_t)); + // bubble up the collision chain and rehash if neeeded + for (;;) { +@@ -543,15 +606,15 @@ void process_hash_table_dump() { + process_data_p p = &process_hash_table[ix]; + if (p->pid) { + numad_log(LOG_DEBUG, +- "ix: %d PID: %d %s Thds: %d CPU %ld MBs: %ld Data TS: %ld Bind TS: %ld\n", ++ "ix: %d PID: %d %s Thds: %d CPU %ld MBs: %ld/%ld Data TS: %ld Bind TS: %ld\n", + ix, p->pid, ((p->comm != NULL) ? 
p->comm : "(Null)"), p->num_threads, +- p->CPUs_used, p->MBs_used, p->data_time_stamp, p->bind_time_stamp); ++ p->CPUs_used, p->MBs_used, p->MBs_size, p->data_time_stamp, p->bind_time_stamp); ++ // FIXME: make this dump every field, but this is not even currently used + } + } + } + + void process_hash_table_cleanup(uint64_t update_time) { +- int cpusets_removed = 0; + int num_hash_entries_used = 0; + for (int ix = 0; (ix < process_hash_table_size); ix++) { + process_data_p p = &process_hash_table[ix]; +@@ -562,34 +625,14 @@ void process_hash_table_cleanup(uint64_t + p->data_time_stamp = 0; + p->CPUs_used = 0; + // Check for dead pids and remove them... +- char fname[FNAME_SIZE]; +- snprintf(fname, FNAME_SIZE, "/proc/%d", p->pid); +- if (access(fname, F_OK) < 0) { +- // Seems dead. Forget this pid -- after first checking +- // and removing obsolete numad.PID cpuset directories. +- snprintf(fname, FNAME_SIZE, "%s/numad.%d", cpuset_dir, p->pid); +- if (access(fname, F_OK) == 0) { +- numad_log(LOG_NOTICE, "Removing obsolete cpuset: %s\n", fname); +- int rc = rmdir(fname); +- if (rc >= 0) { +- cpusets_removed += 1; +- } else { +- numad_log(LOG_ERR, "bad cpuset rmdir\n"); +- // exit(EXIT_FAILURE); +- } +- } ++ if ((kill(p->pid, 0) == -1) && (errno == ESRCH)) { ++ // Seems dead. Forget this pid + process_hash_remove(p->pid); + num_hash_entries_used -= 1; + } + } + } + } +- if (cpusets_removed > 0) { +- // Expire all the duplicate bind counts so things will be re-evaluated sooner. 
+- for (int ix = 0; (ix < process_hash_table_size); ix++) { +- process_hash_table[ix].dup_bind_count = 0; +- } +- } + // Keep hash table approximately half empty + if ((num_hash_entries_used * 7) / 4 > process_hash_table_size) { + process_hash_table_expand(); +@@ -610,9 +653,7 @@ pid_list_p insert_pid_into_pid_list(pid_ + if (process_hash_table != NULL) { + int hash_ix = process_hash_lookup(pid); + if ((hash_ix >= 0) && (list_ptr == include_pid_list)) { +- // Clear dup_bind_count and interleaved flag, +- // in case user wants it to be re-evaluated soon +- process_hash_table[hash_ix].dup_bind_count = 0; ++ // Clear interleaved flag, in case user wants it to be re-evaluated + process_hash_table[hash_ix].flags &= ~PROCESS_FLAG_INTERLEAVED; + } + } +@@ -678,18 +719,23 @@ void print_version_and_exit(char *prog_n + + void print_usage_and_exit(char *prog_name) { + fprintf(stderr, "Usage: %s ...\n", prog_name); ++ fprintf(stderr, "-C 1 to count inactive file cache as available memory (default 1)\n"); ++ fprintf(stderr, "-C 0 to count inactive file cache memory as unavailable (default 1)\n"); + fprintf(stderr, "-d for debug logging (same effect as '-l 7')\n"); +- fprintf(stderr, "-D to specify cgroup mount point\n"); + fprintf(stderr, "-h to print this usage info\n"); ++ fprintf(stderr, "-H to set THP scan_sleep_ms (default %d)\n", DEFAULT_THP_SCAN_SLEEP_MS); + fprintf(stderr, "-i [:] to specify interval seconds\n"); +- fprintf(stderr, "-K 1 to keep interleaved memory spread across nodes\n"); +- fprintf(stderr, "-K 0 to merge interleaved memory to local NUMA nodes\n"); +- fprintf(stderr, "-l to specify logging level (usually 5, 6, or 7)\n"); ++ fprintf(stderr, "-K 1 to keep interleaved memory spread across nodes (default 0)\n"); ++ fprintf(stderr, "-K 0 to merge interleaved memory to local NUMA nodes (default 0)\n"); ++ fprintf(stderr, "-l to specify logging level (usually 5, 6, or 7 -- default 5)\n"); ++ fprintf(stderr, "-m to specify memory locality target percent 
(default %d)\n", DEFAULT_MEMLOCALITY_PERCENT); + fprintf(stderr, "-p to add PID to inclusion pid list\n"); + fprintf(stderr, "-r to remove PID from explicit pid lists\n"); +- fprintf(stderr, "-S 1 to scan all processes\n"); +- fprintf(stderr, "-S 0 to scan only explicit PID list processes\n"); +- fprintf(stderr, "-u to specify target utilization percent (default 85)\n"); ++ fprintf(stderr, "-R to reserve some CPUs for non-numad use\n"); ++ fprintf(stderr, "-S 1 to scan all processes (default 1)\n"); ++ fprintf(stderr, "-S 0 to scan only explicit PID list processes (default 1)\n"); ++ fprintf(stderr, "-t to specify thread / logical CPU valuation percent (default %d)\n", DEFAULT_HTT_PERCENT); ++ fprintf(stderr, "-u to specify utilization target percent (default %d)\n", DEFAULT_UTILIZATION_PERCENT); + fprintf(stderr, "-v for verbose (same effect as '-l 6')\n"); + fprintf(stderr, "-V to show version info\n"); + fprintf(stderr, "-w [:] for NUMA node suggestions\n"); +@@ -698,62 +744,35 @@ void print_usage_and_exit(char *prog_nam + } + + +-void check_prereqs(char *prog_name) { +- // Verify cpusets are available on this system. +- char **dir = &cpuset_dir_list[0]; +- if (*dir == NULL) { dir++; } +- while (*dir != NULL) { +- cpuset_dir = *dir; +- char fname[FNAME_SIZE]; +- snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", cpuset_dir); +- if (access(fname, F_OK) == 0) { +- break; +- } +- dir++; +- } +- if (*dir == NULL) { +- fprintf(stderr, "\n"); +- fprintf(stderr, "Are CPUSETs enabled on this system?\n"); +- fprintf(stderr, "They are required for %s to function.\n\n", prog_name); +- fprintf(stderr, "Check manpage CPUSET(7). 
You might need to do something like:\n"); +- fprintf(stderr, " # mkdir \n"); +- fprintf(stderr, " # mount cgroup -t cgroup -o cpuset \n"); +- fprintf(stderr, " where is something like:\n"); +- dir = &cpuset_dir_list[0]; +- if (*dir == NULL) { dir++; } +- while (*dir != NULL) { +- fprintf(stderr, " - %s\n", *dir); +- dir++; +- } +- fprintf(stderr, "and then try again...\n"); +- fprintf(stderr, "Or, use '-D ' to specify the correct mount point\n"); +- fprintf(stderr, "\n"); +- exit(EXIT_FAILURE); ++void set_thp_scan_sleep_ms(int new_ms) { ++ if (new_ms < 1) { ++ // 0 means do not change the system default ++ return; + } +- // Check on THP scan sleep time. +- char *thp_scan_fname = "/sys/kernel/mm/redhat_transparent_hugepage/khugepaged/scan_sleep_millisecs"; +- int fd = open(thp_scan_fname, O_RDONLY, 0); ++ char *thp_scan_fname = "/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs"; ++ int fd = open(thp_scan_fname, O_RDWR, 0); + if (fd >= 0) { +- int ms; + char buf[BUF_SIZE]; + int bytes = read(fd, buf, BUF_SIZE); +- close(fd); + if (bytes > 0) { ++ buf[bytes] = '\0'; ++ int cur_ms; + char *p = buf; +- CONVERT_DIGITS_TO_NUM(p, ms); +- if (ms > 150) { +- fprintf(stderr, "\n"); +- numad_log(LOG_NOTICE, "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms); +- fprintf(stderr, "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms); +- fprintf(stderr, "Consider increasing the frequency of THP scanning,\n"); +- fprintf(stderr, "by echoing a smaller number (e.g. 100) to %s\n", thp_scan_fname); +- fprintf(stderr, "to more aggressively (re)construct THPs. 
For example:\n"); +- fprintf(stderr, "# echo 100 > /sys/kernel/mm/redhat_transparent_hugepage/khugepaged/scan_sleep_millisecs\n"); +- fprintf(stderr, "\n"); ++ CONVERT_DIGITS_TO_NUM(p, cur_ms); ++ if (cur_ms != new_ms) { ++ lseek(fd, 0, SEEK_SET); ++ numad_log(LOG_NOTICE, "Changing THP scan time in %s from %d to %d ms.\n", thp_scan_fname, cur_ms, new_ms); ++ sprintf(buf, "%d\n", new_ms); ++ write(fd, buf, strlen(buf)); + } + } ++ close(fd); + } +- // FIXME: ?? check for enabled ksmd, and recommend disabling ksm? ++} ++ ++void check_prereqs(char *prog_name) { ++ // Adjust kernel tunable to scan for THP more frequently... ++ set_thp_scan_sleep_ms(thp_scan_sleep_ms); + } + + +@@ -785,7 +804,6 @@ int get_daemon_pid() { + return pid; + } + +- + int register_numad_pid() { + int pid; + char buf[BUF_SIZE]; +@@ -831,6 +849,43 @@ fail_numad_run_file: + } + + ++int count_set_bits_in_hex_list_file(char *fname) { ++ int sum = 0; ++ int fd = open(fname, O_RDONLY, 0); ++ if (fd >= 0) { ++ char buf[BUF_SIZE]; ++ int bytes = read(fd, buf, BUF_SIZE); ++ close(fd); ++ for (int ix = 0; (ix < bytes); ix++) { ++ char c = tolower(buf[ix]); ++ switch (c) { ++ case '0' : sum += 0; break; ++ case '1' : sum += 1; break; ++ case '2' : sum += 1; break; ++ case '3' : sum += 2; break; ++ case '4' : sum += 1; break; ++ case '5' : sum += 2; break; ++ case '6' : sum += 2; break; ++ case '7' : sum += 3; break; ++ case '8' : sum += 1; break; ++ case '9' : sum += 2; break; ++ case 'a' : sum += 2; break; ++ case 'b' : sum += 3; break; ++ case 'c' : sum += 2; break; ++ case 'd' : sum += 3; break; ++ case 'e' : sum += 3; break; ++ case 'f' : sum += 4; break; ++ case ' ' : sum += 0; break; ++ case ',' : sum += 0; break; ++ case '\n' : sum += 0; break; ++ default : numad_log(LOG_CRIT, "Unexpected character in list\n"); exit(EXIT_FAILURE); ++ } ++ } ++ } ++ return sum; ++} ++ ++ + int get_num_cpus() { + int n1 = sysconf(_SC_NPROCESSORS_CONF); + int n2 = sysconf(_SC_NPROCESSORS_ONLN); +@@ -848,7 +903,7 @@ 
int get_num_cpus() { + int get_num_kvm_vcpu_threads(int pid) { + // Try to return the number of vCPU threads for this VM guest, + // excluding the IO threads. All failures return MAXINT. +- // FIXME: figure out some better way to do this... ++ // FIXME: someday figure out some better way to do this... + char fname[FNAME_SIZE]; + snprintf(fname, FNAME_SIZE, "/proc/%d/cmdline", pid); + int fd = open(fname, O_RDONLY, 0); +@@ -876,8 +931,8 @@ int get_num_kvm_vcpu_threads(int pid) { + } + + +-int get_huge_page_size_in_bytes() { +- int huge_page_size = 0;; ++uint64_t get_huge_page_size_in_bytes() { ++ uint64_t huge_page_size = 0;; + FILE *fs = fopen("/proc/meminfo", "r"); + if (!fs) { + numad_log(LOG_CRIT, "Can't open /proc/meminfo\n"); +@@ -890,7 +945,7 @@ int get_huge_page_size_in_bytes() { + while ((!isdigit(*p)) && (p < buf + BUF_SIZE)) { + p++; + } +- huge_page_size = atoi(p); ++ huge_page_size = atol(p); + break; + } + } +@@ -916,143 +971,134 @@ static int name_starts_with_digit(const + } + + +-int bind_process_and_migrate_memory(int pid, char *cpuset_name, id_list_p node_list_p, id_list_p cpu_list_p) { +- // Check basic parameter validity. 
+- if (pid <= 0) { ++ ++#define BITS_IN_LONG (CHAR_BIT * sizeof(unsigned long)) ++#define SET_BIT(i,a) (a)[(i) / BITS_IN_LONG] |= (1u << ((i) % BITS_IN_LONG)) ++#define TEST_BIT(i,a) (((a)[(i) / BITS_IN_LONG] & (1u << ((i) % BITS_IN_LONG))) != 0) ++#define CLEAR_BIT(i,a) (a)[(i) / BITS_IN_LONG] &= ~(1u << ((i) % BITS_IN_LONG)) ++ ++int bind_process_and_migrate_memory(process_data_p p) { ++ uint64_t t0 = get_time_stamp(); ++ // Parameter p is a pointer to an element in the hash table ++ if ((!p) || (p->pid < 1)) { + numad_log(LOG_CRIT, "Bad PID to bind\n"); + exit(EXIT_FAILURE); + } +- if ((cpuset_name == NULL) || (strlen(cpuset_name) == 0)) { +- numad_log(LOG_CRIT, "Bad cpuset name to bind\n"); +- exit(EXIT_FAILURE); +- } +- int nodes; +- if ((node_list_p == NULL) || ((nodes = NUM_IDS_IN_LIST(node_list_p)) == 0)) { +- numad_log(LOG_CRIT, "Cannot bind to unspecified node\n"); ++ if (!p->node_list_p) { ++ numad_log(LOG_CRIT, "Cannot bind to unspecified node(s)\n"); + exit(EXIT_FAILURE); + } +- // Cpu_list_p is optional and may be NULL... 
+- // Generate CPU id list from the specified node list if necessary +- if (cpu_list_p == NULL) { +- static id_list_p tmp_cpu_list_p; +- CLEAR_LIST(tmp_cpu_list_p); +- int node_id = 0; +- while (nodes) { +- if (ID_IS_IN_LIST(node_id, node_list_p)) { +- OR_LISTS(tmp_cpu_list_p, tmp_cpu_list_p, node[node_id].cpu_list_p); +- nodes -= 1; +- } +- node_id += 1; +- } +- cpu_list_p = tmp_cpu_list_p; +- } +- // Make the cpuset directory if necessary +- char cpuset_name_buf[FNAME_SIZE]; +- snprintf(cpuset_name_buf, FNAME_SIZE, "%s%s", cpuset_dir, cpuset_name); +- char *p = &cpuset_name_buf[strlen(cpuset_dir)]; +- if (!strcmp(p, "/")) { +- // Make a cpuset directory for this process +- snprintf(cpuset_name_buf, FNAME_SIZE, "%s/numad.%d", cpuset_dir, pid); +- numad_log(LOG_NOTICE, "Making new cpuset: %s\n", cpuset_name_buf); +- int rc = mkdir(cpuset_name_buf, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); +- if (rc == -1) { +- numad_log(LOG_CRIT, "Bad cpuset mkdir -- errno: %d\n", errno); +- return 0; ++ // Generate CPU list derived from target node list. ++ static id_list_p cpu_bind_list_p; ++ CLEAR_CPU_LIST(cpu_bind_list_p); ++ int nodes = NUM_IDS_IN_LIST(p->node_list_p); ++ int node_id = 0; ++ while (nodes) { ++ if (ID_IS_IN_LIST(node_id, p->node_list_p)) { ++ OR_LISTS(cpu_bind_list_p, cpu_bind_list_p, node[node_id].cpu_list_p); ++ nodes -= 1; + } ++ node_id += 1; + } +- cpuset_name = cpuset_name_buf; +- // Now that we have a cpuset for pid and a populated cpulist, +- // start the actual binding and migration. 
+- uint64_t t0 = get_time_stamp(); +- +- // Write "1" out to cpuset.memory_migrate file + char fname[FNAME_SIZE]; +- snprintf(fname, FNAME_SIZE, "%s/cpuset.memory_migrate", cpuset_name); +- int fd = open(fname, O_WRONLY | O_TRUNC, 0); +- if (fd == -1) { +- numad_log(LOG_CRIT, "Could not open cpuset.memory_migrate -- errno: %d\n", errno); +- return 0; +- } +- write(fd, "1", 1); +- close(fd); +- +- // Write node IDs out to cpuset.mems file +- char node_list_buf[BUF_SIZE]; +- snprintf(fname, FNAME_SIZE, "%s/cpuset.mems", cpuset_name); +- fd = open(fname, O_WRONLY | O_TRUNC, 0); +- if (fd == -1) { +- numad_log(LOG_CRIT, "Could not open cpuset.mems -- errno: %d\n", errno); +- return 0; +- } +- int len = str_from_id_list(node_list_buf, BUF_SIZE, node_list_p); +- write(fd, node_list_buf, len); +- close(fd); +- +- // Write CPU IDs out to cpuset.cpus file +- char cpu_list_buf[BUF_SIZE]; +- snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", cpuset_name); +- fd = open(fname, O_WRONLY | O_TRUNC, 0); +- if (fd == -1) { +- numad_log(LOG_CRIT, "Could not open cpuset.cpus -- errno: %d\n", errno); +- return 0; +- } +- len = str_from_id_list(cpu_list_buf, BUF_SIZE, cpu_list_p); +- write(fd, cpu_list_buf, len); +- close(fd); +- +- // Copy pid tasks one at a time to tasks file +- snprintf(fname, FNAME_SIZE, "%s/tasks", cpuset_name); +- fd = open(fname, O_WRONLY | O_TRUNC, 0); +- if (fd == -1) { +- numad_log(LOG_CRIT, "Could not open tasks -- errno: %d\n", errno); +- return 0; +- } +- snprintf(fname, FNAME_SIZE, "/proc/%d/task", pid); + struct dirent **namelist; +- int files = scandir(fname, &namelist, name_starts_with_digit, NULL); +- if (files < 0) { +- numad_log(LOG_WARNING, "Could not scandir task list\n"); ++ snprintf(fname, FNAME_SIZE, "/proc/%d/task", p->pid); ++ int num_tasks = scandir(fname, &namelist, name_starts_with_digit, NULL); ++ if (num_tasks <= 0) { ++ numad_log(LOG_WARNING, "Could not scandir task list for PID: %d\n", p->pid); + return 0; // Assume the process terminated + 
} +- for (int ix = 0; (ix < files); ix++) { +- // copy pid tasks, one at a time +- numad_log(LOG_NOTICE, "Including task: %s\n", namelist[ix]->d_name); +- write(fd, namelist[ix]->d_name, strlen(namelist[ix]->d_name)); +- free(namelist[ix]); ++ // Set the affinity of each task in the process... ++ for (int namelist_ix = 0; (namelist_ix < num_tasks); namelist_ix++) { ++ int tid = atoi(namelist[namelist_ix]->d_name); ++ int rc = sched_setaffinity(tid, ID_LIST_BYTES(cpu_bind_list_p), ID_LIST_SET_P(cpu_bind_list_p)); ++ if (rc < 0) { ++ // Check errno ++ if (errno == ESRCH) { ++ numad_log(LOG_WARNING, "Tried to move PID %d, TID %d, but it apparently went away.\n", p->pid, tid); ++ } ++ numad_log(LOG_ERR, "Bad sched_setaffinity() on PID %d, TID %d -- errno: %d\n", p->pid, tid, errno); ++ } ++ free(namelist[namelist_ix]); + } + free(namelist); +- close(fd); +- +- uint64_t t1 = get_time_stamp(); ++ // Now move the memory to the target nodes.... ++ static unsigned long *dest_mask; ++ static unsigned long *from_mask; ++ static int allocated_bytes_in_masks; ++ // Lie about num_nodes being one bigger because of kernel bug... ++ int num_bytes_in_masks = (1 + ((num_nodes + 1) / BITS_IN_LONG)) * sizeof(unsigned long); ++ if (allocated_bytes_in_masks < num_bytes_in_masks) { ++ allocated_bytes_in_masks = num_bytes_in_masks; ++ dest_mask = realloc(dest_mask, num_bytes_in_masks); ++ from_mask = realloc(from_mask, num_bytes_in_masks); ++ if ((dest_mask == NULL) || (from_mask == NULL)) { ++ numad_log(LOG_CRIT, "bit mask malloc failed\n"); ++ exit(EXIT_FAILURE); ++ } ++ } ++ // In an effort to put semi-balanced memory in each target node, move the ++ // contents from the source node with the max amount of memory to the ++ // destination node with the least amount of memory. Repeat until done. 
++ int prev_from_node_id = -1; ++ for (;;) { ++ int min_dest_node_id = -1; ++ int max_from_node_id = -1; ++ for (int node_ix = 0; (node_ix < num_nodes); node_ix++) { ++ node_id = node[node_ix].node_id; ++ if (ID_IS_IN_LIST(node_id, p->node_list_p)) { ++ if ((min_dest_node_id < 0) || (p->process_MBs[min_dest_node_id] >= p->process_MBs[node_id])) { ++ // The ">=" above is intentional, so we tend to move memory to higher numbered nodes ++ min_dest_node_id = node_id; ++ } ++ } else { ++ if ((max_from_node_id < 0) || (p->process_MBs[max_from_node_id] < p->process_MBs[node_id])) { ++ max_from_node_id = node_id; ++ } ++ } ++ } ++ if ((p->process_MBs[max_from_node_id] == 0) || (max_from_node_id == prev_from_node_id)) { ++ break; ++ } ++ memset(dest_mask, 0, num_bytes_in_masks); ++ memset(from_mask, 0, num_bytes_in_masks); ++ SET_BIT(max_from_node_id, from_mask); ++ SET_BIT(min_dest_node_id, dest_mask); ++ numad_log(LOG_DEBUG, "Moving memory from node: %d to node %d\n", max_from_node_id, min_dest_node_id); ++ // Lie about num_nodes being one bigger because of kernel bug... ++ int rc = syscall(__NR_migrate_pages, p->pid, num_nodes + 1, from_mask, dest_mask); ++ if (rc > 2) { ++ // rc == the number of pages that could not be moved. ++ // A couple pages not moving is probably not a problem, hence ignoring rc == 1 or 2. ++ numad_log(LOG_WARNING, "Tried to move PID %d, but %d pages would not move.\n", p->pid, rc); ++ } else if (rc < 0) { ++ // Check errno ++ if (errno == ESRCH) { ++ numad_log(LOG_WARNING, "Tried to move PID %d, but it apparently went away.\n", p->pid); ++ return 0; // Assume the process terminated ++ } ++ } ++ // Assume memory did move for current accounting purposes... 
++ p->process_MBs[min_dest_node_id] += p->process_MBs[max_from_node_id]; ++ p->process_MBs[max_from_node_id] = 0; ++ prev_from_node_id = max_from_node_id; ++ } + // Check pid still active +- snprintf(fname, FNAME_SIZE, "/proc/%d", pid); ++ snprintf(fname, FNAME_SIZE, "/proc/%d", p->pid); + if (access(fname, F_OK) < 0) { +- numad_log(LOG_WARNING, "Could not migrate pid\n"); +- return 0; // Assume the process terminated ++ numad_log(LOG_WARNING, "Could not migrate pid %d. Apparently it went away.\n", p->pid); ++ return 0; ++ } else { ++ uint64_t t1 = get_time_stamp(); ++ p->bind_time_stamp = t1; ++ char node_list_str[BUF_SIZE]; ++ str_from_id_list(node_list_str, BUF_SIZE, p->node_list_p); ++ numad_log(LOG_NOTICE, "PID %d moved to node(s) %s in %d.%d seconds\n", p->pid, node_list_str, (t1-t0)/100, (t1-t0)%100); ++ return 1; + } +- numad_log(LOG_NOTICE, "PID %d moved to node(s) %s in %d.%d seconds\n", pid, node_list_buf, (t1-t0)/100, (t1-t0)%100); +- return 1; + } + + +-void show_nodes() { +- time_t ts = time(NULL); +- fprintf(log_fs, "%s", ctime(&ts)); +- fprintf(log_fs, "Nodes: %d\n", num_nodes); +- for (int ix = 0; (ix < num_nodes); ix++) { +- fprintf(log_fs, "Node %d: MBs_total %ld, MBs_free %6ld, CPUs_total %ld, CPUs_free %4ld, Distance: ", +- ix, node[ix].MBs_total, node[ix].MBs_free, node[ix].CPUs_total, node[ix].CPUs_free); +- for (int d = 0; (d < num_nodes); d++) { +- fprintf(log_fs, "%d ", node[ix].distance[d]); +- } +- char buf[BUF_SIZE]; +- str_from_id_list(buf, BUF_SIZE, node[ix].cpu_list_p); +- fprintf(log_fs, " CPUs: %s\n", buf); +- } +- fprintf(log_fs, "\n"); +- fflush(log_fs); +-} +- + + typedef struct cpu_data { + uint64_t time_stamp; +@@ -1062,10 +1108,9 @@ typedef struct cpu_data { + cpu_data_t cpu_data_buf[2]; // Two sets, to calc deltas + int cur_cpu_data_buf = 0; + +- + void update_cpu_data() { + // Parse idle percents from CPU stats in /proc/stat cpu lines +- static FILE *fs = NULL; ++ static FILE *fs; + if (fs != NULL) { + rewind(fs); + } else 
{ +@@ -1107,14 +1152,14 @@ void update_cpu_data() { + while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; } // skip nice + while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; } // skip system + while (!isdigit(*p)) { p++; } +- uint64_t idle = *p++ - '0'; while (isdigit(*p)) { idle *= 10; idle += (*p++ - '0'); } ++ uint64_t idle; ++ CONVERT_DIGITS_TO_NUM(p, idle); + cpu_data_buf[new].idle[cpu_id] = idle; + } + } + cur_cpu_data_buf = new; + } + +- + int node_and_digits(const struct dirent *dptr) { + char *p = (char *)(dptr->d_name); + if (*p++ != 'n') return 0; +@@ -1129,10 +1174,31 @@ int node_and_digits(const struct dirent + } + + ++uint64_t node_info_time_stamp = 0; + id_list_p all_cpus_list_p = NULL; + id_list_p all_nodes_list_p = NULL; +-uint64_t node_info_time_stamp = 0; ++id_list_p reserved_cpu_mask_list_p = NULL; ++char *reserved_cpu_str = NULL; + ++void show_nodes() { ++ fprintf(log_fs, "\n"); ++ numad_log(LOG_INFO, "Nodes: %d\n", num_nodes); ++ fprintf(log_fs, "Min CPUs free: %ld, Max CPUs: %ld, Avg CPUs: %ld, StdDev: %lg\n", ++ min_node_CPUs_free, max_node_CPUs_free, avg_node_CPUs_free, stddev_node_CPUs_free); ++ fprintf(log_fs, "Min MBs free: %ld, Max MBs: %ld, Avg MBs: %ld, StdDev: %lg\n", ++ min_node_MBs_free, max_node_MBs_free, avg_node_MBs_free, stddev_node_MBs_free); ++ for (int ix = 0; (ix < num_nodes); ix++) { ++ fprintf(log_fs, "Node %d: MBs_total %ld, MBs_free %6ld, CPUs_total %ld, CPUs_free %4ld, Distance: ", ++ ix, node[ix].MBs_total, node[ix].MBs_free, node[ix].CPUs_total, node[ix].CPUs_free); ++ for (int d = 0; (d < num_nodes); d++) { ++ fprintf(log_fs, "%d ", node[ix].distance[d]); ++ } ++ char buf[BUF_SIZE]; ++ str_from_id_list(buf, BUF_SIZE, node[ix].cpu_list_p); ++ fprintf(log_fs, " CPUs: %s\n", buf); ++ } ++ fflush(log_fs); ++} + + int update_nodes() { + char fname[FNAME_SIZE]; +@@ -1141,6 +1207,7 @@ int update_nodes() { + uint64_t time_stamp = get_time_stamp(); + #define STATIC_NODE_INFO_DELAY (600 * ONE_HUNDRED) + if 
((num_nodes == 0) || (node_info_time_stamp + STATIC_NODE_INFO_DELAY < time_stamp)) { ++ node_info_time_stamp = time_stamp; + // Count directory names of the form: /sys/devices/system/node/node + struct dirent **namelist; + int num_files = scandir ("/sys/devices/system/node", &namelist, node_and_digits, NULL); +@@ -1167,8 +1234,15 @@ int update_nodes() { + } + num_nodes = num_files; + } +- CLEAR_LIST(all_cpus_list_p); +- CLEAR_LIST(all_nodes_list_p); ++ sum_CPUs_total = 0; ++ CLEAR_CPU_LIST(all_cpus_list_p); ++ CLEAR_NODE_LIST(all_nodes_list_p); ++ // Figure out how many threads per core there are (for later discounting of hyper-threads) ++ threads_per_core = count_set_bits_in_hex_list_file("/sys/devices/system/cpu/cpu0/topology/thread_siblings"); ++ if (threads_per_core < 1) { ++ numad_log(LOG_CRIT, "Could not count threads per core\n"); ++ exit(EXIT_FAILURE); ++ } + // For each "node" filename present, save in node[ix].node_id + // Note that the node id might not necessarily match the node ix. + // Also populate the cpu lists and distance vectors for this node. 
+@@ -1184,11 +1258,24 @@ int update_nodes() { + snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/cpulist", node_id); + int fd = open(fname, O_RDONLY, 0); + if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) { ++ buf[BIG_BUF_SIZE - 1] = '\0'; + // get cpulist from the cpulist string +- CLEAR_LIST(node[node_ix].cpu_list_p); ++ CLEAR_CPU_LIST(node[node_ix].cpu_list_p); + int n = add_ids_to_list_from_str(node[node_ix].cpu_list_p, buf); ++ if (reserved_cpu_str != NULL) { ++ AND_LISTS(node[node_ix].cpu_list_p, node[node_ix].cpu_list_p, reserved_cpu_mask_list_p); ++ n = NUM_IDS_IN_LIST(node[node_ix].cpu_list_p); ++ } + OR_LISTS(all_cpus_list_p, all_cpus_list_p, node[node_ix].cpu_list_p); +- node[node_ix].CPUs_total = n * ONE_HUNDRED; ++ // Calculate total CPUs, but possibly discount hyper-threads ++ if ((threads_per_core == 1) || (htt_percent >= 100)) { ++ node[node_ix].CPUs_total = n * ONE_HUNDRED; ++ } else { ++ n /= threads_per_core; ++ node[node_ix].CPUs_total = n * ONE_HUNDRED; ++ node[node_ix].CPUs_total += n * (threads_per_core - 1) * htt_percent; ++ } ++ sum_CPUs_total += node[node_ix].CPUs_total; + close(fd); + } else { + numad_log(LOG_CRIT, "Could not get node cpu list\n"); +@@ -1220,15 +1307,30 @@ int update_nodes() { + } + free(namelist); + } +- // Second, get the dynamic free memory and available CPU capacity ++ // Second, update the dynamic free memory and available CPU capacity ++ while (cpu_data_buf[cur_cpu_data_buf].time_stamp + 7 >= time_stamp) { ++ // Make sure at least 7/100 of a second has passed. ++ // Otherwise sleep for 1/10 second. 
++ struct timespec ts = { 0, 100000000 }; ++ nanosleep(&ts, &ts); ++ time_stamp = get_time_stamp(); ++ } + update_cpu_data(); ++ max_node_MBs_free = 0; ++ max_node_CPUs_free = 0; ++ min_node_MBs_free = MAXINT; ++ min_node_CPUs_free = MAXINT; ++ uint64_t sum_of_node_MBs_free = 0; ++ uint64_t sum_of_node_CPUs_free = 0; + for (int node_ix = 0; (node_ix < num_nodes); node_ix++) { + int node_id = node[node_ix].node_id; + // Get available memory info from node/meminfo file + snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/meminfo", node_id); + int fd = open(fname, O_RDONLY, 0); + if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) { ++ close(fd); + uint64_t KB; ++ buf[BIG_BUF_SIZE - 1] = '\0'; + char *p = strstr(buf, "MemTotal:"); + if (p != NULL) { + p += 9; +@@ -1238,7 +1340,11 @@ int update_nodes() { + } + while (!isdigit(*p)) { p++; } + CONVERT_DIGITS_TO_NUM(p, KB); +- node[node_ix].MBs_total = KB / KILOBYTE; ++ node[node_ix].MBs_total = (KB / KILOBYTE); ++ if (node[node_ix].MBs_total < 1) { ++ // If a node has zero memory, remove it from the all_nodes_list... 
++ CLR_ID_IN_LIST(node_id, all_nodes_list_p); ++ } + p = strstr(p, "MemFree:"); + if (p != NULL) { + p += 8; +@@ -1248,8 +1354,28 @@ int update_nodes() { + } + while (!isdigit(*p)) { p++; } + CONVERT_DIGITS_TO_NUM(p, KB); +- node[node_ix].MBs_free = KB / KILOBYTE; +- close(fd); ++ node[node_ix].MBs_free = (KB / KILOBYTE); ++ if (use_inactive_file_cache) { ++ // Add inactive file cache quantity to "free" memory ++ p = strstr(p, "Inactive(file):"); ++ if (p != NULL) { ++ p += 15; ++ } else { ++ numad_log(LOG_CRIT, "Could not get node Inactive(file)\n"); ++ exit(EXIT_FAILURE); ++ } ++ while (!isdigit(*p)) { p++; } ++ CONVERT_DIGITS_TO_NUM(p, KB); ++ node[node_ix].MBs_free += (KB / KILOBYTE); ++ } ++ sum_of_node_MBs_free += node[node_ix].MBs_free; ++ if (min_node_MBs_free > node[node_ix].MBs_free) { ++ min_node_MBs_free = node[node_ix].MBs_free; ++ min_node_MBs_free_ix = node[node_ix].node_id; ++ } ++ if (max_node_MBs_free < node[node_ix].MBs_free) { ++ max_node_MBs_free = node[node_ix].MBs_free; ++ } + } else { + numad_log(LOG_CRIT, "Could not get node meminfo\n"); + exit(EXIT_FAILURE); +@@ -1260,7 +1386,8 @@ int update_nodes() { + if (cpu_data_buf[old_cpu_data_buf].time_stamp > 0) { + uint64_t idle_ticks = 0; + int cpu = 0; +- int num_cpus_to_process = node[node_ix].CPUs_total / ONE_HUNDRED; ++ int num_lcpus = NUM_IDS_IN_LIST(node[node_ix].cpu_list_p); ++ int num_cpus_to_process = num_lcpus; + while (num_cpus_to_process) { + if (ID_IS_IN_LIST(cpu, node[node_ix].cpu_list_p)) { + idle_ticks += cpu_data_buf[cur_cpu_data_buf].idle[cpu] +@@ -1274,15 +1401,46 @@ int update_nodes() { + // printf("Node: %d CPUs: %ld time diff %ld Idle ticks %ld\n", node_id, node[node_ix].CPUs_total, time_diff, idle_ticks); + // assert(time_diff > 0); + node[node_ix].CPUs_free = (idle_ticks * ONE_HUNDRED) / time_diff; ++ // Possibly discount hyper-threads ++ if ((threads_per_core > 1) && (htt_percent < 100)) { ++ uint64_t htt_discount = (num_lcpus - (num_lcpus / threads_per_core)) * (100 - 
htt_percent); ++ if (node[node_ix].CPUs_free > htt_discount) { ++ node[node_ix].CPUs_free -= htt_discount; ++ } else { ++ node[node_ix].CPUs_free = 0; ++ } ++ } + if (node[node_ix].CPUs_free > node[node_ix].CPUs_total) { + node[node_ix].CPUs_free = node[node_ix].CPUs_total; + } ++ sum_of_node_CPUs_free += node[node_ix].CPUs_free; ++ if (min_node_CPUs_free > node[node_ix].CPUs_free) { ++ min_node_CPUs_free = node[node_ix].CPUs_free; ++ min_node_CPUs_free_ix = node[node_ix].node_id; ++ } ++ if (max_node_CPUs_free < node[node_ix].CPUs_free) { ++ max_node_CPUs_free = node[node_ix].CPUs_free; ++ } + node[node_ix].magnitude = node[node_ix].CPUs_free * node[node_ix].MBs_free; + } else { + node[node_ix].CPUs_free = 0; + node[node_ix].magnitude = 0; + } + } ++ avg_node_MBs_free = sum_of_node_MBs_free / num_nodes; ++ avg_node_CPUs_free = sum_of_node_CPUs_free / num_nodes; ++ double MBs_variance_sum = 0.0; ++ double CPUs_variance_sum = 0.0; ++ for (int node_ix = 0; (node_ix < num_nodes); node_ix++) { ++ double MBs_diff = (double)node[node_ix].MBs_free - (double)avg_node_MBs_free; ++ double CPUs_diff = (double)node[node_ix].CPUs_free - (double)avg_node_CPUs_free; ++ MBs_variance_sum += MBs_diff * MBs_diff; ++ CPUs_variance_sum += CPUs_diff * CPUs_diff; ++ } ++ double MBs_variance = MBs_variance_sum / (num_nodes); ++ double CPUs_variance = CPUs_variance_sum / (num_nodes); ++ stddev_node_MBs_free = sqrt(MBs_variance); ++ stddev_node_CPUs_free = sqrt(CPUs_variance); + if (log_level >= LOG_INFO) { + show_nodes(); + } +@@ -1316,7 +1474,7 @@ typedef struct stat_data { + int64_t num_threads; // 19 + int64_t itrealvalue; + uint64_t starttime; +- uint64_t vsize; ++ uint64_t vsize; // 22 + int64_t rss; // 23 + uint64_t rsslim; + uint64_t startcode; +@@ -1356,15 +1514,16 @@ process_data_p get_stat_data_for_pid(int + } + static char buf[BUF_SIZE]; + int bytes = read(fd, buf, BUF_SIZE); ++ close(fd); + if (bytes < 50) { + numad_log(LOG_WARNING, "Could not read stat file: %s\n", fname); + 
return NULL; + } +- close(fd); ++ uint64_t val; + char *p = buf; + static process_data_t data; + // Get PID from field 0 +- uint64_t val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); } ++ CONVERT_DIGITS_TO_NUM(p, val); + data.pid = val; + // Copy comm from field 1 + while (*p == ' ') { p++; } +@@ -1373,23 +1532,27 @@ process_data_p get_stat_data_for_pid(int + // Skip fields 2 through 12 + for (int ix = 0; (ix < 11); ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } } + // Get utime from field 13 for cpu_util +- val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); } ++ CONVERT_DIGITS_TO_NUM(p, val); + data.cpu_util = val; + // Get stime from field 14 to add on to cpu_util (which already has utime) + while (*p == ' ') { p++; } +- val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); } ++ CONVERT_DIGITS_TO_NUM(p, val); + data.cpu_util += val; + // Skip fields 15 through 18 + while (*p == ' ') { p++; } + for (int ix = 0; (ix < 4); ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } } + // Get num_threads from field 19 +- val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); } ++ CONVERT_DIGITS_TO_NUM(p, val); + data.num_threads = val; +- // Skip fields 20 through 22 ++ // Skip fields 20 through 21 + while (*p == ' ') { p++; } +- for (int ix = 0; (ix < 3); ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } } ++ for (int ix = 0; (ix < 2); ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } } ++ // Get vsize from field 22 to compute MBs_size ++ CONVERT_DIGITS_TO_NUM(p, val); ++ data.MBs_size = val / MEGABYTE; + // Get rss from field 23 to compute MBs_used +- val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); } ++ while (*p == ' ') { p++; } ++ CONVERT_DIGITS_TO_NUM(p, val); + data.MBs_used = (val * page_size_in_bytes) / MEGABYTE; + // Return pointer to data + return &data; +@@ -1471,446 +1634,409 @@ int update_processes() { + } + + 
++int initialize_mem_node_list(process_data_p p) { ++ // Parameter p is a pointer to an element in the hash table ++ if ((!p) || (p->pid < 1)) { ++ numad_log(LOG_CRIT, "Cannot initialize mem node lists with bad PID\n"); ++ exit(EXIT_FAILURE); ++ } ++ int n = 0; ++ char fname[FNAME_SIZE]; ++ char buf[BIG_BUF_SIZE]; ++ p->process_MBs = NULL; ++ CLEAR_NODE_LIST(p->node_list_p); ++ snprintf(fname, FNAME_SIZE, "/proc/%d/status", p->pid); ++ int fd = open(fname, O_RDONLY, 0); ++ if (fd < 0) { ++ numad_log(LOG_WARNING, "Tried to research PID %d, but it apparently went away.\n", p->pid); ++ return 0; // Assume the process terminated ++ } ++ int bytes = read(fd, buf, BIG_BUF_SIZE); ++ close(fd); ++ if (bytes <= 0) { ++ numad_log(LOG_WARNING, "Tried to research PID %d, but cannot read status file.\n", p->pid); ++ return 0; // Assume the process terminated ++ } else if (bytes >= BIG_BUF_SIZE) { ++ buf[BIG_BUF_SIZE - 1] = '\0'; ++ } else { ++ buf[bytes] = '\0'; ++ } ++ char *list_str_p = strstr(buf, "Mems_allowed_list:"); ++ if (!list_str_p) { ++ numad_log(LOG_CRIT, "Could not get node Mems_allowed_list\n"); ++ exit(EXIT_FAILURE); ++ } ++ list_str_p += 18; ++ while (!isdigit(*list_str_p)) { list_str_p++; } ++ n = add_ids_to_list_from_str(p->node_list_p, list_str_p); ++ if (n < num_nodes) { ++ // If process already bound to a subset of nodes when we discover it, ++ // set initial bind_time_stamp to 30 minutes ago... 
++ p->bind_time_stamp = get_time_stamp() - (1800 * ONE_HUNDRED); ++ } ++ return n; ++} + +-id_list_p pick_numa_nodes(int pid, int cpus, int mbs) { +- char buf[BUF_SIZE]; +- char buf2[BUF_SIZE]; ++ ++uint64_t combined_value_of_weighted_resources(int ix, int mbs, int cpus, uint64_t MBs_free, uint64_t CPUs_free) { ++ int64_t needed_mem; ++ int64_t needed_cpu; ++ int64_t excess_mem; ++ int64_t excess_cpu; ++ if (MBs_free > mbs) { ++ needed_mem = mbs; ++ excess_mem = MBs_free - mbs; ++ } else { ++ needed_mem = MBs_free; ++ excess_mem = 0; ++ } ++ if (CPUs_free > cpus) { ++ needed_cpu = cpus; ++ excess_cpu = CPUs_free - cpus; ++ } else { ++ needed_cpu = CPUs_free; ++ excess_cpu = 0; ++ } ++ // Weight the available resources, and then calculate magnitude as ++ // product of available CPUs and available MBs. ++ int64_t memfactor = (needed_mem * 10 + excess_mem * 4); ++ int64_t cpufactor = (needed_cpu * 6 + excess_cpu * 1); ++ numad_log(LOG_DEBUG, " Node[%d]: mem: %ld cpu: %ld\n", ix, memfactor, cpufactor); ++ return (memfactor * cpufactor); ++} ++ ++ ++id_list_p pick_numa_nodes(int pid, int cpus, int mbs, int assume_enough_cpus) { + if (log_level >= LOG_DEBUG) { + numad_log(LOG_DEBUG, "PICK NODES FOR: PID: %d, CPUs %d, MBs %d\n", pid, cpus, mbs); + } +- int num_existing_mems = 0; +- static id_list_p existing_mems_list_p; +- CLEAR_LIST(existing_mems_list_p); +- uint64_t time_stamp = get_time_stamp(); +- static node_data_p tmp_node; +- static uint64_t *process_MBs; +- static uint64_t *saved_magnitude_for_node; +- static int process_MBs_num_nodes; +- // See if dynamic structures need to grow. 
+- if (process_MBs_num_nodes < num_nodes + 1) { +- process_MBs_num_nodes = num_nodes + 1; +- // The "+1 node" is for accumulating interleaved memory +- process_MBs = realloc(process_MBs, process_MBs_num_nodes * sizeof(uint64_t)); +- tmp_node = realloc(tmp_node, num_nodes * sizeof(node_data_t) ); +- saved_magnitude_for_node = realloc(saved_magnitude_for_node, num_nodes * sizeof(uint64_t)); +- if ((process_MBs == NULL) || (tmp_node == NULL) || (saved_magnitude_for_node == NULL)) { +- numad_log(LOG_CRIT, "process_MBs realloc failed\n"); +- exit(EXIT_FAILURE); +- } +- } ++ char buf[BUF_SIZE]; ++ uint64_t proc_avg_node_CPUs_free = 0; + // For existing processes, get miscellaneous process specific details + int pid_ix; + process_data_p p = NULL; + if ((pid > 0) && ((pid_ix = process_hash_lookup(pid)) >= 0)) { + p = &process_hash_table[pid_ix]; +- // Quick rejection if this process has interleaved memory, but recheck it once an hour... +-#define MIN_DELAY_FOR_INTERLEAVE (3600 * ONE_HUNDRED) +- if (((p->flags & PROCESS_FLAG_INTERLEAVED) > 0) +- && (p->bind_time_stamp + MIN_DELAY_FOR_INTERLEAVE > time_stamp)) { +- if (log_level >= LOG_DEBUG) { +- numad_log(LOG_DEBUG, "Skipping evaluation because of interleaved memory.\n"); +- } +- return NULL; +- } +- // Get cpuset name for this process, and existing mems binding, if any. ++ // Add up per-node memory in use by this process. ++ // This scanning is expensive and should be minimized. + char fname[FNAME_SIZE]; +- snprintf(fname, FNAME_SIZE, "/proc/%d/cpuset", pid); +- FILE *fs = fopen(fname, "r"); +- if (!fs) { +- numad_log(LOG_WARNING, "Tried to research PID %d cpuset, but it apparently went away.\n", p->pid); +- return NULL; // Assume the process terminated? +- } +- if (!fgets(buf, BUF_SIZE, fs)) { +- numad_log(LOG_WARNING, "Tried to research PID %d cpuset, but it apparently went away.\n", p->pid); +- fclose(fs); +- return NULL; // Assume the process terminated? 
+- } +- fclose(fs); +- ELIM_NEW_LINE(buf); +- if ((!p->cpuset_name) || (strcmp(p->cpuset_name, buf))) { +- if (p->cpuset_name != NULL) { +- free(p->cpuset_name); +- } +- p->cpuset_name = strdup(buf); +- } +- if (log_level >= LOG_DEBUG) { +- numad_log(LOG_DEBUG, "CPUSET_NAME: %s\n", p->cpuset_name); +- } +- snprintf(fname, FNAME_SIZE, "%s%s/cpuset.mems", cpuset_dir, p->cpuset_name); +- fs = fopen(fname, "r"); +- if ((fs) && (fgets(buf, BUF_SIZE, fs))) { +- fclose(fs); +- num_existing_mems = add_ids_to_list_from_str(existing_mems_list_p, buf); +- if (log_level >= LOG_DEBUG) { +- str_from_id_list(buf, BUF_SIZE, existing_mems_list_p); +- numad_log(LOG_DEBUG, "EXISTING CPUSET NODE LIST: %s\n", buf); +- } +- } +- // If this process was just recently bound, enforce a minimum delay +- // period between repeated attempts to potentially move the memory. +- // FIXME: ?? might this retard appropriate process expansion too much? +-#define MIN_DELAY_FOR_REEVALUATION (30 * ONE_HUNDRED) +- if (p->bind_time_stamp + MIN_DELAY_FOR_REEVALUATION > time_stamp) { +- // Skip re-evaluation because we just did it recently. +- if (log_level >= LOG_DEBUG) { +- numad_log(LOG_DEBUG, "Skipping evaluation because done too recently.\n"); +- } +- return NULL; +- } +- // Look for short cut because of duplicate bindings. If we have bound +- // this process to the same nodes multiple times already, and the load +- // on those nodes still seems acceptable, skip the rest of this and +- // just return NULL to indicate no change needed. FIXME: should figure +- // out what can change that would make a rebinding desirable (e.g. (1) +- // some process gets sub-optimal allocation on busy machine which +- // subsequently becomes less busy leaving disadvantaged process. (2) +- // node load imbalance, (3) any process split across nodes which should +- // fit within a single node.) For now, just expire the dup_bid_count +- // occasionally, which is a reasonably good mitigation. 
+- // So, check to see if we should decay the dup_bind_count... +-#define DUP_BIND_TIME_OUT (300 * ONE_HUNDRED) +- if ((p->dup_bind_count > 0) && (p->bind_time_stamp + DUP_BIND_TIME_OUT < time_stamp)) { +- p->dup_bind_count -= 1; +- } +- // Now, look for short cut because of duplicate bindings +- if (p->dup_bind_count > 0) { +- int node_id = 0; +- int nodes_have_cpu = 1; +- int nodes_have_ram = 1; +- int n = num_existing_mems; +- int min_resource_pct = 100 - target_utilization; +- if (min_resource_pct < 5) { +- min_resource_pct = 5; +- } +- while (n) { +- if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) { +- nodes_have_cpu &= ((100 * node[node_id].CPUs_free / node[node_id].CPUs_total) >= (min_resource_pct)); +- nodes_have_ram &= ((100 * node[node_id].MBs_free / node[node_id].MBs_total) >= (min_resource_pct)); +- n -= 1; +- } +- node_id += 1; +- } +- if ((nodes_have_cpu) && (nodes_have_ram)) { +- if (log_level >= LOG_DEBUG) { +- numad_log(LOG_DEBUG, "Skipping evaluation because of repeat binding\n"); +- } +- return NULL; +- } +- if (log_level >= LOG_DEBUG) { +- numad_log(LOG_DEBUG, "Evaluated for skipping by repeat binding, but CPUS: %d, RAM: %d\n", nodes_have_cpu, nodes_have_ram); +- } +- } +- // Fourth, add up per-node memory in use by this process. This scanning +- // is expensive and should be minimized. Also, old kernels dismantle +- // transparent huge pages while producing the numa_maps memory +- // information! +- memset(process_MBs, 0, process_MBs_num_nodes * sizeof(uint64_t)); + snprintf(fname, FNAME_SIZE, "/proc/%d/numa_maps", pid); +- fs = fopen(fname, "r"); ++ FILE *fs = fopen(fname, "r"); + if (!fs) { + numad_log(LOG_WARNING, "Tried to research PID %d numamaps, but it apparently went away.\n", p->pid); + return NULL; // Assume the process terminated + } ++ // Allocate and zero per node memory array. 
++ // The "+1 node" is for accumulating interleaved memory ++ p->process_MBs = realloc(p->process_MBs, (num_nodes + 1) * sizeof(uint64_t)); ++ if (p->process_MBs == NULL) { ++ numad_log(LOG_CRIT, "p->process_MBs realloc failed\n"); ++ exit(EXIT_FAILURE); ++ } ++ memset(p->process_MBs, 0, (num_nodes + 1) * sizeof(uint64_t)); + int process_has_interleaved_memory = 0; + while (fgets(buf, BUF_SIZE, fs)) { + int interleaved_memory = 0; + uint64_t page_size = page_size_in_bytes; + const char *delimiters = " \n"; +- char *p = strtok(buf, delimiters); +- while (p) { +- if (!strncmp(p, "interleave", 10)) { ++ char *str_p = strtok(buf, delimiters); ++ while (str_p) { ++ if (!strncmp(str_p, "interleave", 10)) { + interleaved_memory = 1; + process_has_interleaved_memory = 1; +- } else if (!strcmp(p, "huge")) { ++ } else if (!strcmp(str_p, "huge")) { + page_size = huge_page_size_in_bytes; +- } else if (*p++ == 'N') { ++ } else if (*str_p++ == 'N') { + int node; + uint64_t pages; +- CONVERT_DIGITS_TO_NUM(p, node); +- if (*p++ != '=') { ++ CONVERT_DIGITS_TO_NUM(str_p, node); ++ if (*str_p++ != '=') { + numad_log(LOG_CRIT, "numa_maps node number parse error\n"); + exit(EXIT_FAILURE); + } +- CONVERT_DIGITS_TO_NUM(p, pages); +- process_MBs[node] += (pages * page_size); ++ CONVERT_DIGITS_TO_NUM(str_p, pages); ++ p->process_MBs[node] += (pages * page_size); + if (interleaved_memory) { + // sum interleaved quantity in "extra node" +- process_MBs[num_nodes] += (pages * page_size); ++ p->process_MBs[num_nodes] += (pages * page_size); + } + } + // Get next token on the line +- p = strtok(NULL, delimiters); ++ str_p = strtok(NULL, delimiters); + } + } + fclose(fs); ++ proc_avg_node_CPUs_free = p->CPUs_used; + for (int ix = 0; (ix <= num_nodes); ix++) { +- process_MBs[ix] /= MEGABYTE; +- if (log_level >= LOG_DEBUG) { +- numad_log(LOG_DEBUG, "PROCESS_MBs[%d]: %ld\n", ix, process_MBs[ix]); ++ p->process_MBs[ix] /= MEGABYTE; ++ if ((log_level >= LOG_DEBUG) && (p->process_MBs[ix] > 0)) { ++ if 
(ix == num_nodes) { ++ numad_log(LOG_DEBUG, "Interleaved MBs: %ld\n", ix, p->process_MBs[ix]); ++ } else { ++ numad_log(LOG_DEBUG, "PROCESS_MBs[%d]: %ld\n", ix, p->process_MBs[ix]); ++ } ++ } ++ if (ID_IS_IN_LIST(ix, p->node_list_p)) { ++ proc_avg_node_CPUs_free += node[ix].CPUs_free; + } + } ++ proc_avg_node_CPUs_free /= NUM_IDS_IN_LIST(p->node_list_p); + if ((process_has_interleaved_memory) && (keep_interleaved_memory)) { + // Mark this process as having interleaved memory so we do not +- // merge the interleaved memory. Time stamp it as done. ++ // merge the interleaved memory. Time stamp it as done and return. + p->flags |= PROCESS_FLAG_INTERLEAVED; + p->bind_time_stamp = get_time_stamp(); + if (log_level >= LOG_DEBUG) { +- numad_log(LOG_DEBUG, "Skipping evaluation because of interleaved memory.\n"); ++ numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because of interleaved memory.\n", p->pid); + } + return NULL; + } + } // end of existing PID conditional + // Make a copy of node available resources array. Add in info specific to + // this process to equalize available resource quantities wrt locations of +- // resources already in use by this process. Inflate the value of already +- // assigned memory by approximately 3/2, because moving memory is +- // expensive. Average the amount of CPUs_free across the existing nodes +- // used, because the threads are free to move around in that domain. After +- // calculating combined magnitude of available resources, bias the values +- // towards existing locations for this process. +- int target_using_all_nodes = 0; +- uint64_t node_CPUs_free_for_this_process = 0; +- memcpy(tmp_node, node, num_nodes * sizeof(node_data_t) ); +- if (num_existing_mems > 0) { +- node_CPUs_free_for_this_process = cpus; // ?? Correct for utilization target inflation? 
+- int node_id = 0; +- int n = num_existing_mems; +- while (n) { +- if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) { +- node_CPUs_free_for_this_process += tmp_node[node_id].CPUs_free; +- n -= 1; +- } +- node_id += 1; +- } +- // Divide to get average CPUs_free for the nodes in use by process +- node_CPUs_free_for_this_process /= num_existing_mems; ++ // resources already in use by this process. ++ static node_data_p tmp_node; ++ tmp_node = realloc(tmp_node, num_nodes * sizeof(node_data_t) ); ++ if (tmp_node == NULL) { ++ numad_log(LOG_CRIT, "tmp_node realloc failed\n"); ++ exit(EXIT_FAILURE); + } ++ memcpy(tmp_node, node, num_nodes * sizeof(node_data_t) ); ++ uint64_t sum_of_node_CPUs_free = 0; + for (int ix = 0; (ix < num_nodes); ix++) { + if (pid > 0) { +- tmp_node[ix].MBs_free += ((process_MBs[ix] * 12) / 8); +- } +- if ((num_existing_mems > 0) && (ID_IS_IN_LIST(ix, existing_mems_list_p))) { +- tmp_node[ix].CPUs_free = node_CPUs_free_for_this_process; +- } +- if (tmp_node[ix].CPUs_free > tmp_node[ix].CPUs_total) { +- tmp_node[ix].CPUs_free = tmp_node[ix].CPUs_total; +- } +- if (log_level >= LOG_DEBUG) { +- numad_log(LOG_DEBUG, "PROCESS_CPUs[%d]: %ld\n", ix, tmp_node[ix].CPUs_free); ++ if (NUM_IDS_IN_LIST(p->node_list_p) >= num_nodes) { ++ // Process not yet bound to a subset of nodes. ++ // Add back memory used by this process on this node. ++ tmp_node[ix].MBs_free += ((p->process_MBs[ix] * 17) / 16); // Apply light mem bias ++ // Add back CPU used by this process in proportion to the memory used on this node. ++ tmp_node[ix].CPUs_free += ((p->CPUs_used * p->process_MBs[ix]) / p->MBs_used); ++ } else { ++ // If the process is currently running on less than all the ++ // nodes, first add back (biased) memory already used by this ++ // process on this node, then assign average process CPU / node ++ // for this process iff the process is present on this node. 
++ tmp_node[ix].MBs_free += ((p->process_MBs[ix] * 5) / 4); // Apply heavy mem bias ++ if (ID_IS_IN_LIST(ix, p->node_list_p)) { ++ tmp_node[ix].CPUs_free = proc_avg_node_CPUs_free; ++ } ++ } ++ sum_of_node_CPUs_free += tmp_node[ix].CPUs_free; ++ if (tmp_node[ix].CPUs_free > tmp_node[ix].CPUs_total) { ++ tmp_node[ix].CPUs_free = tmp_node[ix].CPUs_total; ++ } ++ if (tmp_node[ix].MBs_free > tmp_node[ix].MBs_total) { ++ tmp_node[ix].MBs_free = tmp_node[ix].MBs_total; ++ } + } +- // Calculate magnitude as product of available CPUs and available MBs +- tmp_node[ix].magnitude = tmp_node[ix].CPUs_free * tmp_node[ix].MBs_free; +- // Bias combined magnitude towards already assigned nodes +- if (ID_IS_IN_LIST(ix, existing_mems_list_p)) { +- tmp_node[ix].magnitude *= 9; +- tmp_node[ix].magnitude /= 8; ++ // Enforce 1/100th CPU minimum ++ if (tmp_node[ix].CPUs_free < 1) { ++ tmp_node[ix].CPUs_free = 1; + } +- // Save the current magnitudes +- saved_magnitude_for_node[ix] = tmp_node[ix].magnitude; ++ // numad_log(LOG_DEBUG, "Raw Node[%d]: mem: %ld cpu: %ld\n", ix, tmp_node[ix].MBs_free, tmp_node[ix].CPUs_free); ++ tmp_node[ix].magnitude = combined_value_of_weighted_resources(ix, mbs, cpus, tmp_node[ix].MBs_free, tmp_node[ix].CPUs_free); + } +- // OK, figure out where to get resources for this request. ++ // Now figure out where to get resources for this request.... + static id_list_p target_node_list_p; +- CLEAR_LIST(target_node_list_p); +- int prev_node_used = -1; +- // Continue to allocate more resources until request are met. +- // OK if not not quite all the CPU request is met. +- // FIXME: ?? Is half of the utilization margin a good amount of CPU flexing? +- int cpu_flex = ((100 - target_utilization) * tmp_node[0].CPUs_total) / 200; +- if (pid <= 0) { +- // If trying to find resources for pre-placement advice request, do not +- // underestimate the amount of CPUs needed. Instead, err on the side +- // of providing too many resources. So, no flexing here... 
+- cpu_flex = 0; +- } +- while ((mbs > 0) || (cpus > cpu_flex)) { +- if (log_level >= LOG_DEBUG) { +- numad_log(LOG_DEBUG, "MBs: %d, CPUs: %d\n", mbs, cpus); ++ CLEAR_NODE_LIST(target_node_list_p); ++ if ((pid > 0) && (cpus > sum_of_node_CPUs_free)) { ++ // System CPUs might be oversubscribed, but... ++ assume_enough_cpus = 1; ++ // and rely on available memory for placement. ++ } ++ // Establish a CPU flex fudge factor, on the presumption it is OK if not ++ // quite all the CPU request is met. However, if trying to find resources ++ // for pre-placement advice request, do not underestimate the amount of ++ // CPUs needed. Instead, err on the side of providing too many resources. ++ int cpu_flex = 0; ++ if ((pid > 0) && (target_utilization < 100)) { ++ // FIXME: Is half of the utilization margin a good amount of CPU flexing? ++ cpu_flex = ((100 - target_utilization) * node[0].CPUs_total) / 200; ++ } ++ // Figure out minimum number of nodes required ++ int mem_req_nodes = ceil((double)mbs / (double)node[0].MBs_total); ++ int cpu_req_nodes = ceil((double)(cpus - cpu_flex) / (double)node[0].CPUs_total); ++ int min_req_nodes = mem_req_nodes; ++ if (min_req_nodes < cpu_req_nodes) { ++ min_req_nodes = cpu_req_nodes; ++ } ++ if (min_req_nodes > num_nodes) { ++ min_req_nodes = num_nodes; ++ } ++ // Use an index to sort NUMA connected resource chain for each node ++ int index[num_nodes]; ++ uint64_t totmag[num_nodes]; ++ for (int ix = 0; (ix < num_nodes); ix++) { ++ // Reset the index each time ++ for (int n = 0; (n < num_nodes); n++) { ++ index[n] = n; + } +- // Sort nodes by magnitude of available resources. Note that +- // inter-node distances (to the previous node used) are factored into +- // the sort. 
++ // Sort by minimum relative NUMA distance from node[ix], ++ // breaking distance ties with magnitude of available resources + for (int ij = 0; (ij < num_nodes); ij++) { +- int big_ix = ij; ++ int best_ix = ij; + for (int ik = ij + 1; (ik < num_nodes); ik++) { +- uint64_t ik_dist = 1; +- uint64_t big_ix_dist = 1; +- if (prev_node_used >= 0) { +- ik_dist = tmp_node[ik].distance[prev_node_used]; +- big_ix_dist = tmp_node[big_ix].distance[prev_node_used]; +- } +- // Scale magnitude comparison by distances to previous node used... +- if ((tmp_node[big_ix].magnitude / big_ix_dist) < (tmp_node[ik].magnitude / ik_dist)) { +- big_ix = ik; +- } +- } +- if (big_ix != ij) { +- node_data_t tmp; +- memcpy((void *)&tmp, (void *)&tmp_node[ij], sizeof(node_data_t) ); +- memcpy((void *)&tmp_node[ij], (void *)&tmp_node[big_ix], sizeof(node_data_t) ); +- memcpy((void *)&tmp_node[big_ix], (void *)&tmp, sizeof(node_data_t) ); ++ int ik_dist = tmp_node[index[ik]].distance[ix]; ++ int best_ix_dist = tmp_node[index[best_ix]].distance[ix]; ++ if (best_ix_dist > ik_dist) { ++ best_ix = ik; ++ } else if (best_ix_dist == ik_dist) { ++ if (tmp_node[index[best_ix]].magnitude < tmp_node[index[ik]].magnitude ) { ++ best_ix = ik; ++ } ++ } ++ } ++ if (best_ix != ij) { ++ int tmp = index[ij]; ++ index[ij] = index[best_ix]; ++ index[best_ix] = tmp; + } + } ++#if 0 + if (log_level >= LOG_DEBUG) { +- for (int ix = 0; (ix < num_nodes); ix++) { +- numad_log(LOG_DEBUG, "Sorted magnitude[%d]: %ld\n", tmp_node[ix].node_id, tmp_node[ix].magnitude); ++ for (int iq = 0; (iq < num_nodes); iq++) { ++ numad_log(LOG_DEBUG, "Node: %d Dist: %d Magnitude: %ld\n", ++ tmp_node[index[iq]].node_id, tmp_node[index[iq]].distance[ix], tmp_node[index[iq]].magnitude); ++ } ++ } ++#endif ++ // Save the totmag[] sum of the magnitudes of expected needed nodes, ++ // "normalized" by NUMA distance (by dividing each magnitude by the ++ // relative distance squared). 
++ totmag[ix] = 0; ++ for (int ij = 0; (ij < min_req_nodes); ij++) { ++ int dist = tmp_node[index[ij]].distance[ix]; ++ totmag[ix] += (tmp_node[index[ij]].magnitude / (dist * dist)); ++ } ++ numad_log(LOG_DEBUG, "Totmag[%d]: %ld\n", ix, totmag[ix]); ++ } ++ // Now find the best NUMA node based on the normalized sum of node ++ // magnitudes expected to be used. ++ int best_node_ix = 0; ++ for (int ix = 0; (ix < num_nodes); ix++) { ++ if (totmag[best_node_ix] < totmag[ix]) { ++ best_node_ix = ix; ++ } ++ } ++ numad_log(LOG_DEBUG, "best_node_ix: %d\n", best_node_ix); ++ // Reset sorting index again ++ for (int n = 0; (n < num_nodes); n++) { ++ index[n] = n; ++ } ++ // Sort index by distance from node[best_node_ix], ++ // breaking distance ties with magnitude ++ for (int ij = 0; (ij < num_nodes); ij++) { ++ int best_ix = ij; ++ for (int ik = ij + 1; (ik < num_nodes); ik++) { ++ int ik_dist = tmp_node[index[ik]].distance[best_node_ix]; ++ int best_ix_dist = tmp_node[index[best_ix]].distance[best_node_ix]; ++ if (best_ix_dist > ik_dist) { ++ best_ix = ik; ++ } else if (best_ix_dist == ik_dist) { ++ if (tmp_node[index[best_ix]].magnitude < tmp_node[index[ik]].magnitude ) { ++ best_ix = ik; ++ } + } + } +- if (tmp_node[0].node_id == prev_node_used) { +- // Hmmm. Looks like the best node for more resources, is also the +- // last one we used. This is not going to make progress... So +- // just punt and use everything. 
+- OR_LISTS(target_node_list_p, target_node_list_p, all_nodes_list_p); +- target_using_all_nodes = 1; +- break; ++ if (best_ix != ij) { ++ int tmp = index[ij]; ++ index[ij] = index[best_ix]; ++ index[best_ix] = tmp; ++ } ++ } ++ if (log_level >= LOG_DEBUG) { ++ for (int iq = 0; (iq < num_nodes); iq++) { ++ numad_log(LOG_DEBUG, "Node: %d Dist: %d Magnitude: %ld\n", ++ tmp_node[index[iq]].node_id, tmp_node[index[iq]].distance[best_node_ix], tmp_node[index[iq]].magnitude); + } +- prev_node_used = tmp_node[0].node_id; +- ADD_ID_TO_LIST(tmp_node[0].node_id, target_node_list_p); ++ } ++ // Allocate more resources until request is met. ++ best_node_ix = 0; ++ while ((min_req_nodes > 0) || (mbs > 0) || ((cpus > cpu_flex) && (!assume_enough_cpus))) { + if (log_level >= LOG_DEBUG) { +- str_from_id_list(buf, BUF_SIZE, existing_mems_list_p); +- str_from_id_list(buf2, BUF_SIZE, target_node_list_p); +- numad_log(LOG_DEBUG, "Existing nodes: %s Target nodes: %s\n", buf, buf2); ++ numad_log(LOG_DEBUG, "MBs: %d, CPUs: %d\n", mbs, cpus); + } ++ numad_log(LOG_DEBUG, "Assigning resources from node %d\n", index[best_node_ix]); ++ ADD_ID_TO_LIST(tmp_node[index[best_node_ix]].node_id, target_node_list_p); ++ min_req_nodes -= 1; + if (EQUAL_LISTS(target_node_list_p, all_nodes_list_p)) { + // Apparently we must use all resource nodes... 
+- target_using_all_nodes = 1; + break; + } +-#define MBS_MARGIN 10 +- if (tmp_node[0].MBs_free >= (mbs + MBS_MARGIN)) { +- tmp_node[0].MBs_free -= mbs; ++ // "Consume" the resources on this node ++#define CPUS_MARGIN 0 ++#define MBS_MARGIN 100 ++ if (tmp_node[index[best_node_ix]].MBs_free >= (mbs + MBS_MARGIN)) { ++ tmp_node[index[best_node_ix]].MBs_free -= mbs; + mbs = 0; + } else { +- mbs -= (tmp_node[0].MBs_free - MBS_MARGIN); +- tmp_node[0].MBs_free = MBS_MARGIN; ++ mbs -= (tmp_node[index[best_node_ix]].MBs_free - MBS_MARGIN); ++ tmp_node[index[best_node_ix]].MBs_free = MBS_MARGIN; + } +-#define CPUS_MARGIN 0 +- if (tmp_node[0].CPUs_free >= (cpus + CPUS_MARGIN)) { +- tmp_node[0].CPUs_free -= cpus; ++ if (tmp_node[index[best_node_ix]].CPUs_free >= (cpus + CPUS_MARGIN)) { ++ tmp_node[index[best_node_ix]].CPUs_free -= cpus; + cpus = 0; + } else { +- cpus -= (tmp_node[0].CPUs_free - CPUS_MARGIN); +- tmp_node[0].CPUs_free = CPUS_MARGIN; +- } +- tmp_node[0].magnitude = tmp_node[0].CPUs_free * tmp_node[0].MBs_free; +- } +- // If this existing process is already located where we want it, and almost +- // all memory is already moved to those nodes, then return NULL indicating +- // no need to change binding this time. +- if ((pid > 0) && (EQUAL_LISTS(target_node_list_p, existing_mems_list_p))) { +- // May not need to change binding. However, if there is any significant +- // memory still on non-target nodes, advise the bind anyway because +- // there are some scenarios when the kernel will not move it all the +- // first time. +- if (!target_using_all_nodes) { +- p->dup_bind_count += 1; +- for (int ix = 0; (ix < num_nodes); ix++) { +- if ((process_MBs[ix] > 10) && (!ID_IS_IN_LIST(ix, target_node_list_p))) { +- goto try_memory_move_again; +- } +- } +- // We will accept these memory locations. Stamp it as done. 
+- p->bind_time_stamp = get_time_stamp(); +- } +- // Skip rebinding either because practically all memory is in the +- // target nodes, or because we are stuck using all the nodes. +- if (log_level >= LOG_DEBUG) { +- numad_log(LOG_DEBUG, "Skipping evaluation because memory is reasonably situated.\n"); ++ cpus -= (tmp_node[index[best_node_ix]].CPUs_free - CPUS_MARGIN); ++ tmp_node[index[best_node_ix]].CPUs_free = CPUS_MARGIN; + } +- return NULL; +- } else { +- // Either a non-existing process, or a new binding for an existing process. +- if (p != NULL) { +- // Must be a new binding for an existing process, so reset dup_bind_count. +- p->dup_bind_count = 0; +- } +- } +- // See if this proposed move will make a significant difference. +- // If not, return null instead of advising the move. +- uint64_t target_magnitude = 0; +- uint64_t existing_magnitude = 0; +- int num_target_nodes = NUM_IDS_IN_LIST(target_node_list_p); +- int num_existing_nodes = NUM_IDS_IN_LIST(existing_mems_list_p); +- /* FIXME: this expansion seems to cause excessive growth +- * So calculate the improvement before hastily expanding nodes. 
+- if (num_target_nodes > num_existing_nodes) { goto try_memory_move_again; } +- */ +- int node_id = 0; +- int n = num_existing_nodes + num_target_nodes; +- while (n) { +- if (ID_IS_IN_LIST(node_id, target_node_list_p)) { +- target_magnitude += saved_magnitude_for_node[node_id]; +- n -= 1; +- } +- if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) { +- existing_magnitude += saved_magnitude_for_node[node_id]; +- n -= 1; +- } +- node_id += 1; +- } +- if (existing_magnitude > 0) { +- uint64_t magnitude_change = ((target_magnitude - existing_magnitude) * 100) / existing_magnitude; +- if (magnitude_change < 0) { +- magnitude_change = -(magnitude_change); +- } +- if (magnitude_change <= IMPROVEMENT_THRESHOLD_PERCENT) { +- // Not significant enough percentage change to do rebind ++ // Next line optional, since we will not look at that node again ++ tmp_node[index[best_node_ix]].magnitude = combined_value_of_weighted_resources(0, mbs, cpus, tmp_node[index[best_node_ix]].MBs_free, tmp_node[index[best_node_ix]].CPUs_free); ++ best_node_ix += 1; ++ } ++ // For existing processes, calculate the non-local memory percent to see if ++ // process is already in the right place. ++ if ((pid > 0) && (p != NULL)) { ++ uint64_t nonlocal_memory = 0; ++ for (int ix = 0; (ix < num_nodes); ix++) { ++ if (!ID_IS_IN_LIST(ix, target_node_list_p)) { ++ // Accumulate total of nonlocal memory ++ nonlocal_memory += p->process_MBs[ix]; ++ } ++ } ++ int disp_percent = (100 * nonlocal_memory) / p->MBs_used; ++ // If this existing process is already located where we want it, then just ++ // return NULL indicating no need to change binding this time. Check the ++ // ammount of nonlocal memory against the target_memlocality_perecent. 
++ if ((disp_percent <= (100 - target_memlocality)) && (p->bind_time_stamp) && (EQUAL_LISTS(target_node_list_p, p->node_list_p))) { ++ // Already bound to targets, and enough of the memory is located where we want it, so no need to rebind + if (log_level >= LOG_DEBUG) { +- str_from_id_list(buf, BUF_SIZE, existing_mems_list_p); +- str_from_id_list(buf2, BUF_SIZE, target_node_list_p); +- numad_log(LOG_DEBUG, "Moving pid %d from nodes (%s) to nodes (%s) skipped as insignificant improvement: %ld percent.\n", +- pid, buf, buf2, magnitude_change); ++ numad_log(LOG_DEBUG, "Process %d already %d percent localized to target nodes.\n", p->pid, 100 - disp_percent); + } +- // We decided this is almost good enough. Stamp it as done. + p->bind_time_stamp = get_time_stamp(); + return NULL; + } + } +- if ((pid <= 0) && (num_target_nodes <= 0)) { +- // Always provide at least one node for pre-placement advice ++ // Must always provide at least one node for pre-placement advice ++ // FIXME: verify this can happen only if no resources requested... 
++ if ((pid <= 0) && (NUM_IDS_IN_LIST(target_node_list_p) <= 0)) { + ADD_ID_TO_LIST(node[0].node_id, target_node_list_p); + } +-try_memory_move_again: +- str_from_id_list(buf, BUF_SIZE, existing_mems_list_p); ++ // Log advice, and return target node list ++ if ((pid > 0) && (p->bind_time_stamp)) { ++ str_from_id_list(buf, BUF_SIZE, p->node_list_p); ++ } else { ++ str_from_id_list(buf, BUF_SIZE, all_nodes_list_p); ++ } ++ char buf2[BUF_SIZE]; + str_from_id_list(buf2, BUF_SIZE, target_node_list_p); + char *cmd_name = "(unknown)"; + if ((p) && (p->comm)) { + cmd_name = p->comm; + } + numad_log(LOG_NOTICE, "Advising pid %d %s move from nodes (%s) to nodes (%s)\n", pid, cmd_name, buf, buf2); ++ if (pid > 0) { ++ COPY_LIST(target_node_list_p, p->node_list_p); ++ } + return target_node_list_p; + } + + +- +-void show_processes(process_data_p *ptr, int nprocs) { +- time_t ts = time(NULL); +- fprintf(log_fs, "%s", ctime(&ts)); +- fprintf(log_fs, "Candidates: %d\n", nprocs); +- for (int ix = 0; (ix < nprocs); ix++) { +- process_data_p p = ptr[ix]; +- char buf[BUF_SIZE]; +- snprintf(buf, BUF_SIZE, "%s%s/cpuset.mems", cpuset_dir, p->cpuset_name); +- FILE *fs = fopen(buf, "r"); +- buf[0] = '\0'; +- if (fs) { +- if (fgets(buf, BUF_SIZE, fs)) { +- ELIM_NEW_LINE(buf); +- } +- fclose(fs); +- } +- fprintf(log_fs, "%ld: PID %d: %s, Threads %2ld, MBs_used %6ld, CPUs_used %4ld, Magnitude %6ld, Nodes: %s\n", +- p->data_time_stamp, p->pid, p->comm, p->num_threads, p->MBs_used, p->CPUs_used, p->MBs_used * p->CPUs_used, buf); +- } +- fprintf(log_fs, "\n"); +- fflush(log_fs); +-} +- +- +- + int manage_loads() { ++ uint64_t time_stamp = get_time_stamp(); + // Use temporary index to access and sort hash table entries +- static process_data_p *pindex; + static int pindex_size; ++ static process_data_p *pindex; + if (pindex_size < process_hash_table_size) { + pindex_size = process_hash_table_size; + pindex = realloc(pindex, pindex_size * sizeof(process_data_p)); +@@ -1923,19 +2049,54 @@ int 
manage_loads() { + return min_interval / 2; + } + memset(pindex, 0, pindex_size * sizeof(process_data_p)); +- // Copy live candidate pointers to the index for sorting, etc ++ // Copy live candidate pointers to the index for sorting ++ // if they meet the threshold for memory usage and CPU usage. + int nprocs = 0; ++ long sum_CPUs_used = 0; + for (int ix = 0; (ix < process_hash_table_size); ix++) { + process_data_p p = &process_hash_table[ix]; +- if (p->pid) { ++ if ((p->pid) && (p->CPUs_used > CPU_THRESHOLD) && (p->MBs_used > MEMORY_THRESHOLD)) { + pindex[nprocs++] = p; ++ sum_CPUs_used += p->CPUs_used; ++ // Initialize node list, if not already done for this process. ++ if (p->node_list_p == NULL) { ++ initialize_mem_node_list(p); ++ } + } + } +- // Sort index by amount of CPU used * amount of memory used. Not expecting +- // a long list here. Use a simple sort -- however, sort into bins, +- // treating values within 10% as aquivalent. Within bins, order by +- // bind_time_stamp so oldest bound will be higher priority to evaluate. ++ // Order candidate considerations using timestamps and magnitude: amount of ++ // CPU used * amount of memory used. Not expecting a long list here. Use ++ // a simplistic sort -- however move all not yet bound to front of list and ++ // order by decreasing magnitude. Previously bound processes follow in ++ // bins of increasing magnitude treating values within 20% as aquivalent. ++ // Within bins, order by bind_time_stamp so oldest bound will be higher ++ // priority to evaluate. Start by moving all unbound to beginning. 
++ int num_unbound = 0; + for (int ij = 0; (ij < nprocs); ij++) { ++ if (pindex[ij]->bind_time_stamp == 0) { ++ process_data_p tmp = pindex[num_unbound]; ++ pindex[num_unbound++] = pindex[ij]; ++ pindex[ij] = tmp; ++ } ++ } ++ // Sort all unbound so biggest magnitude comes first ++ for (int ij = 0; (ij < num_unbound); ij++) { ++ int best = ij; ++ for (int ik = ij + 1; (ik < num_unbound); ik++) { ++ uint64_t ik_mag = (pindex[ ik]->CPUs_used * pindex[ ik]->MBs_used); ++ uint64_t best_mag = (pindex[best]->CPUs_used * pindex[best]->MBs_used); ++ if (ik_mag <= best_mag) continue; ++ best = ik; ++ } ++ if (best != ij) { ++ process_data_p tmp = pindex[ij]; ++ pindex[ij] = pindex[best]; ++ pindex[best] = tmp; ++ } ++ } ++ // Sort the remaining candidates into bins of increasting magnitude, and by ++ // timestamp within bins. ++ for (int ij = num_unbound; (ij < nprocs); ij++) { + int best = ij; + for (int ik = ij + 1; (ik < nprocs); ik++) { + uint64_t ik_mag = (pindex[ ik]->CPUs_used * pindex[ ik]->MBs_used); +@@ -1946,11 +2107,11 @@ int manage_loads() { + diff_mag = -(diff_mag); + min_mag = best_mag; + } +- if ((diff_mag > 0) && (min_mag / diff_mag < 10)) { +- // difference > 10 percent. Use strict ordering ++ if ((diff_mag > 0) && (min_mag / diff_mag < 5)) { ++ // difference > 20 percent. Use magnitude ordering + if (ik_mag <= best_mag) continue; + } else { +- // difference within 10 percent. Sort these by bind_time_stamp. ++ // difference within 20 percent. Sort these by bind_time_stamp. 
+ if (pindex[ik]->bind_time_stamp > pindex[best]->bind_time_stamp) continue; + } + best = ik; +@@ -1961,23 +2122,57 @@ int manage_loads() { + pindex[best] = tmp; + } + } ++ // Show the candidate processes in the log file + if ((log_level >= LOG_INFO) && (nprocs > 0)) { +- show_processes(pindex, nprocs); ++ numad_log(LOG_INFO, "Candidates: %d\n", nprocs); ++ for (int ix = 0; (ix < nprocs); ix++) { ++ process_data_p p = pindex[ix]; ++ char buf[BUF_SIZE]; ++ str_from_id_list(buf, BUF_SIZE, p->node_list_p); ++ fprintf(log_fs, "%ld: PID %d: %s, Threads %2ld, MBs_size %6ld, MBs_used %6ld, CPUs_used %4ld, Magnitude %6ld, Nodes: %s\n", ++ p->data_time_stamp, p->pid, p->comm, p->num_threads, p->MBs_size, p->MBs_used, p->CPUs_used, p->MBs_used * p->CPUs_used, buf); ++ } ++ fflush(log_fs); + } +- // Estimate desired size and make resource requests for each significant process ++ // Estimate desired size (+ margin capacity) and ++ // make resource requests for each candidate process + for (int ix = 0; (ix < nprocs); ix++) { + process_data_p p = pindex[ix]; +- if (p->CPUs_used * p->MBs_used < CPU_THRESHOLD * MEMORY_THRESHOLD) { +- break; // No more significant processes worth worrying about... ++ // If this process has interleaved memory, recheck it only every 30 minutes... ++#define MIN_DELAY_FOR_INTERLEAVE (1800 * ONE_HUNDRED) ++ if (((p->flags & PROCESS_FLAG_INTERLEAVED) > 0) ++ && (p->bind_time_stamp + MIN_DELAY_FOR_INTERLEAVE > time_stamp)) { ++ if (log_level >= LOG_DEBUG) { ++ numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because of interleaved memory.\n", p->pid); ++ } ++ continue; ++ } ++ // Expand resources needed estimate using target_utilization factor. 
++ // Start with the CPUs actually used (capped by number of threads) for ++ // CPUs required, and the RSS MBs actually used for the MBs ++ // requirement, ++ int mem_target_utilization = target_utilization; ++ int cpu_target_utilization = target_utilization; ++ // Cap memory utilization at 100 percent (but allow CPUs to oversubscribe) ++ if (mem_target_utilization > 100) { ++ mem_target_utilization = 100; ++ } ++ // If the process virtual memory size is bigger than one node, and it ++ // is already using more than 80 percent of a node, then request MBs ++ // based on the virtual size rather than on the current amount in use. ++ int mb_request; ++ if ((p->MBs_size > node[0].MBs_total) && ((p->MBs_used * 5 / 4) > node[0].MBs_total)) { ++ mb_request = (p->MBs_size * 100) / mem_target_utilization; ++ } else { ++ mb_request = (p->MBs_used * 100) / mem_target_utilization; + } +- int mb_request = (p->MBs_used * 100) / target_utilization; +- int cpu_request = (p->CPUs_used * 100) / target_utilization; +- // Do not give a process more CPUs than it has threads! +- // FIXME: For guest VMs, should limit max to VCPU threads. Will +- // need to do something more intelligent with guest IO threads +- // when eventually considering devices and IRQs. ++ int cpu_request = (p->CPUs_used * 100) / cpu_target_utilization; ++ // But do not give a process more CPUs than it has threads! + int thread_limit = p->num_threads; +- // If process looks like a KVM guest, try to limit to number of vCPU threads ++ // If process looks like a KVM guest, try to limit thread count to the ++ // number of vCPU threads. FIXME: Will need to do something more ++ // intelligent than this with guest IO threads when eventually ++ // considering devices and IRQs. 
+ if ((p->comm) && (p->comm[0] == '(') && (p->comm[1] == 'q') && (strcmp(p->comm, "(qemu-kvm)") == 0)) { + int kvm_vcpu_threads = get_num_kvm_vcpu_threads(p->pid); + if (thread_limit > kvm_vcpu_threads) { +@@ -1988,23 +2183,51 @@ int manage_loads() { + if (cpu_request > thread_limit) { + cpu_request = thread_limit; + } ++ // If this process was recently bound, enforce a five-minute minimum ++ // delay between repeated attempts to potentially move the process. ++#define MIN_DELAY_FOR_REEVALUATION (300 * ONE_HUNDRED) ++ if (p->bind_time_stamp + MIN_DELAY_FOR_REEVALUATION > time_stamp) { ++ // Skip re-evaluation because we just did it recently, but check ++ // first for node utilization balance to see if we should ++ // re-evaluate this particular process right now. If this process ++ // is running on one of the busiest nodes, go ahead and re-evaluate ++ // it if it looks like it should have a better place with ++ // sufficient resources. FIXME: this is currently implemented for ++ // only smallish processes that will fit in a single node. 
++ if ( ( ID_IS_IN_LIST(min_node_CPUs_free_ix, p->node_list_p) || ID_IS_IN_LIST(min_node_MBs_free_ix, p->node_list_p)) ++ && (cpu_request < node[0].CPUs_total) && (mb_request < node[0].MBs_total) ++ && (abs(min_node_CPUs_free + p->CPUs_used - avg_node_CPUs_free) ++ + abs((max_node_CPUs_free - p->CPUs_used) - avg_node_CPUs_free) ++ < (max_node_CPUs_free - min_node_CPUs_free) - CPU_THRESHOLD) // CPU slop ++ && (abs(min_node_MBs_free + p->MBs_used - avg_node_MBs_free) ++ + abs((max_node_MBs_free - p->MBs_used) - avg_node_MBs_free) ++ < (max_node_MBs_free - min_node_MBs_free)) ) { ++ if (log_level >= LOG_DEBUG) { ++ numad_log(LOG_DEBUG, "Bypassing delay for %d because it looks like it can do better.\n", p->pid); ++ } ++ } else { ++ if (log_level >= LOG_DEBUG) { ++ numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because done too recently.\n", p->pid); ++ } ++ continue; ++ } ++ } ++ // OK, now pick NUMA nodes for this process and bind it! + pthread_mutex_lock(&node_info_mutex); +- id_list_p node_list_p = pick_numa_nodes(p->pid, cpu_request, mb_request); +- // FIXME: ?? copy node_list_p to shorten mutex region? 
+- if ((node_list_p != NULL) && (bind_process_and_migrate_memory(p->pid, p->cpuset_name, node_list_p, NULL))) { +- // Shorten interval if actively moving processes ++ int assume_enough_cpus = (sum_CPUs_used <= sum_CPUs_total); ++ id_list_p node_list_p = pick_numa_nodes(p->pid, cpu_request, mb_request, assume_enough_cpus); ++ if ((node_list_p != NULL) && (bind_process_and_migrate_memory(p))) { + pthread_mutex_unlock(&node_info_mutex); +- p->bind_time_stamp = get_time_stamp(); ++ // Return minimum interval when actively moving processes + return min_interval; + } + pthread_mutex_unlock(&node_info_mutex); + } +- // Return maximum interval if no process movement ++ // Return maximum interval when no process movement + return max_interval; + } + + +- + void *set_dynamic_options(void *arg) { + // int arg_value = *(int *)arg; + char buf[BUF_SIZE]; +@@ -2013,6 +2236,18 @@ void *set_dynamic_options(void *arg) { + msg_t msg; + recv_msg(&msg); + switch (msg.body.cmd) { ++ case 'C': ++ use_inactive_file_cache = (msg.body.arg1 != 0); ++ if (use_inactive_file_cache) { ++ numad_log(LOG_NOTICE, "Counting inactive file cache as available\n"); ++ } else { ++ numad_log(LOG_NOTICE, "Counting inactive file cache as unavailable\n"); ++ } ++ break; ++ case 'H': ++ thp_scan_sleep_ms = msg.body.arg1; ++ set_thp_scan_sleep_ms(thp_scan_sleep_ms); ++ break; + case 'i': + min_interval = msg.body.arg1; + max_interval = msg.body.arg2; +@@ -2033,6 +2268,10 @@ void *set_dynamic_options(void *arg) { + numad_log(LOG_NOTICE, "Changing log level to %d\n", msg.body.arg1); + log_level = msg.body.arg1; + break; ++ case 'm': ++ numad_log(LOG_NOTICE, "Changing target memory locality to %d\n", msg.body.arg1); ++ target_memlocality = msg.body.arg1; ++ break; + case 'p': + numad_log(LOG_NOTICE, "Adding PID %d to inclusion PID list\n", msg.body.arg1); + pthread_mutex_lock(&pid_list_mutex); +@@ -2055,6 +2294,11 @@ void *set_dynamic_options(void *arg) { + numad_log(LOG_NOTICE, "Scanning only explicit PID list 
processes\n"); + } + break; ++ case 't': ++ numad_log(LOG_NOTICE, "Changing logical CPU thread percent to %d\n", msg.body.arg1); ++ htt_percent = msg.body.arg1; ++ node_info_time_stamp = 0; // to force rescan of nodes/cpus soon ++ break; + case 'u': + numad_log(LOG_NOTICE, "Changing target utilization to %d\n", msg.body.arg1); + target_utilization = msg.body.arg1; +@@ -2064,7 +2308,7 @@ void *set_dynamic_options(void *arg) { + msg.body.arg1, msg.body.arg2); + pthread_mutex_lock(&node_info_mutex); + update_nodes(); +- id_list_p node_list_p = pick_numa_nodes(-1, msg.body.arg1, msg.body.arg2); ++ id_list_p node_list_p = pick_numa_nodes(-1, msg.body.arg1, msg.body.arg2, 0); + str_from_id_list(buf, BUF_SIZE, node_list_p); + pthread_mutex_unlock(&node_info_mutex); + send_msg(msg.body.src_pid, 'w', 0, 0, buf); +@@ -2134,30 +2378,50 @@ void parse_two_arg_values(char *p, int * + + int main(int argc, char *argv[]) { + int opt; ++ int C_flag = 0; + int d_flag = 0; ++ int H_flag = 0; + int i_flag = 0; + int K_flag = 0; + int l_flag = 0; ++ int m_flag = 0; + int p_flag = 0; + int r_flag = 0; + int S_flag = 0; ++ int t_flag = 0; + int u_flag = 0; + int v_flag = 0; + int w_flag = 0; + int x_flag = 0; ++ int tmp_int = 0; + long list_pid = 0; +- while ((opt = getopt(argc, argv, "dD:hi:K:l:p:r:S:u:vVw:x:")) != -1) { ++ while ((opt = getopt(argc, argv, "C:dD:hH:i:K:l:p:r:R:S:t:u:vVw:x:")) != -1) { + switch (opt) { ++ case 'C': ++ C_flag = 1; ++ use_inactive_file_cache = (atoi(optarg) != 0); ++ break; + case 'd': + d_flag = 1; + log_level = LOG_DEBUG; + break; + case 'D': +- cpuset_dir_list[0] = strdup(optarg); ++ // obsoleted + break; + case 'h': + print_usage_and_exit(argv[0]); + break; ++ case 'H': ++ tmp_int = atoi(optarg); ++ if ((tmp_int == 0) || ((tmp_int > 9) && (tmp_int < 1000001))) { ++ // 0 means do not change the system default value ++ H_flag = 1; ++ thp_scan_sleep_ms = tmp_int; ++ } else { ++ fprintf(stderr, "THP scan_sleep_ms must be > 9 and < 1000001\n"); ++ 
exit(EXIT_FAILURE); ++ } ++ break; + case 'i': + i_flag = 1; + parse_two_arg_values(optarg, &min_interval, &max_interval, 1, 0); +@@ -2170,6 +2434,13 @@ int main(int argc, char *argv[]) { + l_flag = 1; + log_level = atoi(optarg); + break; ++ case 'm': ++ tmp_int = atoi(optarg); ++ if ((tmp_int >= 50) && (tmp_int <= 100)) { ++ m_flag = 1; ++ target_memlocality = tmp_int; ++ } ++ break; + case 'p': + p_flag = 1; + list_pid = atol(optarg); +@@ -2183,13 +2454,26 @@ int main(int argc, char *argv[]) { + include_pid_list = remove_pid_from_pid_list(include_pid_list, list_pid); + exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, list_pid); + break; ++ case 'R': ++ reserved_cpu_str = strdup(optarg); ++ break; + case 'S': + S_flag = 1; + scan_all_processes = (atoi(optarg) != 0); + break; ++ case 't': ++ tmp_int = atoi(optarg); ++ if ((tmp_int >= 0) && (tmp_int <= 100)) { ++ t_flag = 1; ++ htt_percent = tmp_int; ++ } ++ break; + case 'u': +- u_flag = 1; +- target_utilization = atoi(optarg); ++ tmp_int = atoi(optarg); ++ if ((tmp_int >= 10) && (tmp_int <= 130)) { ++ u_flag = 1; ++ target_utilization = tmp_int; ++ } + break; + case 'v': + v_flag = 1; +@@ -2234,6 +2518,12 @@ int main(int argc, char *argv[]) { + // Daemon is already running. So send dynamic options to persistant + // thread to handle requests, get the response (if any), and finish. 
+ msg_t msg; ++ if (C_flag) { ++ send_msg(daemon_pid, 'C', use_inactive_file_cache, 0, ""); ++ } ++ if (H_flag) { ++ send_msg(daemon_pid, 'H', thp_scan_sleep_ms, 0, ""); ++ } + if (i_flag) { + send_msg(daemon_pid, 'i', min_interval, max_interval, ""); + } +@@ -2243,6 +2533,9 @@ int main(int argc, char *argv[]) { + if (d_flag || l_flag || v_flag) { + send_msg(daemon_pid, 'l', log_level, 0, ""); + } ++ if (m_flag) { ++ send_msg(daemon_pid, 'm', target_memlocality, 0, ""); ++ } + if (p_flag) { + send_msg(daemon_pid, 'p', list_pid, 0, ""); + } +@@ -2252,6 +2545,9 @@ int main(int argc, char *argv[]) { + if (S_flag) { + send_msg(daemon_pid, 'S', scan_all_processes, 0, ""); + } ++ if (t_flag) { ++ send_msg(daemon_pid, 't', htt_percent, 0, ""); ++ } + if (u_flag) { + send_msg(daemon_pid, 'u', target_utilization, 0, ""); + } +@@ -2263,14 +2559,30 @@ int main(int argc, char *argv[]) { + if (x_flag) { + send_msg(daemon_pid, 'x', list_pid, 0, ""); + } +- } else if (w_flag) { +- // Get pre-placement NUMA advice without starting daemon ++ close_log_file(); ++ exit(EXIT_SUCCESS); ++ } ++ // No numad daemon running yet. ++ // First, make note of any reserved CPUs.... ++ if (reserved_cpu_str != NULL) { ++ CLEAR_CPU_LIST(reserved_cpu_mask_list_p); ++ int n = add_ids_to_list_from_str(reserved_cpu_mask_list_p, reserved_cpu_str); + char buf[BUF_SIZE]; ++ str_from_id_list(buf, BUF_SIZE, reserved_cpu_mask_list_p); ++ numad_log(LOG_NOTICE, "Reserving %d CPUs (%s) for non-numad use\n", n, buf); ++ // turn reserved list into a negated mask for later ANDing use... ++ negate_cpu_list(reserved_cpu_mask_list_p); ++ } ++ // If it is a "-w" pre-placement request, handle that without starting ++ // the daemon. Otherwise start the numad daemon. 
++ if (w_flag) { ++ // Get pre-placement NUMA advice without starting daemon + update_nodes(); + sleep(2); + update_nodes(); + numad_log(LOG_NOTICE, "Getting NUMA pre-placement advice for %d CPUs and %d MBs\n", requested_cpus, requested_mbs); +- id_list_p node_list_p = pick_numa_nodes(-1, requested_cpus, requested_mbs); ++ id_list_p node_list_p = pick_numa_nodes(-1, requested_cpus, requested_mbs, 0); ++ char buf[BUF_SIZE]; + str_from_id_list(buf, BUF_SIZE, node_list_p); + fprintf(stdout, "%s\n", buf); + close_log_file(); +@@ -2278,6 +2590,7 @@ int main(int argc, char *argv[]) { + } else if (max_interval > 0) { + // Start the numad daemon... + check_prereqs(argv[0]); ++#if (!NO_DAEMON) + // Daemonize self... + daemon_pid = fork(); + if (daemon_pid < 0) { numad_log(LOG_CRIT, "fork() failed\n"); exit(EXIT_FAILURE); } +@@ -2298,9 +2611,20 @@ int main(int argc, char *argv[]) { + if (log_fs != stderr) { + fclose(stderr); + } ++#endif ++ // Set up signal handlers ++ struct sigaction sa; ++ memset(&sa, 0, sizeof(sa)); ++ sa.sa_handler = sig_handler; ++ if (sigaction(SIGHUP, &sa, NULL) ++ || sigaction(SIGTERM, &sa, NULL) ++ || sigaction(SIGQUIT, &sa, NULL)) { ++ numad_log(LOG_CRIT, "sigaction does not work?\n"); ++ exit(EXIT_FAILURE); ++ } + // Allocate initial process hash table + process_hash_table_expand(); +- // Spawn thread to handle messages from subsequent invocation requests ++ // Spawn a thread to handle messages from subsequent invocation requests + pthread_mutex_init(&pid_list_mutex, NULL); + pthread_mutex_init(&node_info_mutex, NULL); + pthread_attr_t attr; +@@ -2310,7 +2634,7 @@ int main(int argc, char *argv[]) { + } + pthread_t tid; + if (pthread_create(&tid, &attr, &set_dynamic_options, &tid) != 0) { +- numad_log(LOG_CRIT, "pthread_create failure\n"); ++ numad_log(LOG_CRIT, "pthread_create failure: setting thread\n"); + exit(EXIT_FAILURE); + } + // Loop here forwever... 
+@@ -2322,16 +2646,26 @@ int main(int argc, char *argv[]) { + if (nodes > 1) { + update_processes(); + interval = manage_loads(); ++ if (interval < max_interval) { ++ // Update node info since we moved something ++ nodes = update_nodes(); ++ } + } + sleep(interval); ++ if (got_sigterm | got_sigquit) { ++ shut_down_numad(); ++ } ++ if (got_sighup) { ++ got_sighup = 0; ++ close_log_file(); ++ open_log_file(); ++ } + } + if (pthread_attr_destroy(&attr) != 0) { + numad_log(LOG_WARNING, "pthread_attr_destroy failure\n"); + } + pthread_mutex_destroy(&pid_list_mutex); + pthread_mutex_destroy(&node_info_mutex); +- } else { +- shut_down_numad(); + } + exit(EXIT_SUCCESS); + } +diff -rup numad-0.5git/numad.init numad-0.5git-new/numad.init +--- numad-0.5git/numad.init 2012-12-03 15:40:40.000000000 +0100 ++++ numad-0.5git-new/numad.init 2016-08-30 08:45:19.000000000 +0200 +@@ -37,7 +37,7 @@ start() { + [ -f $config ] || exit 6 + echo -n $"Starting $prog: " + . $config +- daemon "$exec -i $INTERVAL" ++ daemon $exec -i $INTERVAL + retval=$? + echo + [ $retval -eq 0 ] && touch $lockfile diff --git a/SOURCES/numad.logrotate b/SOURCES/numad.logrotate new file mode 100644 index 00000000..9ea1ab4b --- /dev/null +++ b/SOURCES/numad.logrotate @@ -0,0 +1,8 @@ +/var/log/numad.log { + compress + copytruncate + maxage 60 + missingok + rotate 5 + size 1M +} diff --git a/SPECS/numad.spec b/SPECS/numad.spec new file mode 100644 index 00000000..76554827 --- /dev/null +++ b/SPECS/numad.spec @@ -0,0 +1,135 @@ +%global systemctl_bin /usr/bin/systemctl +%global _hardened_build 1 + +Name: numad +Version: 0.5 +Release: 18.20150602git%{?dist} +Summary: NUMA user daemon + +License: LGPLv2 +Group: System Environment/Daemons +URL: http://git.fedorahosted.org/git/?p=numad.git +# The source for this package was pulled from upstream's vcs. 
Use the +# following commands to generate the tarball: +# git clone git://git.fedorahosted.org/numad.git numad-0.5git +# tar --exclude-vcs -cJf numad-0.5git.tar.xz numad-0.5git/ +Source0: %{name}-%{version}git.tar.xz +Source1: %{name}.logrotate +Patch0: numad-0.5git-pthread.patch +Patch1: numad-0.5git-version.patch +Patch2: numad-0.5git-m-option.patch + +Requires: systemd-units, initscripts +Requires(post): systemd-units, initscripts +Requires(preun): systemd-units, initscripts +BuildRequires: systemd-units + +ExcludeArch: s390 s390x %{arm} + +%description +Numad, a daemon for NUMA (Non-Uniform Memory Architecture) systems, +that monitors NUMA characteristics and manages placement of processes +and memory to minimize memory latency and thus provide optimum performance. + +%prep +%setup -q -n %{name}-%{version}git +%patch0 -p0 +%patch1 -p1 +%patch2 -p1 + +%build +make CFLAGS="$RPM_OPT_FLAGS -std=gnu99" LDFLAGS="$RPM_LD_FLAGS -lpthread -lrt -lm" + +%install +mkdir -p %{buildroot}%{_bindir} +mkdir -p %{buildroot}%{_sysconfdir}/logrotate.d +mkdir -p %{buildroot}%{_unitdir} +mkdir -p %{buildroot}%{_mandir}/man8/ +install -p -m 644 numad.conf %{buildroot}%{_sysconfdir}/ +install -p -m 644 numad.service %{buildroot}%{_unitdir}/ +install -p -m 644 %SOURCE1 %{buildroot}%{_sysconfdir}/logrotate.d/%{name} +make install prefix=%{buildroot}/usr + +%files +%{_bindir}/numad +%{_unitdir}/numad.service +%config(noreplace) %{_sysconfdir}/numad.conf +%config(noreplace) %{_sysconfdir}/logrotate.d/numad +%doc %{_mandir}/man8/numad.8.gz + +%post +%systemd_post numad.service + +%preun +%systemd_preun numad.service + +%postun +%systemd_postun numad.service + +%changelog +* Mon Oct 30 2017 Jan Synáček - 0.5-18.20150602git +- Fix -m option (#1506477) + +* Tue Aug 30 2016 Jan Synáček - 0.5-17.20150602git +- Fix the version patch (#1281711) + +* Mon Jul 11 2016 Jan Synáček - 0.5-16.20150602git +- Version update (#1281711 #1238614 #1235164) + +* Thu May 26 2016 Jan Synáček - 0.5-15.20140620git 
+- Harden the build (#1092544) + +* Fri Sep 5 2014 Jan Synáček - 0.5-14.20140620git +- Version update +- Resolves: #1112109 + +* Wed Mar 26 2014 Jan Synáček - 0.5-13.20140225git +- Build with $RPM_OPT_FLAGS and $RPM_LD_FLAGS +- Resolves: #1070781 + +* Fri Feb 28 2014 Jan Synáček - 0.5-12.20140225git +- Update source (20140225) and manpage +- Add logrotate config +- Resolves: #853232 + +* Fri Dec 27 2013 Daniel Mach - 0.5-11.20121130git +- Mass rebuild 2013-12-27 + +* Thu Feb 14 2013 Fedora Release Engineering - 0.5-10.20121130git +- Rebuilt for https://fedoraproject.org/wiki/Fedora_19_Mass_Rebuild + +* Tue Dec 11 2012 Jan Synáček - 0.5-9.20121130git +- Update and comment the Makefile patch +- Related: #825153 + +* Mon Dec 03 2012 Jan Synáček - 0.5-8.20121130git +- Update to 20121130 +- Update spec: fix command to generate tarball + +* Tue Oct 16 2012 Jan Synáček - 0.5-7.20121015git +- Update to 20121015 +- Add Makefile patch +- Update spec: update command to generate tarball + +* Wed Aug 22 2012 Jan Synáček - 0.5-6.20120522git +- add systemd-rpm macros +- Resolves: #850236 + +* Fri Jul 20 2012 Fedora Release Engineering - 0.5-5.20120522git +- Rebuilt for https://fedoraproject.org/wiki/Fedora_18_Mass_Rebuild + +* Wed May 23 2012 Jan Synáček - 0.5-4.20120522git +- update source (20120522) and manpage + +* Tue Mar 06 2012 Jan Synáček 0.5-3.20120221git +- update source +- drop the patch + +* Fri Feb 24 2012 Jan Synáček 0.5-2.20120221git +- add BuildRequires: systemd-units + +* Wed Feb 15 2012 Jan Synáček 0.5-1.20120221git +- spec update + +* Fri Feb 10 2012 Bill Burns 0.5-1 +- initial version