diff --git a/SOURCES/numad-0.5git-m-option.patch b/SOURCES/numad-0.5git-m-option.patch
new file mode 100644
index 00000000..8fee75f9
--- /dev/null
+++ b/SOURCES/numad-0.5git-m-option.patch
@@ -0,0 +1,21 @@
+Resolves: #1506477
+
+commit cf6c2c029edc9c288122bcd603a72eb7f6d042d2
+Author: Jan Synacek <jsynacek@redhat.com>
+Date:   Mon Oct 30 11:37:45 2017 +0100
+
+    recognize -m option correctly
+
+diff --git a/numad.c b/numad.c
+index 4c85486..0721af4 100644
+--- a/numad.c
++++ b/numad.c
+@@ -2395,7 +2395,7 @@ int main(int argc, char *argv[]) {
+     int x_flag = 0;
+     int tmp_int = 0;
+     long list_pid = 0;
+-    while ((opt = getopt(argc, argv, "C:dD:hH:i:K:l:p:r:R:S:t:u:vVw:x:")) != -1) {
++    while ((opt = getopt(argc, argv, "C:dD:hH:i:K:l:m:p:r:R:S:t:u:vVw:x:")) != -1) {
+         switch (opt) {
+         case 'C':
+             C_flag = 1;
diff --git a/SOURCES/numad-0.5git-pthread.patch b/SOURCES/numad-0.5git-pthread.patch
new file mode 100644
index 00000000..bb365300
--- /dev/null
+++ b/SOURCES/numad-0.5git-pthread.patch
@@ -0,0 +1,17 @@
+Remove linked libraries from Makefile. They break rebuilds from srpms on some
+architectures. The linker flags are supplied from the spec.
+
+Author: Jan Synacek <jsynacek@redhat.com>
+RH-Bugzilla: #825153
+
+--- Makefile.orig	2012-09-11 08:29:18.965821127 +0200
++++ Makefile	2012-09-11 08:29:29.391803358 +0200
+@@ -31,7 +31,7 @@
+
+ all: numad
+
+-numad: numad.o -lpthread -lrt
++numad: numad.o
+
+ AR ?= ar
+ RANLIB ?= ranlib
diff --git a/SOURCES/numad-0.5git-update-20140225.patch b/SOURCES/numad-0.5git-update-20140225.patch
new file mode 100755
index 00000000..532e776b
--- /dev/null
+++ b/SOURCES/numad-0.5git-update-20140225.patch
@@ -0,0 +1,2255 @@
+--- numad-0.5git/numad.c	2012-12-03 15:40:40.000000000 +0100
++++ new-rhel7/numad.c	2014-02-27 10:02:58.000000000 +0100
+@@ -19,7 +19,7 @@ Inc., 59 Temple Place, Suite 330, Boston
+ */
+
+
+-// Compile with: gcc -O -std=gnu99 -Wall -pthread -o numad numad.c -lrt
++// Compile with: gcc -std=gnu99 -g -Wall -pthread -o numad numad.c -lrt -lm
+
+
+ #define _GNU_SOURCE
+@@ -54,7 +54,7 @@ Inc., 59 Temple Place, Suite 330, Boston
+ #include <values.h>
+
+
+-#define VERSION_STRING "20121130"
++#define VERSION_STRING "20140225"
+
+
+ #define VAR_RUN_FILE "/var/run/numad.pid"
+@@ -86,15 +86,11 @@ char *cpuset_dir_list[] =  {
+ #define MAX_INTERVAL 15
+ #define CPU_THRESHOLD     50
+ #define MEMORY_THRESHOLD 300
++#define THP_SCAN_SLEEP_MS 1000
+ #define TARGET_UTILIZATION_PERCENT 85
+-#define IMPROVEMENT_THRESHOLD_PERCENT 5
++#define DEFAULT_HTT_PERCENT 20
+
+
+-#define ELIM_NEW_LINE(s) \
+-    if (s[strlen(s) - 1] == '\n') { \
+-        s[strlen(s) - 1] = '\0'; \
+-    }
+-
+ #define CONVERT_DIGITS_TO_NUM(p, n) \
+     n = *p++ - '0'; \
+     while (isdigit(*p)) { \
+@@ -105,19 +101,36 @@ char *cpuset_dir_list[] =  {
+
+ int num_cpus = 0;
+ int num_nodes = 0;
++int threads_per_core = 0;
+ int page_size_in_bytes = 0;
+ int huge_page_size_in_bytes = 0;
++int thp_scan_sleep_ms = THP_SCAN_SLEEP_MS;
+
+ int min_interval = MIN_INTERVAL;
+ int max_interval = MAX_INTERVAL;
++int htt_percent = DEFAULT_HTT_PERCENT;
+ int target_utilization  = TARGET_UTILIZATION_PERCENT;
+ int scan_all_processes = 1;
+ int keep_interleaved_memory = 0;
++int use_inactive_file_cache = 1;
+
+ pthread_mutex_t pid_list_mutex;
+ pthread_mutex_t node_info_mutex;
++long sum_CPUs_total = 0;
+ int requested_mbs = 0;
+ int requested_cpus = 0;
++volatile sig_atomic_t got_sighup = 0;   // Set to 1 by sig_handler() on SIGHUP
++volatile sig_atomic_t got_sigterm = 0;  // Set to 1 by sig_handler() on SIGTERM
++volatile sig_atomic_t got_sigquit = 0;  // Set to 1 by sig_handler() on SIGQUIT
++
++
++void sig_handler(int signum) {  // Only records which signal arrived; no other work is done here
++    switch (signum) {
++        case SIGHUP:  got_sighup  = 1; break;
++        case SIGTERM: got_sigterm = 1; break;
++        case SIGQUIT: got_sigquit = 1; break;
++    }
++}
+
+
+
+@@ -161,7 +174,9 @@ void open_log_file() {
+
+ void close_log_file() {
+     if (log_fs != NULL) {
+-        fclose(log_fs);
++        if (log_fs != stderr) {
++            fclose(log_fs);
++        }
+         log_fs = NULL;
+     }
+ }
+@@ -233,7 +248,6 @@ void send_msg(long dst_pid, long cmd, lo
+ }
+
+
+-
+ typedef struct id_list {
+     // Use CPU_SET(3) <sched.h> cpuset bitmasks,
+     // but bundle size and pointer together
+@@ -242,16 +256,22 @@ typedef struct id_list {
+     size_t bytes;
+ } id_list_t, *id_list_p;
+
+-#define INIT_ID_LIST(list_p) \
++#define INIT_ID_LIST(list_p, num_elements) \
+     list_p = malloc(sizeof(id_list_t)); \
+     if (list_p == NULL) { numad_log(LOG_CRIT, "INIT_ID_LIST malloc failed\n"); exit(EXIT_FAILURE); } \
+-    list_p->set_p = CPU_ALLOC(num_cpus); \
++    list_p->set_p = CPU_ALLOC(num_elements); \
+     if (list_p->set_p == NULL) { numad_log(LOG_CRIT, "CPU_ALLOC failed\n"); exit(EXIT_FAILURE); } \
+-    list_p->bytes = CPU_ALLOC_SIZE(num_cpus);
++    list_p->bytes = CPU_ALLOC_SIZE(num_elements);
++
++#define CLEAR_CPU_LIST(list_p) \
++    if (list_p == NULL) { \
++        INIT_ID_LIST(list_p, num_cpus); \
++    } \
++    CPU_ZERO_S(list_p->bytes, list_p->set_p)
+
+-#define CLEAR_LIST(list_p) \
++#define CLEAR_NODE_LIST(list_p) \
+     if (list_p == NULL) { \
+-        INIT_ID_LIST(list_p); \
++        INIT_ID_LIST(list_p, num_nodes); \
+     } \
+     CPU_ZERO_S(list_p->bytes, list_p->set_p)
+
+@@ -262,6 +282,9 @@ typedef struct id_list {
+         list_p = NULL; \
+     }
+
++#define COPY_LIST(orig_list_p, copy_list_p) \
++    memcpy(copy_list_p->set_p, orig_list_p->set_p, orig_list_p->bytes)
++
+ #define NUM_IDS_IN_LIST(list_p)     CPU_COUNT_S(list_p->bytes, list_p->set_p)
+ #define ADD_ID_TO_LIST(k, list_p)  CPU_SET_S(k, list_p->bytes, list_p->set_p)
+ #define CLR_ID_IN_LIST(k, list_p)  CPU_CLR_S(k, list_p->bytes, list_p->set_p)
+@@ -272,6 +295,25 @@ typedef struct id_list {
+ #define  OR_LISTS( or_list_p, list_1_p, list_2_p)  CPU_OR_S( or_list_p->bytes,  or_list_p->set_p, list_1_p->set_p, list_2_p->set_p)
+ #define XOR_LISTS(xor_list_p, list_1_p, list_2_p) CPU_XOR_S(xor_list_p->bytes, xor_list_p->set_p, list_1_p->set_p, list_2_p->set_p)
+
++int negate_list(id_list_p list_p) {  // Toggle membership of IDs 0..num_cpus-1 in place; returns the new ID count
++    if (list_p == NULL) {
++        numad_log(LOG_CRIT, "Cannot negate a NULL list\n");
++        exit(EXIT_FAILURE);
++    }
++    if (num_cpus < 1) {  // Nothing to iterate over, so treat as fatal
++        numad_log(LOG_CRIT, "No CPUs to negate in list!\n");
++        exit(EXIT_FAILURE);
++    }
++    for (int ix = 0;  (ix < num_cpus);  ix++) {
++        if (ID_IS_IN_LIST(ix, list_p)) {
++            CLR_ID_IN_LIST(ix, list_p);
++        } else {
++            ADD_ID_TO_LIST(ix, list_p);
++        }
++    }
++    return NUM_IDS_IN_LIST(list_p);  // Count of IDs set after the flip
++}
++
+ int add_ids_to_list_from_str(id_list_p list_p, char *s) {
+     if (list_p == NULL) {
+         numad_log(LOG_CRIT, "Cannot add to NULL list\n");
+@@ -340,6 +382,25 @@ terminate_string:
+     return (p - str_p);
+ }
+
++id_list_p all_cpus_list_p = NULL;
++id_list_p all_nodes_list_p = NULL;
++char *reserved_cpu_str = NULL;
++id_list_p reserved_cpu_mask_list_p = NULL;
++uint64_t node_info_time_stamp = 0;
++
++
++int read_one_line(char *buf, int buf_size, char *fname) {  // Returns bytes read, or negative on open/read failure
++    int fd = open(fname, O_RDONLY, 0);
++    if (fd < 0) {
++        return fd;
++    }
++    int bytes = read(fd, buf, buf_size);  // 0 for an empty file, -1 on error
++    if ((bytes > 0) && (buf[bytes - 1] == '\n')) {  // Never index buf[-1] or buf[-2]
++        buf[bytes - 1] = '\0';  // Replace the trailing newline with a terminator
++    }
++    close(fd);
++    return bytes;
++}
+
+
+ typedef struct node_data {
+@@ -355,6 +416,16 @@ typedef struct node_data {
+
+ node_data_p node = NULL;
+
++uint64_t min_node_CPUs_free = MAXINT;
++uint64_t min_node_MBs_free = MAXINT;
++uint64_t max_node_CPUs_free = 0;
++uint64_t max_node_MBs_free = 0;
++uint64_t avg_node_CPUs_free = 0;
++uint64_t avg_node_MBs_free = 0;
++double stddev_node_CPUs_free = 0.0;
++double stddev_node_MBs_free = 0.0;
++
++
+ // RING_BUF_SIZE must be a power of two
+ #define RING_BUF_SIZE 8
+
+@@ -366,14 +437,14 @@ typedef struct process_data {
+     uint64_t data_time_stamp; // hundredths of seconds
+     uint64_t bind_time_stamp;
+     uint64_t num_threads;
++    uint64_t MBs_size;
+     uint64_t MBs_used;
+     uint64_t cpu_util;
+     uint64_t CPUs_used;  // scaled * ONE_HUNDRED
+     uint64_t CPUs_used_ring_buf[RING_BUF_SIZE];
+     int ring_buf_ix;
+-    int dup_bind_count;
+     char *comm;
+-    char *cpuset_name;
++    id_list_p node_list_p;
+ } process_data_t, *process_data_p;
+
+
+@@ -454,12 +525,15 @@ int process_hash_update(process_data_p n
+             }
+             p->CPUs_used = max_CPUs_used;
+         }
++// FIXME: seems like this comm check should not be necessary every update
++// But it does happen only for candidates that cross the memory threshold...
+         if ((!p->comm) || (strcmp(p->comm, newp->comm))) {
+             if (p->comm) {
+                 free(p->comm);
+             }
+             p->comm = strdup(newp->comm);
+         }
++        p->MBs_size = newp->MBs_size;
+         p->MBs_used = newp->MBs_used;
+         p->cpu_util = newp->cpu_util;
+         p->num_threads = newp->num_threads;
+@@ -468,6 +542,11 @@ int process_hash_update(process_data_p n
+     return new_hash_table_entry;
+ }
+
++void process_hash_clear_all_bind_time_stamps() {  // Zero bind_time_stamp in every hash table slot
++    for (int ix = 0;  (ix < process_hash_table_size);  ix++) {
++        process_hash_table[ix].bind_time_stamp = 0;
++    }
++}
+
+ int process_hash_rehash(int old_ix) {
+     // Given the index of a table entry that would otherwise be orphaned by
+@@ -489,7 +568,7 @@ int process_hash_remove(int pid) {
+         // remove the target
+         process_data_p dp = &process_hash_table[ix];
+         if (dp->comm) { free(dp->comm); }
+-        if (dp->cpuset_name) { free(dp->cpuset_name); }
++        FREE_LIST(dp->node_list_p);
+         memset(dp, 0, sizeof(process_data_t));
+         // bubble up the collision chain and rehash if neeeded
+         for (;;) {
+@@ -543,15 +622,29 @@ void process_hash_table_dump() {
+         process_data_p p = &process_hash_table[ix];
+         if (p->pid) {
+             numad_log(LOG_DEBUG,
+-                "ix: %d  PID: %d %s  Thds: %d  CPU %ld  MBs: %ld Data TS: %ld  Bind TS: %ld\n",
++                "ix: %d  PID: %d %s  Thds: %d  CPU %ld  MBs: %ld/%ld Data TS: %ld  Bind TS: %ld\n",
+                 ix, p->pid, ((p->comm != NULL) ? p->comm : "(Null)"), p->num_threads,
+-                p->CPUs_used, p->MBs_used, p->data_time_stamp, p->bind_time_stamp);
++                p->CPUs_used, p->MBs_used, p->MBs_size, p->data_time_stamp, p->bind_time_stamp);
++            // FIXME: make this dump every field
++        }
++    }
++}
++
++void remove_obsolete_cpuset_if_no_tasks(int pid) {
++    // PID parameter has already been checked via kill(0) and seems dead
++    char buf[BUF_SIZE];
++    char fname[FNAME_SIZE];
++    snprintf(fname, FNAME_SIZE, "%s/numad.%d/tasks", cpuset_dir, pid);
++    if ((access(fname, F_OK) == 0) && (read_one_line(buf, BUF_SIZE, fname) <= 1)) {  // tasks file exists but holds at most one byte (or could not be read)
++        snprintf(fname, FNAME_SIZE, "%s/numad.%d", cpuset_dir, pid);
++        numad_log(LOG_NOTICE, "Removing obsolete cpuset: %s\n", fname);
++        if (rmdir(fname) < 0) {  // Failure is logged only; the caller is not told
++            numad_log(LOG_ERR, "bad cpuset rmdir\n");
+         }
+     }
+ }
+
+ void process_hash_table_cleanup(uint64_t update_time) {
+-    int cpusets_removed = 0;
+     int num_hash_entries_used = 0;
+     for (int ix = 0;  (ix < process_hash_table_size);  ix++) {
+         process_data_p p = &process_hash_table[ix];
+@@ -562,40 +655,56 @@ void process_hash_table_cleanup(uint64_t
+                 p->data_time_stamp = 0;
+                 p->CPUs_used = 0;
+                 // Check for dead pids and remove them...
+-                char fname[FNAME_SIZE];
+-                snprintf(fname, FNAME_SIZE, "/proc/%d", p->pid);
+-                if (access(fname, F_OK) < 0) {
++                if ((kill(p->pid, 0) == -1) && (errno == ESRCH)) {
+                     // Seems dead.  Forget this pid -- after first checking
+                     // and removing obsolete numad.PID cpuset directories.
+-                    snprintf(fname, FNAME_SIZE, "%s/numad.%d", cpuset_dir, p->pid);
+-                    if (access(fname, F_OK) == 0) {
+-                        numad_log(LOG_NOTICE, "Removing obsolete cpuset: %s\n", fname);
+-                        int rc = rmdir(fname);
+-                        if (rc >= 0) {
+-                            cpusets_removed += 1;
+-                        } else {
+-                            numad_log(LOG_ERR, "bad cpuset rmdir\n");
+-                            // exit(EXIT_FAILURE);
+-                        }
+-                    }
++                    remove_obsolete_cpuset_if_no_tasks(p->pid);
+                     process_hash_remove(p->pid);
+                     num_hash_entries_used -= 1;
+                 }
+             }
+         }
+     }
+-    if (cpusets_removed > 0) {
+-        // Expire all the duplicate bind counts so things will be re-evaluated sooner.
+-        for (int ix = 0;  (ix < process_hash_table_size);  ix++) {
+-            process_hash_table[ix].dup_bind_count = 0;
+-        }
+-    }
+     // Keep hash table approximately half empty
+     if ((num_hash_entries_used * 7) / 4 > process_hash_table_size) {
+         process_hash_table_expand();
+     }
+ }
+
++static int name_starts_with_numad(const struct dirent *dptr) {  // scandir() filter: nonzero when the name begins with "numad."
++    return (strncmp(dptr->d_name, "numad.", 6) == 0);
++}
++
++void *clean_obsolete_cpusets(void *arg) {
++    // Periodic cleanup loop that never returns; arg is not used.
++    for (;;) {
++        // Loop here forever (slowly) cleaning obsolete cpusets
++        sleep(571);  // Arbitrary number a little less than ten minutes
++        struct dirent **namelist;
++        int files = scandir(cpuset_dir, &namelist, name_starts_with_numad, NULL);
++        if (files < 0) {
++            numad_log(LOG_ERR, "Troubled scanning for obsolete cpusets\n");
++            continue;  // Try again after the next sleep
++        }
++        for (int ix = 0;  (ix < files);  ix++) {
++            char *p = &(namelist[ix]->d_name[6]);  // Skip the 6-character "numad." prefix
++            if (isdigit(*p)) {
++                int pid;
++                CONVERT_DIGITS_TO_NUM(p, pid);
++                // If it seems like a valid PID -- that is NOT in the hash
++                // table -- and the process appears to be dead, then try to
++                // delete the cpuset directory.  (Dead PIDs we know about in
++                // the hash table will be cleaned separately.)
++                if ((pid > 10) && (process_hash_lookup(pid) < 0)
++                    && (kill(pid, 0) == -1) && (errno == ESRCH)) {
++                    remove_obsolete_cpuset_if_no_tasks(pid);
++                }
++            }
++            free(namelist[ix]);  // scandir() allocated each entry
++        }
++        free(namelist);
++    }
++}
+
+
+ typedef struct pid_list {
+@@ -610,9 +719,7 @@ pid_list_p insert_pid_into_pid_list(pid_
+     if (process_hash_table != NULL) {
+         int hash_ix = process_hash_lookup(pid);
+         if ((hash_ix >= 0) && (list_ptr == include_pid_list)) {
+-            // Clear dup_bind_count and interleaved flag,
+-            // in case user wants it to be re-evaluated soon
+-            process_hash_table[hash_ix].dup_bind_count = 0;
++            // Clear interleaved flag, in case user wants it to be re-evaluated
+             process_hash_table[hash_ix].flags &= ~PROCESS_FLAG_INTERLEAVED;
+         }
+     }
+@@ -678,17 +785,22 @@ void print_version_and_exit(char *prog_n
+
+ void print_usage_and_exit(char *prog_name) {
+     fprintf(stderr, "Usage: %s <options> ...\n", prog_name);
++    fprintf(stderr, "-C 1  to count inactive file cache as available memory (default 1)\n");
++    fprintf(stderr, "-C 0  to count inactive file cache memory as unavailable (default 1)\n");
+     fprintf(stderr, "-d for debug logging (same effect as '-l 7')\n");
+     fprintf(stderr, "-D <CGROUP_MOUNT_POINT> to specify cgroup mount point\n");
+     fprintf(stderr, "-h to print this usage info\n");
++    fprintf(stderr, "-H <N> to set THP scan_sleep_ms (default 1000)\n");
+     fprintf(stderr, "-i [<MIN>:]<MAX> to specify interval seconds\n");
+-    fprintf(stderr, "-K 1  to keep interleaved memory spread across nodes\n");
+-    fprintf(stderr, "-K 0  to merge interleaved memory to local NUMA nodes\n");
+-    fprintf(stderr, "-l <N> to specify logging level (usually 5, 6, or 7)\n");
++    fprintf(stderr, "-K 1  to keep interleaved memory spread across nodes (default 0)\n");
++    fprintf(stderr, "-K 0  to merge interleaved memory to local NUMA nodes (default 0)\n");
++    fprintf(stderr, "-l <N> to specify logging level (usually 5, 6, or 7 -- default 5)\n");
+     fprintf(stderr, "-p <PID> to add PID to inclusion pid list\n");
+     fprintf(stderr, "-r <PID> to remove PID from explicit pid lists\n");
+-    fprintf(stderr, "-S 1  to scan all processes\n");
+-    fprintf(stderr, "-S 0  to scan only explicit PID list processes\n");
++    fprintf(stderr, "-R <CPU_LIST> to reserve some CPUs for non-numad use\n");
++    fprintf(stderr, "-S 1  to scan all processes (default 1)\n");
++    fprintf(stderr, "-S 0  to scan only explicit PID list processes (default 1)\n");
++    fprintf(stderr, "-t <N> to specify thread / logical CPU percent (default 20)\n");
+     fprintf(stderr, "-u <N> to specify target utilization percent (default 85)\n");
+     fprintf(stderr, "-v for verbose  (same effect as '-l 6')\n");
+     fprintf(stderr, "-V to show version info\n");
+@@ -698,6 +810,32 @@ void print_usage_and_exit(char *prog_nam
+ }
+
+
++void set_thp_scan_sleep_ms(int new_ms) {  // Write new_ms to the khugepaged scan_sleep_millisecs tunable if it differs
++    if (new_ms < 1) {
++        // 0 means do not change the system default
++        return;
++    }
++    char *thp_scan_fname = "/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs";
++    int fd = open(thp_scan_fname, O_RDWR, 0);
++    if (fd >= 0) {  // Silently do nothing when the tunable cannot be opened
++        char buf[BUF_SIZE];
++        int bytes = read(fd, buf, BUF_SIZE);
++        if (bytes > 0) {
++            int cur_ms;
++            char *p = buf;
++            CONVERT_DIGITS_TO_NUM(p, cur_ms);  // Parse the current setting from the file contents
++            if (cur_ms != new_ms) {
++                lseek(fd, 0, SEEK_SET);  // Rewind so the new value is written from the start
++                numad_log(LOG_NOTICE, "Changing THP scan time in %s from %d to %d ms.\n", thp_scan_fname, cur_ms, new_ms);
++                sprintf(buf, "%d\n", new_ms);
++                write(fd, buf, strlen(buf));  // Result is not checked
++            }
++        }
++        close(fd);
++    }
++}
++
++
+ void check_prereqs(char *prog_name) {
+     // Verify cpusets are available on this system.
+     char **dir = &cpuset_dir_list[0];
+@@ -730,30 +868,8 @@ void check_prereqs(char *prog_name) {
+         fprintf(stderr, "\n");
+         exit(EXIT_FAILURE);
+     }
+-    // Check on THP scan sleep time.
+-    char *thp_scan_fname = "/sys/kernel/mm/redhat_transparent_hugepage/khugepaged/scan_sleep_millisecs";
+-    int fd = open(thp_scan_fname, O_RDONLY, 0);
+-    if (fd >= 0) {
+-        int ms;
+-        char buf[BUF_SIZE];
+-        int bytes = read(fd, buf, BUF_SIZE);
+-        close(fd);
+-        if (bytes > 0) {
+-            char *p = buf;
+-            CONVERT_DIGITS_TO_NUM(p, ms);
+-            if (ms > 150) {
+-                fprintf(stderr, "\n");
+-                numad_log(LOG_NOTICE, "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms);
+-                fprintf(stderr,       "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms);
+-                fprintf(stderr, "Consider increasing the frequency of THP scanning,\n");
+-                fprintf(stderr, "by echoing a smaller number (e.g. 100) to %s\n", thp_scan_fname);
+-                fprintf(stderr, "to more aggressively (re)construct THPs.  For example:\n");
+-                fprintf(stderr, "# echo 100 > /sys/kernel/mm/redhat_transparent_hugepage/khugepaged/scan_sleep_millisecs\n");
+-                fprintf(stderr, "\n");
+-            }
+-        }
+-    }
+-    // FIXME: ?? check for enabled ksmd, and recommend disabling ksm?
++    // Adjust kernel tunable to scan for THP more frequently...
++    set_thp_scan_sleep_ms(thp_scan_sleep_ms);
+ }
+
+
+@@ -831,6 +947,43 @@ fail_numad_run_file:
+ }
+
+
++int count_set_bits_in_hex_list_file(char *fname) {  // Sum the set bits of every hex digit read from fname
++    int sum = 0;  // Stays 0 when the file cannot be opened
++    int fd = open(fname, O_RDONLY, 0);
++    if (fd >= 0) {
++        char buf[BUF_SIZE];
++        int bytes = read(fd, buf, BUF_SIZE);  // Negative on error, so the loop below is skipped
++        close(fd);
++        for (int ix = 0;  (ix < bytes);  ix++) {
++            char c = tolower(buf[ix]);
++            switch (c) {  // Each case adds the number of 1 bits in that hex digit
++                case '0'  : sum += 0; break;
++                case '1'  : sum += 1; break;
++                case '2'  : sum += 1; break;
++                case '3'  : sum += 2; break;
++                case '4'  : sum += 1; break;
++                case '5'  : sum += 2; break;
++                case '6'  : sum += 2; break;
++                case '7'  : sum += 3; break;
++                case '8'  : sum += 1; break;
++                case '9'  : sum += 2; break;
++                case 'a'  : sum += 2; break;
++                case 'b'  : sum += 3; break;
++                case 'c'  : sum += 2; break;
++                case 'd'  : sum += 3; break;
++                case 'e'  : sum += 3; break;
++                case 'f'  : sum += 4; break;
++                case ' '  : sum += 0; break;  // Separators contribute no bits
++                case ','  : sum += 0; break;
++                case '\n' : sum += 0; break;
++                default : numad_log(LOG_CRIT, "Unexpected character in list\n"); exit(EXIT_FAILURE);
++            }
++        }
++    }
++    return sum;
++}
++
++
+ int get_num_cpus() {
+     int n1 = sysconf(_SC_NPROCESSORS_CONF);
+     int n2 = sysconf(_SC_NPROCESSORS_ONLN);
+@@ -916,129 +1069,244 @@ static int name_starts_with_digit(const
+ }
+
+
+-int bind_process_and_migrate_memory(int pid, char *cpuset_name, id_list_p node_list_p, id_list_p cpu_list_p) {
+-    // Check basic parameter validity.
+-    if (pid <= 0) {
++int write_to_cpuset_file(char *fname, char *s) {  // Write string s to fname; returns 0 on success, -1 on failure
++    int fd = open(fname, O_WRONLY | O_TRUNC, 0);
++    if (fd == -1) {
++        numad_log(LOG_CRIT, "Could not open %s -- errno: %d\n", fname, errno);
++        return -1;
++    }
++    numad_log(LOG_DEBUG, "Writing %s to: %s\n", s, fname);
++    int rc = (write(fd, s, strlen(s)) <= 0) ? -1 : 0;
++    if (rc < 0) {  // Log before close() so errno still belongs to write()
++        numad_log(LOG_CRIT, "Could not write %s to %s -- errno: %d\n", s, fname, errno);
++    }
++    close(fd);  // Always release the descriptor, even when the write failed
++    return rc;
++}
++
++int configure_cpuset(char *cpuset_name, char *node_list_str, char *cpu_list_str) {  // Returns 0, or minus the number of failed writes
++    int rc = 0;  // Accumulates the 0 / -1 results of the three writes below
++    char fname[FNAME_SIZE];
++    // Write "1" out to cpuset.memory_migrate file
++    snprintf(fname, FNAME_SIZE, "%s/cpuset.memory_migrate", cpuset_name);
++    rc += write_to_cpuset_file(fname, "1");
++    // For memory binding, write node IDs out to cpuset.mems file
++    snprintf(fname, FNAME_SIZE, "%s/cpuset.mems", cpuset_name);
++    rc += write_to_cpuset_file(fname, node_list_str);
++    // For CPU binding, write CPU IDs out to cpuset.cpus file
++    snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", cpuset_name);
++    rc += write_to_cpuset_file(fname, cpu_list_str);
++    return rc;
++}
++
++int bind_process_and_migrate_memory(process_data_p p) {
++    char buf[BUF_SIZE];
++    char fname[FNAME_SIZE];
++    char pid_cpuset_name[FNAME_SIZE];
++    uint64_t t0 = get_time_stamp();
++    // Parameter p is a pointer to an element in the hash table
++    if ((!p) || (p->pid < 1)) {
+         numad_log(LOG_CRIT, "Bad PID to bind\n");
+         exit(EXIT_FAILURE);
+     }
+-    if ((cpuset_name == NULL) || (strlen(cpuset_name) == 0)) {
+-        numad_log(LOG_CRIT, "Bad cpuset name to bind\n");
++    if (!p->node_list_p) {
++        numad_log(LOG_CRIT, "Cannot bind to unspecified node(s)\n");
+         exit(EXIT_FAILURE);
+     }
+-    int nodes;
+-    if ((node_list_p == NULL) || ((nodes = NUM_IDS_IN_LIST(node_list_p)) == 0)) {
+-        numad_log(LOG_CRIT, "Cannot bind to unspecified node\n");
+-        exit(EXIT_FAILURE);
++    // Get cpuset name for this PID, or make a new cpuset if necessary
++    snprintf(fname, FNAME_SIZE, "/proc/%d/cpuset", p->pid);
++    if (read_one_line(buf, BUF_SIZE, fname) <= 0) {
++        numad_log(LOG_WARNING, "Could not get cpuset of PID %d.\n", p->pid);
++        return 0;  // Assume the process terminated
+     }
+-    // Cpu_list_p is optional and may be NULL...
+-    // Generate CPU id list from the specified node list if necessary
+-    if (cpu_list_p == NULL) {
+-        static id_list_p tmp_cpu_list_p;
+-        CLEAR_LIST(tmp_cpu_list_p);
+-        int node_id = 0;
+-        while (nodes) {
+-            if (ID_IS_IN_LIST(node_id, node_list_p)) {
+-                OR_LISTS(tmp_cpu_list_p, tmp_cpu_list_p, node[node_id].cpu_list_p);
+-                nodes -= 1;
+-            }
+-            node_id += 1;
+-        }
+-        cpu_list_p = tmp_cpu_list_p;
+-    }
+-    // Make the cpuset directory if necessary
+-    char cpuset_name_buf[FNAME_SIZE];
+-    snprintf(cpuset_name_buf, FNAME_SIZE, "%s%s", cpuset_dir, cpuset_name);
+-    char *p = &cpuset_name_buf[strlen(cpuset_dir)];
+-    if (!strcmp(p, "/")) {
+-        // Make a cpuset directory for this process
+-        snprintf(cpuset_name_buf, FNAME_SIZE, "%s/numad.%d", cpuset_dir, pid);
+-        numad_log(LOG_NOTICE, "Making new cpuset: %s\n", cpuset_name_buf);
+-        int rc = mkdir(cpuset_name_buf, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+-        if (rc == -1) {
++    if (!strcmp(buf, "/")) {
++        // Default cpuset name, so make a new cpuset directory for this PID
++        snprintf(pid_cpuset_name, FNAME_SIZE, "%s/numad.%d", cpuset_dir, p->pid);
++        numad_log(LOG_NOTICE, "Making new cpuset: %s\n", pid_cpuset_name);
++        if (mkdir(pid_cpuset_name, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH) < 0) {
+             numad_log(LOG_CRIT, "Bad cpuset mkdir -- errno: %d\n", errno);
+             return 0;
+         }
++        // Temporarily enable all CPUs for a new cpuset...
++        char all_cpus_list_buf[BUF_SIZE];
++        str_from_id_list(all_cpus_list_buf, BUF_SIZE, all_cpus_list_p);
++        // Write CPU IDs out to cpuset.cpus file for CPU binding of main PID
++        snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", pid_cpuset_name);
++        if (write_to_cpuset_file(fname, all_cpus_list_buf) < 0) {
++            numad_log(LOG_CRIT, "Could not configure cpuset.cpus: %s\n", pid_cpuset_name);
++            return 0;  // Assume the process terminated
++        }
++    } else {
++        // Save the existing nondefault cpuset name for this PID
++        snprintf(pid_cpuset_name, FNAME_SIZE, "%s%s", cpuset_dir, buf);
+     }
+-    cpuset_name = cpuset_name_buf;
+-    // Now that we have a cpuset for pid and a populated cpulist,
+-    // start the actual binding and migration.
+-    uint64_t t0 = get_time_stamp();
+-
++    // Configure the main PID cpuset with desired nodes and memory migrate
++    // flag.  Defer the CPU binding for the main PID until after the PID is
++    // actually written to the task file and the memory has been moved.
++    char node_list_buf[BUF_SIZE];
++    str_from_id_list(node_list_buf, BUF_SIZE, p->node_list_p);
+     // Write "1" out to cpuset.memory_migrate file
+-    char fname[FNAME_SIZE];
+-    snprintf(fname, FNAME_SIZE, "%s/cpuset.memory_migrate", cpuset_name);
++    snprintf(fname, FNAME_SIZE, "%s/cpuset.memory_migrate", pid_cpuset_name);
++    if (write_to_cpuset_file(fname, "1") < 0) {
++        numad_log(LOG_CRIT, "Could not configure cpuset: %s\n", pid_cpuset_name);
++        return 0;  // Assume the process terminated
++    }
++    // For memory binding, write node IDs out to cpuset.mems file
++    snprintf(fname, FNAME_SIZE, "%s/cpuset.mems", pid_cpuset_name);
++    if (write_to_cpuset_file(fname, node_list_buf) < 0) {
++        numad_log(LOG_CRIT, "Could not configure cpuset: %s\n", pid_cpuset_name);
++        return 0;  // Assume the process terminated
++    }
++    // Open the main PID cpuset tasks file and
++    // bind the main PID in the main cpuset now.
++    snprintf(fname, FNAME_SIZE, "%s/tasks", pid_cpuset_name);
+     int fd = open(fname, O_WRONLY | O_TRUNC, 0);
+-    if (fd == -1) {
+-        numad_log(LOG_CRIT, "Could not open cpuset.memory_migrate -- errno: %d\n", errno);
+-        return 0;
++    if (fd < 0) {
++        numad_log(LOG_CRIT, "Could not open %s -- errno: %d\n", fname, errno);
++        return 0;  // Assume the process terminated
+     }
+-    write(fd, "1", 1);
+-    close(fd);
+-
+-    // Write node IDs out to cpuset.mems file
+-    char node_list_buf[BUF_SIZE];
+-    snprintf(fname, FNAME_SIZE, "%s/cpuset.mems", cpuset_name);
+-    fd = open(fname, O_WRONLY | O_TRUNC, 0);
+-    if (fd == -1) {
+-        numad_log(LOG_CRIT, "Could not open cpuset.mems -- errno: %d\n", errno);
+-        return 0;
++    numad_log(LOG_NOTICE, "Including PID: %d in cpuset: %s\n", p->pid, pid_cpuset_name);
++    char pid_str[FNAME_SIZE];
++    snprintf(pid_str, FNAME_SIZE, "%d", p->pid);
++    if (write(fd, pid_str, strlen(pid_str)) <= 0) {
++        numad_log(LOG_CRIT, "Could not write %s to cpuset: %s -- errno: %d\n", pid_str, pid_cpuset_name, errno);
++        close(fd);
++        return 0;  // Assume the process terminated
+     }
+-    int len = str_from_id_list(node_list_buf, BUF_SIZE, node_list_p);
+-    write(fd, node_list_buf, len);
+-    close(fd);
+-
+-    // Write CPU IDs out to cpuset.cpus file
+-    char cpu_list_buf[BUF_SIZE];
+-    snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", cpuset_name);
+-    fd = open(fname, O_WRONLY | O_TRUNC, 0);
+-    if (fd == -1) {
+-        numad_log(LOG_CRIT, "Could not open cpuset.cpus -- errno: %d\n", errno);
+-        return 0;
++    // Generate CPU binding list derived from node bind list.
++    static id_list_p cpu_bind_list_p;
++    CLEAR_CPU_LIST(cpu_bind_list_p);
++    int nodes = NUM_IDS_IN_LIST(p->node_list_p);
++    int node_id = 0;
++    while (nodes) {
++        if (ID_IS_IN_LIST(node_id, p->node_list_p)) {
++            OR_LISTS(cpu_bind_list_p, cpu_bind_list_p, node[node_id].cpu_list_p);
++            nodes -= 1;
++        }
++        node_id += 1;
+     }
+-    len = str_from_id_list(cpu_list_buf, BUF_SIZE, cpu_list_p);
+-    write(fd, cpu_list_buf, len);
+-    close(fd);
+-
+-    // Copy pid tasks one at a time to tasks file
+-    snprintf(fname, FNAME_SIZE, "%s/tasks", cpuset_name);
+-    fd = open(fname, O_WRONLY | O_TRUNC, 0);
+-    if (fd == -1) {
+-        numad_log(LOG_CRIT, "Could not open tasks -- errno: %d\n", errno);
+-        return 0;
++    char cpu_bind_list_buf[BUF_SIZE];
++    str_from_id_list(cpu_bind_list_buf, BUF_SIZE, cpu_bind_list_p);
++    // Write CPU IDs out to cpuset.cpus file for CPU binding of main PID
++    snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", pid_cpuset_name);
++    if (write_to_cpuset_file(fname, cpu_bind_list_buf) < 0) {
++        numad_log(LOG_CRIT, "Could not configure cpuset: %s\n", pid_cpuset_name);
++        return 0;  // Assume the process terminated
+     }
+-    snprintf(fname, FNAME_SIZE, "/proc/%d/task", pid);
++    // Leave fd open in case process is multithreaded and we need to write more
++    // (sub) task IDs there.  In case multithreaded, make sure all the subtasks
++    // for this PID are in a cpuset.  If not already in cpuset, put them in the
++    // main cpuset.  Start by getting the name list of all tasks for this PID.
+     struct dirent **namelist;
+-    int files = scandir(fname, &namelist, name_starts_with_digit, NULL);
+-    if (files < 0) {
+-        numad_log(LOG_WARNING, "Could not scandir task list\n");
++    snprintf(fname, FNAME_SIZE, "/proc/%d/task", p->pid);
++    int num_tasks = scandir(fname, &namelist, name_starts_with_digit, NULL);
++    if (num_tasks <= 0) {
++        numad_log(LOG_WARNING, "Could not scandir task list for PID: %d\n", p->pid);
++        close(fd);
+         return 0;  // Assume the process terminated
+     }
+-    for (int ix = 0;  (ix < files);  ix++) {
+-        // copy pid tasks, one at a time
+-        numad_log(LOG_NOTICE, "Including task: %s\n", namelist[ix]->d_name);
+-        write(fd, namelist[ix]->d_name, strlen(namelist[ix]->d_name));
+-        free(namelist[ix]);
++    if (num_tasks == 1) {
++        // This is the normal nonthreaded case.  No sub tasks -- only the
++        // single main PID task, which is already bound above...
++        free(namelist[0]);
++    } else {
++        // Multithreaded so check all of the multiple subtasks. Avoid redundant
++        // subtask cpuset configuration by keeping a list of unique cpusets as
++        // we check each subtask.  If the subtasks have only default cpuset
++        // names, bind those subtasks into the main cpuset with the main PID
++        // instead of adding them to the list.  (cpuset_list is static so we
++        // can reuse the allocated array of pointers.)
++        int num_names = 0;
++        static char **cpuset_list;
++        static int cpuset_list_size;
++        for (int ix = 0;  (ix < num_tasks);  ix++) {
++            // Check the cpuset name for each task
++            if (!strcmp(namelist[ix]->d_name, pid_str)) {
++                // This is the main PID task, which is already bound above. Skip it here.
++                free(namelist[ix]);
++                continue;
++            }
++            snprintf(fname, FNAME_SIZE, "/proc/%d/task/%s/cpuset", p->pid, namelist[ix]->d_name);
++            if (read_one_line(buf, BUF_SIZE, fname) <= 0) {
++                numad_log(LOG_WARNING, "Could not open %s. Assuming thread completed.\n", fname);
++                free(namelist[ix]);
++                continue;
++            }
++            if (strcmp(buf, "/")) {
++                // Subtask already has a nondefault cpuset name.  Add this
++                // subtask cpuset name to the list of unique cpuset names.  Do
++                // sequential search comparisons first to verify uniqueness.
++                snprintf(fname, FNAME_SIZE, "%s%s", cpuset_dir, buf);
++                int iy = 0;
++                while (iy < num_names) {
++                    if (!strcmp(fname, cpuset_list[iy])) {
++                        break;  // because we already have this cpuset name in the list
++                    }
++                    iy += 1;
++                }
++                if (iy == num_names) {
++                    // We got to the end of the cpulist, so this is a new cpuset name not yet in the list
++                    if (num_names == cpuset_list_size) {
++                        if (cpuset_list_size == 0) {
++                            cpuset_list_size = 10;
++                        } else {
++                            cpuset_list_size *= 2;
++                        }
++                        cpuset_list = realloc(cpuset_list, (cpuset_list_size * sizeof(char *)));
++                        if (cpuset_list == NULL) {
++                            numad_log(LOG_CRIT, "realloc failed\n");
++                            exit(EXIT_FAILURE);
++                        }
++                    }
++                    // Configure this subtask cpuset and, if successful, save a
++                    // copy of the name in the unique cpuset list.
++                    if (configure_cpuset(fname, node_list_buf, cpu_bind_list_buf) < 0) {
++                        numad_log(LOG_WARNING, "Could not configure cpuset %s. Assuming thread completed.\n", fname);
++                        free(namelist[ix]);
++                        continue;
++                    } else {
++                        cpuset_list[num_names++] = strdup(fname);
++                    }
++                }
++            } else {
++                // This task ID has the default cpuset name.  Just add this task ID to the main PID cpuset.
++                numad_log(LOG_NOTICE, "Including task: %s in cpuset: %s\n", namelist[ix]->d_name, pid_cpuset_name);
++                if (write(fd, namelist[ix]->d_name, strlen(namelist[ix]->d_name)) <= 0) {
++                    numad_log(LOG_WARNING, "Could not write to cpuset: %s -- errno: %d\n", pid_cpuset_name, errno);
++                    free(namelist[ix]);
++                    continue;  // Assuming thread completed.
++                }
++            }
++            free(namelist[ix]);
++        }
++        // Done with subtask unique cpuset names for this PID.  Free them.
++        for (int ix = 0;  (ix < num_names);  ix++) {
++            free(cpuset_list[ix]);
++        }
+     }
+     free(namelist);
+     close(fd);
+-
+-    uint64_t t1 = get_time_stamp();
+     // Check pid still active
+-    snprintf(fname, FNAME_SIZE, "/proc/%d", pid);
++    snprintf(fname, FNAME_SIZE, "/proc/%d", p->pid);
+     if (access(fname, F_OK) < 0) {
+-        numad_log(LOG_WARNING, "Could not migrate pid\n");
+-        return 0;  // Assume the process terminated
++        numad_log(LOG_WARNING, "Could not migrate pid %d\n", p->pid);
++        return 0;
++    } else {
++        uint64_t t1 = get_time_stamp();
++        p->bind_time_stamp = t1;
++        numad_log(LOG_NOTICE, "PID %d moved to node(s) %s in %d.%d seconds\n", p->pid, node_list_buf, (t1-t0)/100, (t1-t0)%100);
++        return 1;
+     }
+-    numad_log(LOG_NOTICE, "PID %d moved to node(s) %s in %d.%d seconds\n", pid, node_list_buf, (t1-t0)/100, (t1-t0)%100);
+-    return 1;
+ }
+
+
+ void show_nodes() {
+-    time_t ts = time(NULL);
+-    fprintf(log_fs, "%s", ctime(&ts));
+-    fprintf(log_fs, "Nodes: %d\n", num_nodes);
++    fprintf(log_fs, "\n");
++    numad_log(LOG_INFO, "Nodes: %d\n", num_nodes);
++    fprintf(log_fs, "Min CPUs free: %ld, Max CPUs: %ld, Avg CPUs: %ld, StdDev: %lg\n",
++        min_node_CPUs_free, max_node_CPUs_free, avg_node_CPUs_free, stddev_node_CPUs_free);
++    fprintf(log_fs, "Min MBs free: %ld, Max MBs: %ld, Avg MBs: %ld, StdDev: %lg\n",
++        min_node_MBs_free, max_node_MBs_free, avg_node_MBs_free, stddev_node_MBs_free);
+     for (int ix = 0;  (ix < num_nodes);  ix++) {
+         fprintf(log_fs, "Node %d: MBs_total %ld, MBs_free %6ld, CPUs_total %ld, CPUs_free %4ld,  Distance: ",
+             ix, node[ix].MBs_total, node[ix].MBs_free, node[ix].CPUs_total, node[ix].CPUs_free);
+@@ -1049,7 +1317,6 @@ void show_nodes() {
+         str_from_id_list(buf, BUF_SIZE, node[ix].cpu_list_p);
+         fprintf(log_fs, " CPUs: %s\n", buf);
+     }
+-    fprintf(log_fs, "\n");
+     fflush(log_fs);
+ }
+
+@@ -1065,7 +1332,7 @@ int cur_cpu_data_buf = 0;
+
+ void update_cpu_data() {
+     // Parse idle percents from CPU stats in /proc/stat cpu<N> lines
+-    static FILE *fs = NULL;
++    static FILE *fs;
+     if (fs != NULL) {
+         rewind(fs);
+     } else {
+@@ -1107,7 +1374,8 @@ void update_cpu_data() {
+             while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; }  // skip nice
+             while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; }  // skip system
+             while (!isdigit(*p)) { p++; }
+-            uint64_t idle = *p++ - '0'; while (isdigit(*p)) { idle *= 10; idle += (*p++ - '0'); }
++            uint64_t idle;
++            CONVERT_DIGITS_TO_NUM(p, idle);
+             cpu_data_buf[new].idle[cpu_id] = idle;
+         }
+     }
+@@ -1129,10 +1397,6 @@ int node_and_digits(const struct dirent
+ }
+
+
+-id_list_p all_cpus_list_p = NULL;
+-id_list_p all_nodes_list_p = NULL;
+-uint64_t node_info_time_stamp = 0;
+-
+
+ int update_nodes() {
+     char fname[FNAME_SIZE];
+@@ -1141,6 +1405,7 @@ int update_nodes() {
+     uint64_t time_stamp = get_time_stamp();
+ #define STATIC_NODE_INFO_DELAY (600 * ONE_HUNDRED)
+     if ((num_nodes == 0) || (node_info_time_stamp + STATIC_NODE_INFO_DELAY < time_stamp)) {
++        node_info_time_stamp = time_stamp;
+         // Count directory names of the form: /sys/devices/system/node/node<N>
+         struct dirent **namelist;
+         int num_files = scandir ("/sys/devices/system/node", &namelist, node_and_digits, NULL);
+@@ -1167,8 +1432,15 @@ int update_nodes() {
+             }
+             num_nodes = num_files;
+         }
+-        CLEAR_LIST(all_cpus_list_p);
+-        CLEAR_LIST(all_nodes_list_p);
++        sum_CPUs_total = 0;
++        CLEAR_CPU_LIST(all_cpus_list_p);
++        CLEAR_NODE_LIST(all_nodes_list_p);
++        // Figure out how many threads per core there are (for later discounting of hyper-threads)
++        threads_per_core = count_set_bits_in_hex_list_file("/sys/devices/system/cpu/cpu0/topology/thread_siblings");
++        if (threads_per_core < 1) {
++            numad_log(LOG_CRIT, "Could not count threads per core\n");
++            exit(EXIT_FAILURE);
++        }
+         // For each "node<N>" filename present, save <N> in node[ix].node_id
+         // Note that the node id might not necessarily match the node ix.
+         // Also populate the cpu lists and distance vectors for this node.
+@@ -1185,10 +1457,22 @@ int update_nodes() {
+             int fd = open(fname, O_RDONLY, 0);
+             if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) {
+                 // get cpulist from the cpulist string
+-                CLEAR_LIST(node[node_ix].cpu_list_p);
++                CLEAR_CPU_LIST(node[node_ix].cpu_list_p);
+                 int n = add_ids_to_list_from_str(node[node_ix].cpu_list_p, buf);
++                if (reserved_cpu_str != NULL) {
++                    AND_LISTS(node[node_ix].cpu_list_p, node[node_ix].cpu_list_p, reserved_cpu_mask_list_p);
++                    n = NUM_IDS_IN_LIST(node[node_ix].cpu_list_p);
++                }
+                 OR_LISTS(all_cpus_list_p, all_cpus_list_p, node[node_ix].cpu_list_p);
+-                node[node_ix].CPUs_total = n * ONE_HUNDRED;
++                // Calculate total CPUs, but possibly discount hyper-threads
++                if ((threads_per_core == 1) || (htt_percent >= 100)) {
++                    node[node_ix].CPUs_total = n * ONE_HUNDRED;
++                } else {
++                    n /= threads_per_core;
++                    node[node_ix].CPUs_total = n * ONE_HUNDRED;
++                    node[node_ix].CPUs_total += n * (threads_per_core - 1) * htt_percent;
++                }
++                sum_CPUs_total += node[node_ix].CPUs_total;
+                 close(fd);
+             } else {
+                 numad_log(LOG_CRIT, "Could not get node cpu list\n");
+@@ -1220,14 +1504,28 @@ int update_nodes() {
+         }
+         free(namelist);
+     }
+-    // Second, get the dynamic free memory and available CPU capacity
++    // Second, update the dynamic free memory and available CPU capacity
++    while (cpu_data_buf[cur_cpu_data_buf].time_stamp + 7 >= time_stamp) {
++        // Make sure at least 7/100 of a second has passed.
++        // Otherwise sleep for 1/10 second.
++	struct timespec ts = { 0, 100000000 };
++	nanosleep(&ts, &ts);
++	time_stamp = get_time_stamp();
++    }
+     update_cpu_data();
++    max_node_MBs_free = 0;
++    max_node_CPUs_free = 0;
++    min_node_MBs_free = MAXINT;
++    min_node_CPUs_free = MAXINT;
++    uint64_t sum_of_node_MBs_free = 0;
++    uint64_t sum_of_node_CPUs_free = 0;
+     for (int node_ix = 0;  (node_ix < num_nodes);  node_ix++) {
+         int node_id = node[node_ix].node_id;
+         // Get available memory info from node<N>/meminfo file
+         snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/meminfo", node_id);
+         int fd = open(fname, O_RDONLY, 0);
+         if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) {
++            close(fd);
+             uint64_t KB;
+             char *p = strstr(buf, "MemTotal:");
+             if (p != NULL) {
+@@ -1238,7 +1536,7 @@ int update_nodes() {
+             }
+             while (!isdigit(*p)) { p++; }
+             CONVERT_DIGITS_TO_NUM(p, KB);
+-            node[node_ix].MBs_total = KB / KILOBYTE;
++            node[node_ix].MBs_total = (KB / KILOBYTE);
+             p = strstr(p, "MemFree:");
+             if (p != NULL) {
+                 p += 8;
+@@ -1248,8 +1546,27 @@ int update_nodes() {
+             }
+             while (!isdigit(*p)) { p++; }
+             CONVERT_DIGITS_TO_NUM(p, KB);
+-            node[node_ix].MBs_free = KB / KILOBYTE;
+-            close(fd);
++            node[node_ix].MBs_free = (KB / KILOBYTE);
++            if (use_inactive_file_cache) {
++                // Add inactive file cache quantity to "free" memory
++                p = strstr(p, "Inactive(file):");
++                if (p != NULL) {
++                    p += 15;
++                } else {
++                    numad_log(LOG_CRIT, "Could not get node Inactive(file)\n");
++                    exit(EXIT_FAILURE);
++                }
++                while (!isdigit(*p)) { p++; }
++                CONVERT_DIGITS_TO_NUM(p, KB);
++                node[node_ix].MBs_free += (KB / KILOBYTE);
++            }
++            sum_of_node_MBs_free += node[node_ix].MBs_free;
++            if (min_node_MBs_free > node[node_ix].MBs_free) {
++                min_node_MBs_free = node[node_ix].MBs_free;
++            }
++            if (max_node_MBs_free < node[node_ix].MBs_free) {
++                max_node_MBs_free = node[node_ix].MBs_free;
++            }
+         } else {
+             numad_log(LOG_CRIT, "Could not get node meminfo\n");
+             exit(EXIT_FAILURE);
+@@ -1260,7 +1577,8 @@ int update_nodes() {
+         if (cpu_data_buf[old_cpu_data_buf].time_stamp > 0) {
+             uint64_t idle_ticks = 0;
+             int cpu = 0;
+-            int num_cpus_to_process = node[node_ix].CPUs_total / ONE_HUNDRED;
++            int num_lcpus = NUM_IDS_IN_LIST(node[node_ix].cpu_list_p);
++            int num_cpus_to_process = num_lcpus;
+             while (num_cpus_to_process) {
+                 if (ID_IS_IN_LIST(cpu, node[node_ix].cpu_list_p)) {
+                     idle_ticks += cpu_data_buf[cur_cpu_data_buf].idle[cpu]
+@@ -1274,15 +1592,45 @@ int update_nodes() {
+             // printf("Node: %d   CPUs: %ld   time diff %ld   Idle ticks %ld\n", node_id, node[node_ix].CPUs_total, time_diff, idle_ticks);
+             // assert(time_diff > 0);
+             node[node_ix].CPUs_free = (idle_ticks * ONE_HUNDRED) / time_diff;
++            // Possibly discount hyper-threads
++            if ((threads_per_core > 1) && (htt_percent < 100)) {
++                uint64_t htt_discount = (num_lcpus - (num_lcpus / threads_per_core)) * (100 - htt_percent);
++                if (node[node_ix].CPUs_free > htt_discount) {
++                    node[node_ix].CPUs_free -= htt_discount;
++                } else {
++                    node[node_ix].CPUs_free = 0;
++                }
++            }
+             if (node[node_ix].CPUs_free > node[node_ix].CPUs_total) {
+                 node[node_ix].CPUs_free = node[node_ix].CPUs_total;
+             }
++            sum_of_node_CPUs_free += node[node_ix].CPUs_free;
++            if (min_node_CPUs_free > node[node_ix].CPUs_free) {
++                min_node_CPUs_free = node[node_ix].CPUs_free;
++            }
++            if (max_node_CPUs_free < node[node_ix].CPUs_free) {
++                max_node_CPUs_free = node[node_ix].CPUs_free;
++            }
+             node[node_ix].magnitude = node[node_ix].CPUs_free * node[node_ix].MBs_free;
+         } else {
+             node[node_ix].CPUs_free = 0;
+             node[node_ix].magnitude = 0;
+         }
+     }
++    avg_node_MBs_free = sum_of_node_MBs_free / num_nodes;
++    avg_node_CPUs_free = sum_of_node_CPUs_free / num_nodes;
++    double MBs_variance_sum = 0.0;
++    double CPUs_variance_sum = 0.0;
++    for (int node_ix = 0;  (node_ix < num_nodes);  node_ix++) {
++        double MBs_diff = (double)node[node_ix].MBs_free - (double)avg_node_MBs_free;
++        double CPUs_diff = (double)node[node_ix].CPUs_free - (double)avg_node_CPUs_free;
++        MBs_variance_sum += MBs_diff * MBs_diff;
++        CPUs_variance_sum += CPUs_diff * CPUs_diff;
++    }
++    double MBs_variance = MBs_variance_sum / (num_nodes);
++    double CPUs_variance = CPUs_variance_sum / (num_nodes);
++    stddev_node_MBs_free = sqrt(MBs_variance);
++    stddev_node_CPUs_free = sqrt(CPUs_variance);
+     if (log_level >= LOG_INFO) {
+         show_nodes();
+     }
+@@ -1316,7 +1664,7 @@ typedef struct stat_data {
+     int64_t num_threads;  // 19
+     int64_t itrealvalue;
+     uint64_t starttime;
+-    uint64_t vsize;
++    uint64_t vsize;       // 22
+     int64_t rss;          // 23
+     uint64_t rsslim;
+     uint64_t startcode;
+@@ -1361,10 +1709,11 @@ process_data_p get_stat_data_for_pid(int
+         return NULL;
+     }
+     close(fd);
++    uint64_t val;
+     char *p = buf;
+     static process_data_t data;
+     // Get PID from field 0
+-    uint64_t val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
++    CONVERT_DIGITS_TO_NUM(p, val);
+     data.pid = val;
+     // Copy comm from field 1
+     while (*p == ' ') { p++; }
+@@ -1373,23 +1722,27 @@ process_data_p get_stat_data_for_pid(int
+     // Skip fields 2 through 12
+     for (int ix = 0;  (ix < 11);  ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
+     // Get utime from field 13 for cpu_util
+-    val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
++    CONVERT_DIGITS_TO_NUM(p, val);
+     data.cpu_util = val;
+     // Get stime from field 14 to add on to cpu_util (which already has utime)
+     while (*p == ' ') { p++; }
+-    val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
++    CONVERT_DIGITS_TO_NUM(p, val);
+     data.cpu_util += val;
+     // Skip fields 15 through 18
+     while (*p == ' ') { p++; }
+     for (int ix = 0;  (ix < 4);  ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
+     // Get num_threads from field 19
+-    val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
++    CONVERT_DIGITS_TO_NUM(p, val);
+     data.num_threads = val;
+-    // Skip fields 20 through 22
++    // Skip fields 20 through 21
+     while (*p == ' ') { p++; }
+-    for (int ix = 0;  (ix < 3);  ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
++    for (int ix = 0;  (ix < 2);  ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
++    // Get vsize from field 22 to compute MBs_size
++    CONVERT_DIGITS_TO_NUM(p, val);
++    data.MBs_size = val / MEGABYTE;
+     // Get rss from field 23 to compute MBs_used
+-    val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
++    while (*p == ' ') { p++; }
++    CONVERT_DIGITS_TO_NUM(p, val);
+     data.MBs_used = (val * page_size_in_bytes) / MEGABYTE;
+     // Return pointer to data
+     return &data;
+@@ -1471,20 +1824,79 @@ int update_processes() {
+ }
+
+
++int initialize_mem_node_list(process_data_p p) {
++    // p points at a hash table element; fill p->node_list_p from Mems_allowed_list in /proc/<PID>/status
++    if ((!p) || (p->pid < 1)) {
++        numad_log(LOG_CRIT, "Cannot initialize mem node lists with bad PID\n");
++        exit(EXIT_FAILURE);
++    }
++    char fname[FNAME_SIZE];
++    char buf[BIG_BUF_SIZE];
++    CLEAR_NODE_LIST(p->node_list_p);
++    snprintf(fname, FNAME_SIZE, "/proc/%d/status", p->pid);
++    int fd = open(fname, O_RDONLY, 0);
++    ssize_t len = (fd >= 0) ? read(fd, buf, BIG_BUF_SIZE - 1) : -1;  // Keep one byte free for the NUL
++    if (fd >= 0) { close(fd); }  // Close on every path so a failed read() does not leak the descriptor
++    if (len <= 0) {
++        numad_log(LOG_WARNING, "Tried to research PID %d, but it apparently went away.\n", p->pid);
++        return 0;  // Assume the process terminated
++    }
++    buf[len] = '\0';  // read() does not terminate the buffer, and strstr() needs a C string
++    char *list_str_p = strstr(buf, "Mems_allowed_list:");
++    if (!list_str_p) {
++        numad_log(LOG_CRIT, "Could not get node Mems_allowed_list\n");
++        exit(EXIT_FAILURE);
++    }
++    list_str_p += 18;  // Step over the "Mems_allowed_list:" label
++    while ((*list_str_p) && (!isdigit(*list_str_p))) { list_str_p++; }  // Stop at the NUL if no digit follows
++    int n = add_ids_to_list_from_str(p->node_list_p, list_str_p);
++    if (n < num_nodes) {
++        // If process already bound to a subset of nodes when we discover it,
++        // set initial bind_time_stamp to 30 minutes ago...
++        p->bind_time_stamp = get_time_stamp() - (1800 * ONE_HUNDRED);
++    }
++    return n;  // Value reported by add_ids_to_list_from_str(); compared against num_nodes above
++}
++
++
+
+-id_list_p pick_numa_nodes(int pid, int cpus, int mbs) {
+-    char buf[BUF_SIZE];
+-    char buf2[BUF_SIZE];
++
++uint64_t combined_value_of_weighted_resources(int ix, int mbs, int cpus, uint64_t MBs_free, uint64_t CPUs_free) {
++    // Compute a weighted score from the free memory (MBs_free) and free CPU
++    // (CPUs_free) measured against a request for mbs and cpus.  Each
++    // resource is split into the part covering the request ("needed") and
++    // the remainder ("excess").  ix is used only to label the debug output.
++    int64_t mem_needed = MBs_free;
++    int64_t mem_excess = 0;
++    if (MBs_free > mbs) {
++        // More memory is free than requested: the surplus counts as excess
++        mem_needed = mbs;
++        mem_excess = MBs_free - mbs;
++    }
++    int64_t cpu_needed = CPUs_free;
++    int64_t cpu_excess = 0;
++    if (CPUs_free > cpus) {
++        // More CPU is free than requested: the surplus counts as excess
++        cpu_needed = cpus;
++        cpu_excess = CPUs_free - cpus;
++    }
++    // Weights: needed memory x10, excess memory x3, needed CPU x8, excess CPU x1
++    int64_t mem_score = (mem_needed * 10) + (mem_excess * 3);
++    int64_t cpu_score = (cpu_needed * 8) + cpu_excess;
++    numad_log(LOG_DEBUG, "    Node[%d]: mem: %ld  cpu: %ld\n", ix, mem_score, cpu_score);
++    // The combined value is the product of the two weighted factors
++    return (mem_score * cpu_score);
++}
++
++
++id_list_p pick_numa_nodes(int pid, int cpus, int mbs, int assume_enough_cpus) {
+     if (log_level >= LOG_DEBUG) {
+         numad_log(LOG_DEBUG, "PICK NODES FOR:  PID: %d,  CPUs %d,  MBs %d\n", pid, cpus, mbs);
+     }
+-    int num_existing_mems = 0;
+-    static id_list_p existing_mems_list_p;
+-    CLEAR_LIST(existing_mems_list_p);
+-    uint64_t time_stamp = get_time_stamp();
++    char buf[BUF_SIZE];
++    uint64_t process_CPUs = 0;
+     static node_data_p tmp_node;
+     static uint64_t *process_MBs;
+-    static uint64_t *saved_magnitude_for_node;
+     static int process_MBs_num_nodes;
+     // See if dynamic structures need to grow.
+     if (process_MBs_num_nodes < num_nodes + 1) {
+@@ -1492,121 +1904,25 @@ id_list_p pick_numa_nodes(int pid, int c
+         // The "+1 node" is for accumulating interleaved memory
+         process_MBs = realloc(process_MBs, process_MBs_num_nodes * sizeof(uint64_t));
+         tmp_node = realloc(tmp_node, num_nodes * sizeof(node_data_t) );
+-        saved_magnitude_for_node = realloc(saved_magnitude_for_node, num_nodes * sizeof(uint64_t));
+-        if ((process_MBs == NULL) || (tmp_node == NULL) || (saved_magnitude_for_node == NULL)) {
++        if ((process_MBs == NULL) || (tmp_node == NULL)) {
+             numad_log(LOG_CRIT, "process_MBs realloc failed\n");
+             exit(EXIT_FAILURE);
+         }
+     }
++
+     // For existing processes, get miscellaneous process specific details
+     int pid_ix;
+     process_data_p p = NULL;
+     if ((pid > 0) && ((pid_ix = process_hash_lookup(pid)) >= 0)) {
+         p = &process_hash_table[pid_ix];
+-        // Quick rejection if this process has interleaved memory, but recheck it once an hour...
+-#define MIN_DELAY_FOR_INTERLEAVE (3600 * ONE_HUNDRED)
+-        if (((p->flags & PROCESS_FLAG_INTERLEAVED) > 0)
+-          && (p->bind_time_stamp + MIN_DELAY_FOR_INTERLEAVE > time_stamp)) {
+-            if (log_level >= LOG_DEBUG) {
+-                numad_log(LOG_DEBUG, "Skipping evaluation because of interleaved memory.\n");
+-            }
+-            return NULL;
+-        }
+-        // Get cpuset name for this process, and existing mems binding, if any.
+-        char fname[FNAME_SIZE];
+-        snprintf(fname, FNAME_SIZE, "/proc/%d/cpuset", pid);
+-        FILE *fs = fopen(fname, "r");
+-        if (!fs) {
+-            numad_log(LOG_WARNING, "Tried to research PID %d cpuset, but it apparently went away.\n", p->pid);
+-            return NULL;  // Assume the process terminated?
+-        }
+-        if (!fgets(buf, BUF_SIZE, fs)) {
+-            numad_log(LOG_WARNING, "Tried to research PID %d cpuset, but it apparently went away.\n", p->pid);
+-            fclose(fs);
+-            return NULL;  // Assume the process terminated?
+-        }
+-        fclose(fs);
+-        ELIM_NEW_LINE(buf);
+-        if ((!p->cpuset_name) || (strcmp(p->cpuset_name, buf))) {
+-            if (p->cpuset_name != NULL) {
+-                free(p->cpuset_name);
+-            }
+-            p->cpuset_name = strdup(buf);
+-        }
+-        if (log_level >= LOG_DEBUG) {
+-            numad_log(LOG_DEBUG, "CPUSET_NAME: %s\n", p->cpuset_name);
+-        }
+-        snprintf(fname, FNAME_SIZE, "%s%s/cpuset.mems", cpuset_dir, p->cpuset_name);
+-        fs = fopen(fname, "r");
+-        if ((fs) && (fgets(buf, BUF_SIZE, fs))) {
+-            fclose(fs);
+-            num_existing_mems = add_ids_to_list_from_str(existing_mems_list_p, buf);
+-            if (log_level >= LOG_DEBUG) {
+-                str_from_id_list(buf, BUF_SIZE, existing_mems_list_p);
+-                numad_log(LOG_DEBUG, "EXISTING CPUSET NODE LIST: %s\n", buf);
+-            }
+-        }
+-        // If this process was just recently bound, enforce a minimum delay
+-        // period between repeated attempts to potentially move the memory.
+-        // FIXME: ?? might this retard appropriate process expansion too much?
+-#define MIN_DELAY_FOR_REEVALUATION (30 * ONE_HUNDRED)
+-        if (p->bind_time_stamp + MIN_DELAY_FOR_REEVALUATION > time_stamp) {
+-            // Skip re-evaluation because we just did it recently.
+-            if (log_level >= LOG_DEBUG) {
+-                numad_log(LOG_DEBUG, "Skipping evaluation because done too recently.\n");
+-            }
+-            return NULL;
+-        }
+-        // Look for short cut because of duplicate bindings.  If we have bound
+-        // this process to the same nodes multiple times already, and the load
+-        // on those nodes still seems acceptable, skip the rest of this and
+-        // just return NULL to indicate no change needed.  FIXME: should figure
+-        // out what can change that would make a rebinding desirable (e.g. (1)
+-        // some process gets sub-optimal allocation on busy machine which
+-        // subsequently becomes less busy leaving disadvantaged process. (2)
+-        // node load imbalance, (3) any process split across nodes which should
+-        // fit within a single node.) For now, just expire the dup_bid_count
+-        // occasionally, which is a reasonably good mitigation.
+-        // So, check to see if we should decay the dup_bind_count...
+-#define DUP_BIND_TIME_OUT (300 * ONE_HUNDRED)
+-        if ((p->dup_bind_count > 0) && (p->bind_time_stamp + DUP_BIND_TIME_OUT < time_stamp)) {
+-            p->dup_bind_count -= 1;
+-        }
+-        // Now, look for short cut because of duplicate bindings
+-        if (p->dup_bind_count > 0) {
+-            int node_id = 0;
+-            int nodes_have_cpu = 1;
+-            int nodes_have_ram = 1;
+-            int n = num_existing_mems;
+-            int min_resource_pct = 100 - target_utilization;
+-            if (min_resource_pct < 5) {
+-                min_resource_pct = 5;
+-            }
+-            while (n) {
+-                if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
+-                    nodes_have_cpu &= ((100 * node[node_id].CPUs_free / node[node_id].CPUs_total) >= (min_resource_pct));
+-                    nodes_have_ram &= ((100 * node[node_id].MBs_free  / node[node_id].MBs_total)  >= (min_resource_pct));
+-                    n -= 1;
+-                }
+-                node_id += 1;
+-            }
+-            if ((nodes_have_cpu) && (nodes_have_ram)) {
+-                if (log_level >= LOG_DEBUG) {
+-                    numad_log(LOG_DEBUG, "Skipping evaluation because of repeat binding\n");
+-                }
+-                return NULL;
+-            }
+-            if (log_level >= LOG_DEBUG) {
+-                numad_log(LOG_DEBUG, "Evaluated for skipping by repeat binding, but CPUS: %d, RAM: %d\n", nodes_have_cpu, nodes_have_ram);
+-            }
+-        }
+-        // Fourth, add up per-node memory in use by this process. This scanning
+-        // is expensive and should be minimized.  Also, old kernels dismantle
+-        // transparent huge pages while producing the numa_maps memory
+-        // information!
++        // Correct current CPUs amount for utilization factor inflation
++        process_CPUs = (cpus * target_utilization) / 100;
++        // Add up per-node memory in use by this process.
++        // This scanning is expensive and should be minimized.
+         memset(process_MBs, 0, process_MBs_num_nodes * sizeof(uint64_t));
++        char fname[FNAME_SIZE];
+         snprintf(fname, FNAME_SIZE, "/proc/%d/numa_maps", pid);
+-        fs = fopen(fname, "r");
++        FILE *fs = fopen(fname, "r");
+         if (!fs) {
+             numad_log(LOG_WARNING, "Tried to research PID %d numamaps, but it apparently went away.\n", p->pid);
+             return NULL;  // Assume the process terminated
+@@ -1645,84 +1961,103 @@ id_list_p pick_numa_nodes(int pid, int c
+         fclose(fs);
+         for (int ix = 0;  (ix <= num_nodes);  ix++) {
+             process_MBs[ix] /= MEGABYTE;
+-            if (log_level >= LOG_DEBUG) {
+-                numad_log(LOG_DEBUG, "PROCESS_MBs[%d]: %ld\n", ix, process_MBs[ix]);
++            if (p->bind_time_stamp) {
++                if ((process_MBs[ix]) && (!ID_IS_IN_LIST(ix, p->node_list_p))) {
++                    // FIXME: If process previously bound, but memory appears
++                    // to exist where it should not, this might identify
++                    // processes for which the kernel does not move all the
++                    // memory for whatever reason....  Must check for
++                    // significant amount before doing anything about it,
++                    // however, since memory for libraries, etc, can get moved
++                    // around.
++                }
++            } else {
++                // If process has not yet been bound, set node list to existing nodes with memory
++                if (process_MBs[ix]) {
++                    ADD_ID_TO_LIST(ix, p->node_list_p);
++                } else {
++                    CLR_ID_IN_LIST(ix, p->node_list_p);
++                }
++            }
++            if ((log_level >= LOG_DEBUG) && (process_MBs[ix] > 0)) {
++                if (ix == num_nodes) {
++                    numad_log(LOG_DEBUG, "Interleaved MBs: %ld\n", ix, process_MBs[ix]);
++                } else {
++                    numad_log(LOG_DEBUG, "PROCESS_MBs[%d]: %ld\n", ix, process_MBs[ix]);
++                }
+             }
+         }
+         if ((process_has_interleaved_memory) && (keep_interleaved_memory)) {
+             // Mark this process as having interleaved memory so we do not
+-            // merge the interleaved memory.  Time stamp it as done.
++            // merge the interleaved memory.  Time stamp it as done and return.
+             p->flags |= PROCESS_FLAG_INTERLEAVED;
+             p->bind_time_stamp = get_time_stamp();
+             if (log_level >= LOG_DEBUG) {
+-                numad_log(LOG_DEBUG, "Skipping evaluation because of interleaved memory.\n");
++                numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because of interleaved memory.\n", p->pid);
+             }
+             return NULL;
+         }
+     }  // end of existing PID conditional
++
+     // Make a copy of node available resources array.  Add in info specific to
+     // this process to equalize available resource quantities wrt locations of
+-    // resources already in use by this process.  Inflate the value of already
+-    // assigned memory by approximately 3/2, because moving memory is
+-    // expensive.  Average the amount of CPUs_free across the existing nodes
+-    // used, because the threads are free to move around in that domain.  After
+-    // calculating combined magnitude of available resources, bias the values
+-    // towards existing locations for this process.
+-    int target_using_all_nodes = 0;
+-    uint64_t node_CPUs_free_for_this_process = 0;
++    // resources already in use by this process.  After calculating weighted
++    // magnitude of available resources, bias the values towards existing
++    // locations for this process.
+     memcpy(tmp_node, node, num_nodes * sizeof(node_data_t) );
+-    if (num_existing_mems > 0) {
+-        node_CPUs_free_for_this_process = cpus; // ?? Correct for utilization target inflation?
+-        int node_id = 0;
+-        int n = num_existing_mems;
+-        while (n) {
+-            if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
+-                node_CPUs_free_for_this_process += tmp_node[node_id].CPUs_free;
+-                n -= 1;
+-            }
+-            node_id += 1;
+-        }
+-        // Divide to get average CPUs_free for the nodes in use by process
+-        node_CPUs_free_for_this_process /= num_existing_mems;
+-    }
+     for (int ix = 0;  (ix < num_nodes);  ix++) {
+-        if (pid > 0) {
+-            tmp_node[ix].MBs_free  += ((process_MBs[ix] * 12) / 8);
+-        }
+-        if ((num_existing_mems > 0) && (ID_IS_IN_LIST(ix, existing_mems_list_p))) {
+-            tmp_node[ix].CPUs_free = node_CPUs_free_for_this_process;
++        // Add back (biased) memory already used by this process on this node
++        tmp_node[ix].MBs_free  += ((process_MBs[ix] * 8) / 8);    // FIXME: apply bias here?
++        if (tmp_node[ix].MBs_free > tmp_node[ix].MBs_total) {
++            tmp_node[ix].MBs_free = tmp_node[ix].MBs_total;
++        }
++        // Add back CPU in proportion to amount of memory already used on this
++        // node.  Making assumption here that CPU execution threads are actually
++        // running on the same nodes where memory is assigned...  FIXME: should
++        // we perhaps do this only if process already explicitly bound?
++        uint64_t prorated_CPU = (process_CPUs * process_MBs[ix]) / mbs;
++        if ((log_level >= LOG_DEBUG) && (prorated_CPU > 0)) {
++            numad_log(LOG_DEBUG, "PROCESS_CPUs[%d]: %ld\n", ix, prorated_CPU);
+         }
++        tmp_node[ix].CPUs_free += prorated_CPU;
+         if (tmp_node[ix].CPUs_free > tmp_node[ix].CPUs_total) {
+             tmp_node[ix].CPUs_free = tmp_node[ix].CPUs_total;
+         }
+-        if (log_level >= LOG_DEBUG) {
+-            numad_log(LOG_DEBUG, "PROCESS_CPUs[%d]: %ld\n", ix, tmp_node[ix].CPUs_free);
++        if (tmp_node[ix].CPUs_free < 1) {
++            // enforce 1/100th CPU minimum
++            tmp_node[ix].CPUs_free = 1;
+         }
+-        // Calculate magnitude as product of available CPUs and available MBs
+-        tmp_node[ix].magnitude = tmp_node[ix].CPUs_free * tmp_node[ix].MBs_free;
++        // numad_log(LOG_DEBUG, "Raw Node[%d]: mem: %ld  cpu: %ld\n", ix, tmp_node[ix].MBs_free, tmp_node[ix].CPUs_free);
++        tmp_node[ix].magnitude = combined_value_of_weighted_resources(ix, mbs, cpus, tmp_node[ix].MBs_free, tmp_node[ix].CPUs_free);
+         // Bias combined magnitude towards already assigned nodes
+-        if (ID_IS_IN_LIST(ix, existing_mems_list_p)) {
+-            tmp_node[ix].magnitude *= 9;
+-            tmp_node[ix].magnitude /= 8;
++        if ((pid > 0) && (ID_IS_IN_LIST(ix, p->node_list_p))) {
++            tmp_node[ix].magnitude *= 17;
++            tmp_node[ix].magnitude /= 16;
+         }
+-        // Save the current magnitudes
+-        saved_magnitude_for_node[ix] = tmp_node[ix].magnitude;
+     }
+-    // OK, figure out where to get resources for this request.
+-    static id_list_p target_node_list_p;
+-    CLEAR_LIST(target_node_list_p);
++
++    // Figure out where to get resources for this request.
+     int prev_node_used = -1;
+-    // Continue to allocate more resources until request are met.
+-    // OK if not not quite all the CPU request is met.
+-    // FIXME: ?? Is half of the utilization margin a good amount of CPU flexing?
+-    int cpu_flex = ((100 - target_utilization) * tmp_node[0].CPUs_total) / 200;
+-    if (pid <= 0) {
+-        // If trying to find resources for pre-placement advice request, do not
+-        // underestimate the amount of CPUs needed.  Instead, err on the side
+-        // of providing too many resources.  So, no flexing here...
+-        cpu_flex = 0;
++    static id_list_p target_node_list_p;
++    CLEAR_NODE_LIST(target_node_list_p);
++    // Establish a CPU flex fudge factor, on the presumption it is OK if not
++    // quite all the CPU request is met.  However, if trying to find resources
++    // for pre-placement advice request, do not underestimate the amount of
++    // CPUs needed.  Instead, err on the side of providing too many resources.
++    int cpu_flex = 0;
++    if ((pid > 0) && (target_utilization < 100)) {
++        // FIXME: Is half of the utilization margin a good amount of CPU flexing?
++        cpu_flex = ((100 - target_utilization) * tmp_node[0].CPUs_total) / 200;
++    }
++    // Figure out minimum number of nodes required
++    int mem_req_nodes = ceil((double)mbs  / (double)node[0].MBs_total);
++    int cpu_req_nodes = ceil((double)(cpus - cpu_flex) / (double)node[0].CPUs_total);
++    int min_req_nodes = mem_req_nodes;
++    if (min_req_nodes < cpu_req_nodes) {
++        min_req_nodes = cpu_req_nodes;
+     }
+-    while ((mbs > 0) || (cpus > cpu_flex)) {
++    // Continue to allocate more resources until requests are met.
++    while ((min_req_nodes > 0) || (mbs > 0) || ((cpus > cpu_flex) && (!assume_enough_cpus))) {
+         if (log_level >= LOG_DEBUG) {
+             numad_log(LOG_DEBUG, "MBs: %d,  CPUs: %d\n", mbs, cpus);
+         }
+@@ -1760,22 +2095,18 @@ id_list_p pick_numa_nodes(int pid, int c
+             // last one we used.  This is not going to make progress...  So
+             // just punt and use everything.
+             OR_LISTS(target_node_list_p, target_node_list_p, all_nodes_list_p);
+-            target_using_all_nodes = 1;
+             break;
+         }
+         prev_node_used = tmp_node[0].node_id;
+         ADD_ID_TO_LIST(tmp_node[0].node_id, target_node_list_p);
+-        if (log_level >= LOG_DEBUG) {
+-            str_from_id_list(buf,  BUF_SIZE, existing_mems_list_p);
+-            str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
+-            numad_log(LOG_DEBUG, "Existing nodes: %s  Target nodes: %s\n", buf, buf2);
+-        }
++        min_req_nodes -= 1;
+         if (EQUAL_LISTS(target_node_list_p, all_nodes_list_p)) {
+             // Apparently we must use all resource nodes...
+-            target_using_all_nodes = 1;
+             break;
+         }
+-#define MBS_MARGIN 10
++        // "Consume" the resources on this node
++#define CPUS_MARGIN 0
++#define MBS_MARGIN 100
+         if (tmp_node[0].MBs_free >= (mbs + MBS_MARGIN)) {
+             tmp_node[0].MBs_free -= mbs;
+             mbs = 0;
+@@ -1783,7 +2114,6 @@ id_list_p pick_numa_nodes(int pid, int c
+             mbs -= (tmp_node[0].MBs_free - MBS_MARGIN);
+             tmp_node[0].MBs_free = MBS_MARGIN;
+         }
+-#define CPUS_MARGIN 0
+         if (tmp_node[0].CPUs_free >= (cpus + CPUS_MARGIN)) {
+             tmp_node[0].CPUs_free -= cpus;
+             cpus = 0;
+@@ -1791,126 +2121,52 @@ id_list_p pick_numa_nodes(int pid, int c
+             cpus -= (tmp_node[0].CPUs_free - CPUS_MARGIN);
+             tmp_node[0].CPUs_free = CPUS_MARGIN;
+         }
+-        tmp_node[0].magnitude = tmp_node[0].CPUs_free * tmp_node[0].MBs_free;
++        tmp_node[0].magnitude = combined_value_of_weighted_resources(0, mbs, cpus, tmp_node[0].MBs_free, tmp_node[0].CPUs_free);
+     }
+-    // If this existing process is already located where we want it, and almost
+-    // all memory is already moved to those nodes, then return NULL indicating
+-    // no need to change binding this time.
+-    if ((pid > 0) && (EQUAL_LISTS(target_node_list_p, existing_mems_list_p))) {
+-        // May not need to change binding.  However, if there is any significant
+-        // memory still on non-target nodes, advise the bind anyway because
+-        // there are some scenarios when the kernel will not move it all the
+-        // first time.
+-        if (!target_using_all_nodes) {
+-            p->dup_bind_count += 1;
+-            for (int ix = 0;  (ix < num_nodes);  ix++) {
+-                if ((process_MBs[ix] > 10) && (!ID_IS_IN_LIST(ix, target_node_list_p))) {
+-                    goto try_memory_move_again;
+-                }
+-            }
+-            // We will accept these memory locations.  Stamp it as done.
+-            p->bind_time_stamp = get_time_stamp();
+-        }
+-        // Skip rebinding either because practically all memory is in the
+-        // target nodes, or because we are stuck using all the nodes.
++
++    // If this existing process is already located where we want it, then just
++    // return NULL indicating no need to change binding this time.
++    if ((pid > 0) && (p->bind_time_stamp) && (EQUAL_LISTS(target_node_list_p, p->node_list_p))) {
+         if (log_level >= LOG_DEBUG) {
+-            numad_log(LOG_DEBUG, "Skipping evaluation because memory is reasonably situated.\n");
++            numad_log(LOG_DEBUG, "Process %d already bound to target nodes.\n", p->pid);
+         }
++        p->bind_time_stamp = get_time_stamp();
+         return NULL;
+-    } else {
+-        // Either a non-existing process, or a new binding for an existing process.
+-        if (p != NULL) {
+-            // Must be a new binding for an existing process, so reset dup_bind_count.
+-            p->dup_bind_count = 0;
+-        }
+-    }
+-    // See if this proposed move will make a significant difference.
+-    // If not, return null instead of advising the move.
+-    uint64_t target_magnitude = 0;
+-    uint64_t existing_magnitude = 0;
+-    int num_target_nodes   = NUM_IDS_IN_LIST(target_node_list_p);
+-    int num_existing_nodes = NUM_IDS_IN_LIST(existing_mems_list_p);
+-    /* FIXME: this expansion seems to cause excessive growth
+-     * So calculate the improvement before hastily expanding nodes.
+-    if (num_target_nodes > num_existing_nodes) { goto try_memory_move_again; }
+-    */
+-    int node_id = 0;
+-    int n = num_existing_nodes + num_target_nodes;
+-    while (n) {
+-        if (ID_IS_IN_LIST(node_id, target_node_list_p)) {
+-            target_magnitude += saved_magnitude_for_node[node_id];
+-            n -= 1;
+-        }
+-        if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
+-            existing_magnitude += saved_magnitude_for_node[node_id];
+-            n -= 1;
+-        }
+-        node_id += 1;
+-    }
+-    if (existing_magnitude > 0) {
+-        uint64_t magnitude_change = ((target_magnitude - existing_magnitude) * 100) / existing_magnitude;
+-        if (magnitude_change < 0) {
+-            magnitude_change = -(magnitude_change);
+-        }
+-        if (magnitude_change <= IMPROVEMENT_THRESHOLD_PERCENT) {
+-            // Not significant enough percentage change to do rebind
+-            if (log_level >= LOG_DEBUG) {
+-                str_from_id_list(buf,  BUF_SIZE, existing_mems_list_p);
+-                str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
+-                numad_log(LOG_DEBUG, "Moving pid %d from nodes (%s) to nodes (%s) skipped as insignificant improvement: %ld percent.\n",
+-                    pid, buf, buf2, magnitude_change);
+-            }
+-            // We decided this is almost good enough.  Stamp it as done.
+-            p->bind_time_stamp = get_time_stamp();
+-            return NULL;
+-        }
+     }
+-    if ((pid <= 0) && (num_target_nodes <= 0)) {
+-        // Always provide at least one node for pre-placement advice
++    // Must always provide at least one node for pre-placement advice
++    // FIXME: verify this can happen only if no resources requested...
++    if ((pid <= 0) && (NUM_IDS_IN_LIST(target_node_list_p) <= 0)) {
+         ADD_ID_TO_LIST(node[0].node_id, target_node_list_p);
+     }
+-try_memory_move_again:
+-    str_from_id_list(buf,  BUF_SIZE, existing_mems_list_p);
++    // Log advice, and return target node list
++    if ((pid > 0) && (p->bind_time_stamp)) {
++        str_from_id_list(buf,  BUF_SIZE, p->node_list_p);
++    } else {
++        str_from_id_list(buf,  BUF_SIZE, all_nodes_list_p);
++    }
++    char buf2[BUF_SIZE];
+     str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
+     char *cmd_name = "(unknown)";
+     if ((p) && (p->comm)) {
+         cmd_name = p->comm;
+     }
+     numad_log(LOG_NOTICE, "Advising pid %d %s move from nodes (%s) to nodes (%s)\n", pid, cmd_name, buf, buf2);
++
++    if (pid > 0) {
++        // FIXME: Consider moving this out to caller??
++        COPY_LIST(target_node_list_p, p->node_list_p);
++    }
+     return target_node_list_p;
+ }
+
+
+
+-void show_processes(process_data_p *ptr, int nprocs) {
+-    time_t ts = time(NULL);
+-    fprintf(log_fs, "%s", ctime(&ts));
+-    fprintf(log_fs, "Candidates: %d\n", nprocs);
+-    for (int ix = 0;  (ix < nprocs);  ix++) {
+-        process_data_p p = ptr[ix];
+-        char buf[BUF_SIZE];
+-        snprintf(buf, BUF_SIZE, "%s%s/cpuset.mems", cpuset_dir, p->cpuset_name);
+-        FILE *fs = fopen(buf, "r");
+-        buf[0] = '\0';
+-        if (fs) {
+-            if (fgets(buf, BUF_SIZE, fs)) {
+-                ELIM_NEW_LINE(buf);
+-            }
+-            fclose(fs);
+-        }
+-        fprintf(log_fs, "%ld: PID %d: %s, Threads %2ld, MBs_used %6ld, CPUs_used %4ld, Magnitude %6ld, Nodes: %s\n",
+-            p->data_time_stamp, p->pid, p->comm, p->num_threads, p->MBs_used, p->CPUs_used, p->MBs_used * p->CPUs_used, buf);
+-        }
+-    fprintf(log_fs, "\n");
+-    fflush(log_fs);
+-}
+-
+-
+
+ int manage_loads() {
++    uint64_t time_stamp = get_time_stamp();
+     // Use temporary index to access and sort hash table entries
+-    static process_data_p *pindex;
+     static int pindex_size;
++    static process_data_p *pindex;
+     if (pindex_size < process_hash_table_size) {
+         pindex_size = process_hash_table_size;
+         pindex = realloc(pindex, pindex_size * sizeof(process_data_p));
+@@ -1923,34 +2179,69 @@ int manage_loads() {
+         return min_interval / 2;
+     }
+     memset(pindex, 0, pindex_size * sizeof(process_data_p));
+-    // Copy live candidate pointers to the index for sorting, etc
++    // Copy live candidate pointers to the index for sorting
++    // if they meet the threshold for memory usage and CPU usage.
+     int nprocs = 0;
++    long sum_CPUs_used = 0;
+     for (int ix = 0;  (ix < process_hash_table_size);  ix++) {
+         process_data_p p = &process_hash_table[ix];
+-        if (p->pid) {
++        if ((p->pid) && (p->CPUs_used * p->MBs_used > CPU_THRESHOLD * MEMORY_THRESHOLD)) {
+             pindex[nprocs++] = p;
++            sum_CPUs_used += p->CPUs_used;
++            // Initialize node list, if not already done for this process.
++            if (p->node_list_p == NULL) {
++                initialize_mem_node_list(p);
++            }
+         }
+     }
+-    // Sort index by amount of CPU used * amount of memory used.  Not expecting
+-    // a long list here.  Use a simple sort -- however, sort into bins,
+-    // treating values within 10% as aquivalent.  Within bins, order by
+-    // bind_time_stamp so oldest bound will be higher priority to evaluate.
++    // Order candidate considerations using timestamps and magnitude: amount of
++    // CPU used * amount of memory used.  Not expecting a long list here.  Use
++    // a simplistic sort -- however move all not yet bound to front of list and
++    // order by decreasing magnitude.  Previously bound processes follow in
++    // bins of increasing magnitude treating values within 20% as equivalent.
++    // Within bins, order by bind_time_stamp so oldest bound will be higher
++    // priority to evaluate.  Start by moving all unbound to beginning.
++    int num_unbound = 0;
+     for (int ij = 0;  (ij < nprocs);  ij++) {
++        if (pindex[ij]->bind_time_stamp == 0) {
++            process_data_p tmp = pindex[num_unbound];
++            pindex[num_unbound++] = pindex[ij];
++            pindex[ij] = tmp;
++        }
++    }
++    // Sort all unbound so biggest magnitude comes first
++    for (int ij = 0;  (ij < num_unbound);  ij++) {
++        int best = ij;
++        for (int ik = ij + 1;  (ik < num_unbound);  ik++) {
++            uint64_t   ik_mag = (pindex[  ik]->CPUs_used * pindex[  ik]->MBs_size);
++            uint64_t best_mag = (pindex[best]->CPUs_used * pindex[best]->MBs_size);
++            if (ik_mag <= best_mag) continue;
++            best = ik;
++        }
++        if (best != ij) {
++            process_data_p tmp = pindex[ij];
++            pindex[ij] = pindex[best];
++            pindex[best] = tmp;
++        }
++    }
++    // Sort the remaining candidates into bins of increasing magnitude, and by
++    // timestamp within bins.
++    for (int ij = num_unbound;  (ij < nprocs);  ij++) {
+         int best = ij;
+         for (int ik = ij + 1;  (ik < nprocs);  ik++) {
+-            uint64_t   ik_mag = (pindex[  ik]->CPUs_used * pindex[  ik]->MBs_used);
+-            uint64_t best_mag = (pindex[best]->CPUs_used * pindex[best]->MBs_used);
++            uint64_t   ik_mag = (pindex[  ik]->CPUs_used * pindex[  ik]->MBs_size);
++            uint64_t best_mag = (pindex[best]->CPUs_used * pindex[best]->MBs_size);
+             uint64_t  min_mag = ik_mag;
+             uint64_t diff_mag = best_mag - ik_mag;
+             if (diff_mag < 0) {
+                 diff_mag = -(diff_mag);
+                 min_mag = best_mag;
+             }
+-            if ((diff_mag > 0) && (min_mag / diff_mag < 10)) {
+-                // difference > 10 percent.  Use strict ordering
+-                if (ik_mag <= best_mag) continue;
++            if ((diff_mag > 0) && (min_mag / diff_mag < 5)) {
++                // difference > 20 percent.  Use magnitude ordering
++                if (ik_mag >= best_mag) continue;
+             } else {
+-                // difference within 10 percent.  Sort these by bind_time_stamp.
++                // difference within 20 percent.  Sort these by bind_time_stamp.
+                 if (pindex[ik]->bind_time_stamp > pindex[best]->bind_time_stamp) continue;
+             }
+             best = ik;
+@@ -1961,23 +2252,69 @@ int manage_loads() {
+             pindex[best] = tmp;
+         }
+     }
++    // Show the candidate processes in the log file
+     if ((log_level >= LOG_INFO) && (nprocs > 0)) {
+-        show_processes(pindex, nprocs);
++        numad_log(LOG_INFO, "Candidates: %d\n", nprocs);
++        for (int ix = 0;  (ix < nprocs);  ix++) {
++            process_data_p p = pindex[ix];
++            char buf[BUF_SIZE];
++            str_from_id_list(buf, BUF_SIZE, p->node_list_p);
++            fprintf(log_fs, "%ld: PID %d: %s, Threads %2ld, MBs_size %6ld, MBs_used %6ld, CPUs_used %4ld, Magnitude %6ld, Nodes: %s\n",
++                p->data_time_stamp, p->pid, p->comm, p->num_threads, p->MBs_size, p->MBs_used, p->CPUs_used, p->MBs_used * p->CPUs_used, buf);
++            }
++        fflush(log_fs);
+     }
+-    // Estimate desired size and make resource requests for each significant process
++    // Estimate desired size (+ margin capacity) and
++    // make resource requests for each candidate process
+     for (int ix = 0;  (ix < nprocs);  ix++) {
+         process_data_p p = pindex[ix];
+-        if (p->CPUs_used * p->MBs_used < CPU_THRESHOLD * MEMORY_THRESHOLD) {
+-            break; // No more significant processes worth worrying about...
++        // If this process was recently bound, enforce a three-minute minimum
++        // delay between repeated attempts to potentially move the process.
++        // FIXME: make this delay contingent on node resource equity?   Or,
++        // maybe change in running averages? Perhaps detect change in averages,
++        // or look at stddev? What is a good range for the delay?  Discrete or
++        // continuous?
++#define MIN_DELAY_FOR_REEVALUATION (180 * ONE_HUNDRED)
++        if (p->bind_time_stamp + MIN_DELAY_FOR_REEVALUATION > time_stamp) {
++            // Skip re-evaluation because we just did it recently.
++            if (log_level >= LOG_DEBUG) {
++                numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because done too recently.\n", p->pid);
++            }
++            continue;
++        }
++        // If this process has interleaved memory, recheck it only every 30 minutes...
++#define MIN_DELAY_FOR_INTERLEAVE (1800 * ONE_HUNDRED)
++        if (((p->flags & PROCESS_FLAG_INTERLEAVED) > 0)
++          && (p->bind_time_stamp + MIN_DELAY_FOR_INTERLEAVE > time_stamp)) {
++            if (log_level >= LOG_DEBUG) {
++                numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because of interleaved memory.\n", p->pid);
++            }
++            continue;
+         }
+-        int mb_request  =  (p->MBs_used * 100) / target_utilization;
+-        int cpu_request = (p->CPUs_used * 100) / target_utilization;
+-        // Do not give a process more CPUs than it has threads!
+-        // FIXME: For guest VMs, should limit max to VCPU threads. Will
+-        // need to do something more intelligent with guest IO threads
+-        // when eventually considering devices and IRQs.
++        // Expand resources needed estimate using target_utilization factor.
++        // Start with the CPUs actually used (capped by number of threads) for
++        // CPUs required, but use the process virtual memory size for MBs
++        // requirement.  (We previously used the RSS for MBs needed, but that
++        // caused problems with processes that had quickly expanding memory
++        // usage which also needed to cross NUMA boundaries.  The downside of
++        // this choice is we might not pack processes as tightly as possible
++        // anymore.  Hopefully this will be a relatively rare occurrence in
++        // practice.  KVM guests should not be significantly over-provisioned
++        // with memory they will never use!)
++        int mem_target_utilization = target_utilization;
++        int cpu_target_utilization = target_utilization;
++        // Cap memory utilization at 100 percent (but allow CPUs to oversubscribe)
++        if (mem_target_utilization > 100) {
++            mem_target_utilization = 100;
++        }
++        int mb_request  =  (p->MBs_size * 100) / mem_target_utilization;
++        int cpu_request = (p->CPUs_used * 100) / cpu_target_utilization;
++        // But do not give a process more CPUs than it has threads!
+         int thread_limit = p->num_threads;
+-        // If process looks like a KVM guest, try to limit to number of vCPU threads
++        // If process looks like a KVM guest, try to limit thread count to the
++        // number of vCPU threads.  FIXME: Will need to do something more
++        // intelligent than this with guest IO threads when eventually
++        // considering devices and IRQs.
+         if ((p->comm) && (p->comm[0] == '(') && (p->comm[1] == 'q') && (strcmp(p->comm, "(qemu-kvm)") == 0)) {
+             int kvm_vcpu_threads = get_num_kvm_vcpu_threads(p->pid);
+             if (thread_limit > kvm_vcpu_threads) {
+@@ -1988,18 +2325,18 @@ int manage_loads() {
+         if (cpu_request > thread_limit) {
+             cpu_request = thread_limit;
+         }
++        // OK, now pick NUMA nodes for this process and bind it!
+         pthread_mutex_lock(&node_info_mutex);
+-        id_list_p node_list_p = pick_numa_nodes(p->pid, cpu_request, mb_request);
+-        // FIXME: ?? copy node_list_p to shorten mutex region?
+-        if ((node_list_p != NULL) && (bind_process_and_migrate_memory(p->pid, p->cpuset_name, node_list_p, NULL))) {
+-            // Shorten interval if actively moving processes
++        int assume_enough_cpus = (sum_CPUs_used <= sum_CPUs_total);
++        id_list_p node_list_p = pick_numa_nodes(p->pid, cpu_request, mb_request, assume_enough_cpus);
++        if ((node_list_p != NULL) && (bind_process_and_migrate_memory(p))) {
+             pthread_mutex_unlock(&node_info_mutex);
+-            p->bind_time_stamp = get_time_stamp();
++            // Return minimum interval when actively moving processes
+             return min_interval;
+         }
+         pthread_mutex_unlock(&node_info_mutex);
+     }
+-    // Return maximum interval if no process movement
++    // Return maximum interval when no process movement
+     return max_interval;
+ }
+
+@@ -2013,6 +2350,18 @@ void *set_dynamic_options(void *arg) {
+         msg_t msg;
+         recv_msg(&msg);
+         switch (msg.body.cmd) {
++        case 'C':
++            use_inactive_file_cache = (msg.body.arg1 != 0);
++            if (use_inactive_file_cache) {
++                numad_log(LOG_NOTICE, "Counting inactive file cache as available\n");
++            } else {
++                numad_log(LOG_NOTICE, "Counting inactive file cache as unavailable\n");
++            }
++            break;
++        case 'H':
++            thp_scan_sleep_ms = msg.body.arg1;
++            set_thp_scan_sleep_ms(thp_scan_sleep_ms);
++            break;
+         case 'i':
+             min_interval = msg.body.arg1;
+             max_interval = msg.body.arg2;
+@@ -2055,6 +2404,11 @@ void *set_dynamic_options(void *arg) {
+                 numad_log(LOG_NOTICE, "Scanning only explicit PID list processes\n");
+             }
+             break;
++        case 't':
++            numad_log(LOG_NOTICE, "Changing logical CPU thread percent to %d\n", msg.body.arg1);
++            htt_percent = msg.body.arg1;
++            node_info_time_stamp = 0; // to force rescan of nodes/cpus soon
++            break;
+         case 'u':
+             numad_log(LOG_NOTICE, "Changing target utilization to %d\n", msg.body.arg1);
+             target_utilization = msg.body.arg1;
+@@ -2064,7 +2418,7 @@ void *set_dynamic_options(void *arg) {
+                                     msg.body.arg1, msg.body.arg2);
+             pthread_mutex_lock(&node_info_mutex);
+             update_nodes();
+-            id_list_p node_list_p = pick_numa_nodes(-1, msg.body.arg1, msg.body.arg2);
++            id_list_p node_list_p = pick_numa_nodes(-1, msg.body.arg1, msg.body.arg2, 0);
+             str_from_id_list(buf, BUF_SIZE, node_list_p);
+             pthread_mutex_unlock(&node_info_mutex);
+             send_msg(msg.body.src_pid, 'w', 0, 0, buf);
+@@ -2134,20 +2488,28 @@ void parse_two_arg_values(char *p, int *
+
+ int main(int argc, char *argv[]) {
+     int opt;
++    int C_flag = 0;
+     int d_flag = 0;
++    int H_flag = 0;
+     int i_flag = 0;
+     int K_flag = 0;
+     int l_flag = 0;
+     int p_flag = 0;
+     int r_flag = 0;
+     int S_flag = 0;
++    int t_flag = 0;
+     int u_flag = 0;
+     int v_flag = 0;
+     int w_flag = 0;
+     int x_flag = 0;
++    int tmp_int = 0;
+     long list_pid = 0;
+-    while ((opt = getopt(argc, argv, "dD:hi:K:l:p:r:S:u:vVw:x:")) != -1) {
++    while ((opt = getopt(argc, argv, "C:dD:hH:i:K:l:p:r:R:S:t:u:vVw:x:")) != -1) {
+         switch (opt) {
++        case 'C':
++            C_flag = 1;
++            use_inactive_file_cache = (atoi(optarg) != 0);
++            break;
+         case 'd':
+             d_flag = 1;
+             log_level = LOG_DEBUG;
+@@ -2158,6 +2520,17 @@ int main(int argc, char *argv[]) {
+         case 'h':
+             print_usage_and_exit(argv[0]);
+             break;
++        case 'H':
++            tmp_int = atoi(optarg);
++            if ((tmp_int == 0) || ((tmp_int > 9) && (tmp_int < 1000001))) {
++                // 0 means do not change the system default value
++                H_flag = 1;
++                thp_scan_sleep_ms = tmp_int;
++            } else {
++		fprintf(stderr, "THP scan_sleep_ms must be > 9 and < 1000001\n");
++		exit(EXIT_FAILURE);
++	    }
++            break;
+         case 'i':
+             i_flag = 1;
+             parse_two_arg_values(optarg, &min_interval, &max_interval, 1, 0);
+@@ -2183,13 +2556,26 @@ int main(int argc, char *argv[]) {
+             include_pid_list = remove_pid_from_pid_list(include_pid_list, list_pid);
+             exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, list_pid);
+             break;
++        case 'R':
++            reserved_cpu_str = strdup(optarg);
++            break;
+         case 'S':
+             S_flag = 1;
+             scan_all_processes = (atoi(optarg) != 0);
+             break;
++        case 't':
++            tmp_int = atoi(optarg);
++            if ((tmp_int >= 0) && (tmp_int <= 100)) {
++                t_flag = 1;
++                htt_percent = tmp_int;
++            }
++            break;
+         case 'u':
+-            u_flag = 1;
+-            target_utilization = atoi(optarg);
++            tmp_int = atoi(optarg);
++            if ((tmp_int >= 10) && (tmp_int <= 130)) {
++                u_flag = 1;
++                target_utilization = tmp_int;
++            }
+             break;
+         case 'v':
+             v_flag = 1;
+@@ -2234,6 +2620,12 @@ int main(int argc, char *argv[]) {
+         // Daemon is already running.  So send dynamic options to persistant
+         // thread to handle requests, get the response (if any), and finish.
+         msg_t msg;
++        if (C_flag) {
++            send_msg(daemon_pid, 'C', use_inactive_file_cache, 0, "");
++        }
++        if (H_flag) {
++            send_msg(daemon_pid, 'H', thp_scan_sleep_ms, 0, "");
++        }
+         if (i_flag) {
+             send_msg(daemon_pid, 'i', min_interval, max_interval, "");
+         }
+@@ -2252,6 +2644,9 @@ int main(int argc, char *argv[]) {
+         if (S_flag) {
+             send_msg(daemon_pid, 'S', scan_all_processes, 0, "");
+         }
++        if (t_flag) {
++            send_msg(daemon_pid, 't', htt_percent, 0, "");
++        }
+         if (u_flag) {
+             send_msg(daemon_pid, 'u', target_utilization, 0, "");
+         }
+@@ -2263,14 +2658,30 @@ int main(int argc, char *argv[]) {
+         if (x_flag) {
+             send_msg(daemon_pid, 'x', list_pid, 0, "");
+         }
+-    } else if (w_flag) {
+-        // Get pre-placement NUMA advice without starting daemon
++        close_log_file();
++        exit(EXIT_SUCCESS);
++    }
++    // No numad daemon running yet.
++    // First, make note of any reserved CPUs....
++    if (reserved_cpu_str != NULL) {
++        CLEAR_CPU_LIST(reserved_cpu_mask_list_p);
++        int n = add_ids_to_list_from_str(reserved_cpu_mask_list_p, reserved_cpu_str);
+         char buf[BUF_SIZE];
++        str_from_id_list(buf, BUF_SIZE, reserved_cpu_mask_list_p);
++        numad_log(LOG_NOTICE, "Reserving %d CPUs (%s) for non-numad use\n", n, buf);
++        // turn reserved list into a negated mask for later ANDing use...
++        negate_list(reserved_cpu_mask_list_p);
++    }
++    // If it is a "-w" pre-placement request, handle that without starting
++    // the daemon.  Otherwise start the numad daemon.
++    if (w_flag) {
++        // Get pre-placement NUMA advice without starting daemon
+         update_nodes();
+         sleep(2);
+         update_nodes();
+         numad_log(LOG_NOTICE, "Getting NUMA pre-placement advice for %d CPUs and %d MBs\n", requested_cpus, requested_mbs);
+-        id_list_p node_list_p = pick_numa_nodes(-1, requested_cpus, requested_mbs);
++        id_list_p node_list_p = pick_numa_nodes(-1, requested_cpus, requested_mbs, 0);
++        char buf[BUF_SIZE];
+         str_from_id_list(buf, BUF_SIZE, node_list_p);
+         fprintf(stdout, "%s\n", buf);
+         close_log_file();
+@@ -2278,6 +2689,7 @@ int main(int argc, char *argv[]) {
+     } else if (max_interval > 0) {
+         // Start the numad daemon...
+         check_prereqs(argv[0]);
++#if (!NO_DAEMON)
+         // Daemonize self...
+         daemon_pid = fork();
+         if (daemon_pid < 0) { numad_log(LOG_CRIT, "fork() failed\n"); exit(EXIT_FAILURE); }
+@@ -2298,9 +2710,21 @@ int main(int argc, char *argv[]) {
+         if (log_fs != stderr) {
+             fclose(stderr);
+         }
++#endif
++        // Set up signal handlers
++        struct sigaction sa;
++        memset(&sa, 0, sizeof(sa));
++        sa.sa_handler = sig_handler;
++        if (sigaction(SIGHUP, &sa, NULL)
++            || sigaction(SIGTERM, &sa, NULL)
++            || sigaction(SIGQUIT, &sa, NULL)) {
++            numad_log(LOG_CRIT, "sigaction does not work?\n");
++            exit(EXIT_FAILURE);
++        }
+         // Allocate initial process hash table
+         process_hash_table_expand();
+-        // Spawn thread to handle messages from subsequent invocation requests
++        // Spawn a thread to handle messages from subsequent invocation requests
++        // and also a lazy background thread to clean up obsolete cpusets.
+         pthread_mutex_init(&pid_list_mutex, NULL);
+         pthread_mutex_init(&node_info_mutex, NULL);
+         pthread_attr_t attr;
+@@ -2310,7 +2734,11 @@ int main(int argc, char *argv[]) {
+         }
+         pthread_t tid;
+         if (pthread_create(&tid, &attr, &set_dynamic_options, &tid) != 0) {
+-            numad_log(LOG_CRIT, "pthread_create failure\n");
++            numad_log(LOG_CRIT, "pthread_create failure: setting thread\n");
++            exit(EXIT_FAILURE);
++        }
++        if (pthread_create(&tid, &attr, &clean_obsolete_cpusets, &tid) != 0) {
++            numad_log(LOG_CRIT, "pthread_create failure: cleaning thread\n");
+             exit(EXIT_FAILURE);
+         }
+         // Loop here forwever...
+@@ -2324,14 +2752,20 @@ int main(int argc, char *argv[]) {
+                 interval = manage_loads();
+             }
+             sleep(interval);
++            if (got_sigterm | got_sigquit) {
++                shut_down_numad();
++            }
++            if (got_sighup) {
++                got_sighup = 0;
++                close_log_file();
++                open_log_file();
++            }
+         }
+         if (pthread_attr_destroy(&attr) != 0) {
+             numad_log(LOG_WARNING, "pthread_attr_destroy failure\n");
+         }
+         pthread_mutex_destroy(&pid_list_mutex);
+         pthread_mutex_destroy(&node_info_mutex);
+-    } else {
+-        shut_down_numad();
+     }
+     exit(EXIT_SUCCESS);
+ }
+--- numad-0.5git/numad.8	2012-12-03 15:40:40.000000000 +0100
++++ new-rhel7/numad.8	2014-02-27 10:03:07.000000000 +0100
+@@ -8,9 +8,15 @@ management for efficient use of CPUs and
+ numad [\fI\-dhvV\fP]
+ .br
+ .LP
++numad  [\fI\-C 0|1\fP]
++.br
++.LP
+ numad  [\fI\-D non-standard-cgroup-mount-point\fP]
+ .br
+ .LP
++numad  [\fI\-H THP_hugepage_scan_sleep_ms\fP]
++.br
++.LP
+ numad  [\fI\-i [min_interval:]max_interval\fP]
+ .br
+ .LP
+@@ -26,9 +32,15 @@ numad  [\fI\-p PID\fP]
+ numad  [\fI\-r PID\fP]
+ .br
+ .LP
++numad  [\fI\-R reserved-CPU-list\fP]
++.br
++.LP
+ numad  [\fI\-S 0|1\fP]
+ .br
+ .LP
++numad  [\fI\-t logical_CPU_percent\fP]
++.br
++.LP
+ numad  [\fI\-u target_utilization\fP]
+ .br
+ .LP
+@@ -37,7 +49,6 @@ numad  [\fI\-w NCPUS[:MB]\fP]
+ .LP
+ numad  [\fI\-x PID\fP]
+ .br
+-
+ .SH "DESCRIPTION"
+ .LP
+ Numad is a system daemon that monitors NUMA topology and resource usage. It
+@@ -54,6 +65,13 @@ accesses will likely remain unpredictabl
+ performance.
+ .SH "OPTIONS"
+ .LP
++.TP
++\fB\-C\fR <\fI0|1\fP>
++This option controls whether or not numad treats inactive file cache as
++available memory. By default, numad assumes it can count inactive file cache as
++"free" memory when considering resources to match with processes.  Specify
++\fI\-C 0\fP if numad should instead consider inactive file cache as a consumed
++resource.
+ .TP
+ \fB\-d\fR
+ Debug output in log, sets the log level to LOG_DEBUG.  Same effect as \fI\-l 7\fP.
+@@ -65,6 +83,16 @@ numad.  This is not normally necessary.
+ \fB\-h\fR
+ Display usage help information and then exit.
+ .TP
++\fB\-H\fR  <\fITHP_scan_sleep_ms\fP>
++Set the desired transparent hugepage scan interval in ms.  The
++/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs tunable is
++usually set to 10000ms by the operating system.  The default is changed by
++numad to be 1000ms since it is helpful for the hugepage daemon to be more
++aggressive when memory moves between nodes.  If you don't like numad's choice
++of 1000ms, you can make the hugepage daemon more or less aggressive by
++specifying an alternate value with this option.  Setting this value to 100ms
++might improve some workloads which use many transparent hugepages.
++.TP
+ \fB\-i\fR <\fI[min_interval:]max_interval\fP>
+ Sets the time interval that numad waits between system scans, in seconds to
+ <\fImax_interval\fP>. Default <\fImax_interval\fP> is 15 seconds, default
+@@ -85,7 +113,9 @@ large in-memory database), you might get
+ .TP
+ \fB\-l\fR <\fIlog_level\fP>
+ Sets the log level to <\fIlog_level\fP>.  Reasonable choices are 5, 6, or 7.
+-The default value is 5.
++The default value is 5.  Note that CPU values are scaled by a factor of 100
++internally and in the numad log files, so logged CPU figures are 100 times
++the actual number of CPUs.
+ .TP
+ \fB\-p\fR <\fIPID\fP>
+ Add PID to explicit inclusion list of processes to consider for managing, if
+@@ -102,6 +132,12 @@ processes.  After daemon start, only one
+ process lists per subsequent numad invocation.  Use with \-S and \-p and \-x to
+ precisely control the scope of processes numad can manage.
+ .TP
++\fB\-R\fR <\fICPU_LIST\fP>
++Specify a list of CPUs that numad should assume are reserved for non-numad use.
++No processes will be bound to the specified CPUs by numad.  This option is
++effective only when starting numad.  You cannot change reserved CPUs
++dynamically while numad is already running.
++.TP
+ \fB\-S\fR <\fI0|1\fP>
+ This option controls whether numad scans all system processes or only the
+ processes on the explicit inclusion PID list.  The default is to scan all
+@@ -114,10 +150,19 @@ exclusion list).  Starting numad as
+ will limit scanning, and thus also automatic NUMA management, to only those
+ three explicitly specified processes.
+ .TP
++\fB\-t\fR  <\fIlogical_CPU_percent\fP>
++Determine the resource value of logical CPUs.  Hardware threads typically share
++most core resources, and so add only a fraction of CPU power for many
++workloads.  By default numad considers logical CPUs to be only 20 percent of a
++dedicated core.
++.TP
+ \fB\-u\fR  <\fItarget_utilization\fP>
+ Set the desired maximum consumption percentage of a node. Default is 85%.
+ Decrease the target value to maintain more available resource margin on each
+ node.  Increase the target value to more exhaustively consume node resources.
++It is possible to specify values up to 130 percent, to oversubscribe CPUs in
++the nodes, but memory utilization is capped at 100%.  Use oversubscription
++values carefully.
+ .TP
+ \fB\-v\fR
+ Verbose output in log, sets the log level to LOG_INFO.  Same effect as \fI\-l 6\fP.
+@@ -159,18 +204,21 @@ numad can manage.
+ None.
+ .SH "EXAMPLES"
+ .LP
+-Numad is normally run as a system daemon and should be managed by the
++Numad can be run as a system daemon and can be managed by the
+ standard init mechanisms of the host.
+ .LP
+ If interactive (manual) control is desired, you can start the daemon manually by typing:
+ .LP
+ /usr/bin/numad
+ .LP
+-Subsequent numad invocations while the daemon is running can be used to dynamically change run-time options.
++Subsequent numad invocations while the daemon is running can be used to dynamically change most run-time options.
++.LP
++You can terminate a running numad daemon by typing:
++.LP
++/usr/bin/numad -i0
+ .SH "AUTHORS"
+ .LP
+ Bill Gray <bgray@redhat.com>
+ .SH "SEE ALSO"
+ .LP
+ numactl(8)
+-
diff --git a/SOURCES/numad-0.5git-version.patch b/SOURCES/numad-0.5git-version.patch
new file mode 100644
index 00000000..27d36d7d
--- /dev/null
+++ b/SOURCES/numad-0.5git-version.patch
@@ -0,0 +1,2639 @@
+diff -rup numad-0.5git/numad.8 numad-0.5git-new/numad.8
+--- numad-0.5git/numad.8	2012-12-03 15:40:40.000000000 +0100
++++ numad-0.5git-new/numad.8	2016-08-30 08:45:19.000000000 +0200
+@@ -1,45 +1,56 @@
+ .TH "numad" "8" "1.0.0" "Bill Gray" "Administration"
+-.SH "numad"
+-.LP
++.SH "NAME"
++.LP
+ numad \- A user\-level daemon that provides placement advice and process
+ management for efficient use of CPUs and memory on systems with NUMA topology.
+-.SH "SYNTAX"
+-.LP
++.SH "SYNOPSIS"
++.LP
+ numad [\fI\-dhvV\fP]
+-.br
+-.LP
+-numad  [\fI\-D non-standard-cgroup-mount-point\fP]
+-.br
+-.LP
++.br
++.LP
++numad  [\fI\-C 0|1\fP]
++.br
++.LP
++numad  [\fI\-H THP_hugepage_scan_sleep_ms\fP]
++.br
++.LP
+ numad  [\fI\-i [min_interval:]max_interval\fP]
+-.br
+-.LP
++.br
++.LP
+ numad  [\fI\-K 0|1\fP]
+-.br
+-.LP
++.br
++.LP
+ numad  [\fI\-l log_level\fP]
+-.br
+-.LP
++.br
++.LP
++numad  [\fI\-m target_memory_locality\fP]
++.br
++.LP
+ numad  [\fI\-p PID\fP]
+-.br
+-.LP
++.br
++.LP
+ numad  [\fI\-r PID\fP]
+-.br
+-.LP
++.br
++.LP
++numad  [\fI\-R reserved-CPU-list\fP]
++.br
++.LP
+ numad  [\fI\-S 0|1\fP]
+-.br
+-.LP
++.br
++.LP
++numad  [\fI\-t logical_CPU_percent\fP]
++.br
++.LP
+ numad  [\fI\-u target_utilization\fP]
+-.br
+-.LP
++.br
++.LP
+ numad  [\fI\-w NCPUS[:MB]\fP]
+-.br
+-.LP
++.br
++.LP
+ numad  [\fI\-x PID\fP]
+-.br
+-
++.br
+ .SH "DESCRIPTION"
+-.LP
++.LP
+ Numad is a system daemon that monitors NUMA topology and resource usage. It
+ will attempt to locate processes for efficient NUMA locality and affinity,
+ dynamically adjusting to changing system conditions.  Numad also provides
+@@ -53,25 +64,42 @@ large in-memory database application, fo
+ accesses will likely remain unpredictable -- numad will probably not improve
+ performance.
+ .SH "OPTIONS"
+-.LP
+-.TP
++.LP
++.TP
++\fB\-C\fR <\fI0|1\fP>
++This option controls whether or not numad treats inactive file cache as
++available memory. By default, numad assumes it can count inactive file cache as
++"free" memory when considering resources to match with processes.  Specify
++\fI\-C 0\fP if numad should instead consider inactive file cache as a consumed
++resource.
++.TP
+ \fB\-d\fR
+ Debug output in log, sets the log level to LOG_DEBUG.  Same effect as \fI\-l 7\fP.
+ .TP
+-\fB\-D\fR <\fInon-standard-cgroup-mount-point\fP>
+-This option can be used to communicate a non-standard cgroup mount point to
+-numad.  This is not normally necessary.
+-.TP
+ \fB\-h\fR
+ Display usage help information and then exit.
+-.TP
++.TP
++\fB\-H\fR  <\fITHP_scan_sleep_ms\fP>
++Set the desired transparent hugepage scan interval in ms.  The
++.na
++/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
++.ad
++tunable is usually set to 10000ms by the operating system.  The default is
++changed by numad to be 1000ms since it is helpful for the hugepage daemon to be
++more aggressive when memory moves between nodes.  Specifying (\fI\-H 0\fP) will
++cause numad to retain the system default value.  You can also make the hugepage
++daemon more or less aggressive by specifying an alternate value with this
++option.  For example, setting this value to 100ms (\fI\-H 100\fP) might improve
++the performance of workloads which use many transparent hugepages.
++.TP
+ \fB\-i\fR <\fI[min_interval:]max_interval\fP>
+ Sets the time interval that numad waits between system scans, in seconds to
+ <\fImax_interval\fP>. Default <\fImax_interval\fP> is 15 seconds, default
+ <\fImin_interval\fP> is 5 seconds.  Setting a <\fImax_interval\fP> of zero will
+ cause the daemon to exit.  (This is the normal mechanism to terminate the
+ daemon.)  A bigger <\fImax_interval\fP> will decrease numad overhead but also
+-decrease responsiveness to changing loads.
++decrease responsiveness to changing loads.  The default numad max_interval can
++be changed in the numad.conf file.
+ .TP
+ \fB\-K\fR <\fI0|1\fP>
+ This option controls whether numad keeps interleaved memory spread across NUMA
+@@ -82,10 +110,24 @@ a large, single-instance application tha
+ the workload will have continuous unpredictable memory access patterns (e.g. a
+ large in-memory database), you might get better results by specifying \fI\-K
+ 1\fP to instruct numad to keep interleaved memory distributed.
+-.TP
++.TP
+ \fB\-l\fR <\fIlog_level\fP>
+ Sets the log level to <\fIlog_level\fP>.  Reasonable choices are 5, 6, or 7.
+-The default value is 5.
++The default value is 5.  Note that CPU values are scaled by a factor of 100
++internally and in the numad log files, so logged CPU figures are 100 times
++the actual number of CPUs.
++.TP
++\fB\-m\fR  <\fItarget_memory_locality\fP>
++Set the desired memory locality threshold to stop moving process memory.  Numad
++might stop retrying to coalesce process memory when more than this percentage
++of the process's memory is already localized in the target node(s).  The
++default is 90%. Numad will frequently localize more than the localization
++threshold percent, but it will not necessarily do so.  Decrease the threshold
++to allow numad to leave more process memory distributed on various nodes.
++Increase the threshold to instruct numad to try to localize more memory.
++Acceptable values are between 50 and 100 percent.  Note that setting the target
++memory locality to 100% might cause numad to continually retry to move memory
++that the kernel will never successfully move.
+ .TP
+ \fB\-p\fR <\fIPID\fP>
+ Add PID to explicit inclusion list of processes to consider for managing, if
+@@ -102,6 +144,12 @@ processes.  After daemon start, only one
+ process lists per subsequent numad invocation.  Use with \-S and \-p and \-x to
+ precisely control the scope of processes numad can manage.
+ .TP
++\fB\-R\fR <\fICPU_LIST\fP>
++Specify a list of CPUs that numad should assume are reserved for non-numad use.
++No processes will be bound to the specified CPUs by numad.  This option is
++effective only when starting numad.  You cannot change reserved CPUs
++dynamically while numad is already running.
++.TP
+ \fB\-S\fR <\fI0|1\fP>
+ This option controls whether numad scans all system processes or only the
+ processes on the explicit inclusion PID list.  The default is to scan all
+@@ -113,18 +161,30 @@ exclusion list).  Starting numad as
+ .br
+ will limit scanning, and thus also automatic NUMA management, to only those
+ three explicitly specified processes.
+-.TP
++.TP
++\fB\-t\fR  <\fIlogical_CPU_percent\fP>
++Specify the resource value of logical CPUs.  Hardware threads typically share
++most core resources, and so logical CPUs add only a fraction of CPU power for
++many workloads.  By default numad considers logical CPUs to be only 20 percent
++of a dedicated hardware core.
++.TP
+ \fB\-u\fR  <\fItarget_utilization\fP>
+ Set the desired maximum consumption percentage of a node. Default is 85%.
+ Decrease the target value to maintain more available resource margin on each
+ node.  Increase the target value to more exhaustively consume node resources.
+-.TP
++If you have sized your workloads to precisely fit inside a NUMA node,
++specifying (\fI\-u 100\fP) might improve system performance by telling numad to
++go ahead and consume all the resources in each node.  It is possible to specify
++values up to 130 percent to oversubscribe CPUs in the nodes, but memory
++utilization is always capped at 100%.  Use oversubscription values very
++carefully.
++.TP
+ \fB\-v\fR
+ Verbose output in log, sets the log level to LOG_INFO.  Same effect as \fI\-l 6\fP.
+-.TP
++.TP
+ \fB\-V\fR
+ Display version information and exit.
+-.TP
++.TP
+ \fB\-w\fR <\fINCPUS[:MB]\fP>
+ Queries numad for the best NUMA nodes to bind an entity that needs
+ <\fINCPUS\fP>.  The amount of memory (in MBs) is optional, but should normally
+@@ -145,32 +205,37 @@ Add PID to explicit exclusion list of pr
+ Multiple \fI\-x PID\fP options can be specified at daemon start, but after
+ daemon start, only one PID can be added to the exclusion list per subsequent
+ numad invocation.  Use with \-S to precisely control the scope of processes
+-numad can manage.
++numad can manage.
+ .SH "FILES"
+-.LP
+-\fI/usr/bin/numad\fP
+-.br
+-\fI/var/log/numad.log\fP
+-.br
+-\fI/var/run/numad.pid\fP
++.LP
++\fI/usr/bin/numad\fP
++.br
++\fI/etc/numad.conf\fP
++.br
++\fI/var/log/numad.log\fP
++.br
++\fI/var/run/numad.pid\fP
+ .SH "ENVIRONMENT VARIABLES"
+-.LP
+-.TP
++.LP
++.TP
+ None.
+ .SH "EXAMPLES"
+-.LP
+-Numad is normally run as a system daemon and should be managed by the
++.LP
++Numad can be run as a system daemon and can be managed by the
+ standard init mechanisms of the host.
+-.LP
++.LP
+ If interactive (manual) control is desired, you can start the daemon manually by typing:
+-.LP
++.LP
+ /usr/bin/numad
+ .LP
+-Subsequent numad invocations while the daemon is running can be used to dynamically change run-time options.
++Subsequent numad invocations while the daemon is running can be used to dynamically change most run-time options.
++.LP
++You can terminate a running numad daemon by typing:
++.LP
++/usr/bin/numad -i0
+ .SH "AUTHORS"
+-.LP
++.LP
+ Bill Gray <bgray@redhat.com>
+ .SH "SEE ALSO"
+-.LP
++.LP
+ numactl(8)
+-
+diff -rup numad-0.5git/numad.c numad-0.5git-new/numad.c
+--- numad-0.5git/numad.c	2012-12-03 15:40:40.000000000 +0100
++++ numad-0.5git-new/numad.c	2016-08-30 08:45:19.000000000 +0200
+@@ -19,7 +19,7 @@ Inc., 59 Temple Place, Suite 330, Boston
+ */
+
+
+-// Compile with: gcc -O -std=gnu99 -Wall -pthread -o numad numad.c -lrt
++// Compile with: gcc -std=gnu99 -g -Wall -pthread -o numad numad.c -lrt -lm
+
+
+ #define _GNU_SOURCE
+@@ -40,6 +40,10 @@ Inc., 59 Temple Place, Suite 330, Boston
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
++#include <time.h>
++#include <unistd.h>
++#include <values.h>
++
+ #include <sys/ipc.h>
+ #include <sys/mman.h>
+ #include <sys/msg.h>
+@@ -49,26 +53,16 @@ Inc., 59 Temple Place, Suite 330, Boston
+ #include <sys/syslog.h>
+ #include <sys/time.h>
+ #include <sys/types.h>
+-#include <time.h>
+-#include <unistd.h>
+-#include <values.h>
++
++#include <asm/unistd.h>
+
+
+-#define VERSION_STRING "20121130"
++#define VERSION_STRING "20150602"
+
+
+ #define VAR_RUN_FILE "/var/run/numad.pid"
+ #define VAR_LOG_FILE "/var/log/numad.log"
+
+-char *cpuset_dir = NULL;
+-char *cpuset_dir_list[] =  {
+-    NULL,
+-    "/sys/fs/cgroup/cpuset",
+-    "/cgroup/cpuset",
+-    NULL
+-};
+-
+-
+ #define KILOBYTE (1024)
+ #define MEGABYTE (1024 * 1024)
+
+@@ -86,14 +80,11 @@ char *cpuset_dir_list[] =  {
+ #define MAX_INTERVAL 15
+ #define CPU_THRESHOLD     50
+ #define MEMORY_THRESHOLD 300
+-#define TARGET_UTILIZATION_PERCENT 85
+-#define IMPROVEMENT_THRESHOLD_PERCENT 5
+-
++#define DEFAULT_HTT_PERCENT 20
++#define DEFAULT_THP_SCAN_SLEEP_MS 1000
++#define DEFAULT_UTILIZATION_PERCENT 85
++#define DEFAULT_MEMLOCALITY_PERCENT 90
+
+-#define ELIM_NEW_LINE(s) \
+-    if (s[strlen(s) - 1] == '\n') { \
+-        s[strlen(s) - 1] = '\0'; \
+-    }
+
+ #define CONVERT_DIGITS_TO_NUM(p, n) \
+     n = *p++ - '0'; \
+@@ -105,19 +96,36 @@ char *cpuset_dir_list[] =  {
+
+ int num_cpus = 0;
+ int num_nodes = 0;
+-int page_size_in_bytes = 0;
+-int huge_page_size_in_bytes = 0;
++int threads_per_core = 0;
++uint64_t page_size_in_bytes = 0;
++uint64_t huge_page_size_in_bytes = 0;
+
+ int min_interval = MIN_INTERVAL;
+ int max_interval = MAX_INTERVAL;
+-int target_utilization  = TARGET_UTILIZATION_PERCENT;
++int htt_percent = DEFAULT_HTT_PERCENT;
++int thp_scan_sleep_ms = DEFAULT_THP_SCAN_SLEEP_MS;
++int target_utilization  = DEFAULT_UTILIZATION_PERCENT;
++int target_memlocality  = DEFAULT_MEMLOCALITY_PERCENT;
+ int scan_all_processes = 1;
+ int keep_interleaved_memory = 0;
++int use_inactive_file_cache = 1;
+
+ pthread_mutex_t pid_list_mutex;
+ pthread_mutex_t node_info_mutex;
++long sum_CPUs_total = 0;
+ int requested_mbs = 0;
+ int requested_cpus = 0;
++int got_sighup = 0;
++int got_sigterm = 0;
++int got_sigquit = 0;
++
++void sig_handler(int signum) {
++    switch (signum) {
++        case SIGHUP:  got_sighup  = 1; break;
++        case SIGTERM: got_sigterm = 1; break;
++        case SIGQUIT: got_sigquit = 1; break;
++    }
++}
+
+
+
+@@ -139,7 +147,7 @@ void numad_log(int level, const char *fm
+     }
+     char buf[BUF_SIZE];
+     time_t ts = time(NULL);
+-    sprintf(buf, ctime(&ts));
++    strncpy(buf, ctime(&ts), sizeof(buf));
+     char *p = &buf[strlen(buf) - 1];
+     *p++ = ':';
+     *p++ = ' ';
+@@ -155,13 +163,16 @@ void open_log_file() {
+     log_fs = fopen(VAR_LOG_FILE, "a");
+     if (log_fs == NULL) {
+         log_fs = stderr;
+-        numad_log(LOG_ERR, "Cannot open numad log file -- using stderr\n");
++        numad_log(LOG_ERR, "Cannot open numad log file (errno: %d) -- using stderr\n", errno);
+     }
+ }
+
++
+ void close_log_file() {
+     if (log_fs != NULL) {
+-        fclose(log_fs);
++        if (log_fs != stderr) {
++            fclose(log_fs);
++        }
+         log_fs = NULL;
+     }
+ }
+@@ -235,23 +246,32 @@ void send_msg(long dst_pid, long cmd, lo
+
+
+ typedef struct id_list {
+-    // Use CPU_SET(3) <sched.h> cpuset bitmasks,
++    // Use CPU_SET(3) <sched.h> bitmasks,
+     // but bundle size and pointer together
+     // and genericize for both CPU and Node IDs
+     cpu_set_t *set_p;
+     size_t bytes;
+ } id_list_t, *id_list_p;
+
+-#define INIT_ID_LIST(list_p) \
++#define ID_LIST_SET_P(list_p) (list_p->set_p)
++#define ID_LIST_BYTES(list_p) (list_p->bytes)
++
++#define INIT_ID_LIST(list_p, num_elements) \
+     list_p = malloc(sizeof(id_list_t)); \
+     if (list_p == NULL) { numad_log(LOG_CRIT, "INIT_ID_LIST malloc failed\n"); exit(EXIT_FAILURE); } \
+-    list_p->set_p = CPU_ALLOC(num_cpus); \
++    list_p->set_p = CPU_ALLOC(num_elements); \
+     if (list_p->set_p == NULL) { numad_log(LOG_CRIT, "CPU_ALLOC failed\n"); exit(EXIT_FAILURE); } \
+-    list_p->bytes = CPU_ALLOC_SIZE(num_cpus);
++    list_p->bytes = CPU_ALLOC_SIZE(num_elements);
+
+-#define CLEAR_LIST(list_p) \
++#define CLEAR_CPU_LIST(list_p) \
+     if (list_p == NULL) { \
+-        INIT_ID_LIST(list_p); \
++        INIT_ID_LIST(list_p, num_cpus); \
++    } \
++    CPU_ZERO_S(list_p->bytes, list_p->set_p)
++
++#define CLEAR_NODE_LIST(list_p) \
++    if (list_p == NULL) { \
++        INIT_ID_LIST(list_p, num_nodes); \
+     } \
+     CPU_ZERO_S(list_p->bytes, list_p->set_p)
+
+@@ -262,6 +282,9 @@ typedef struct id_list {
+         list_p = NULL; \
+     }
+
++#define COPY_LIST(orig_list_p, copy_list_p) \
++    memcpy(copy_list_p->set_p, orig_list_p->set_p, orig_list_p->bytes)
++
+ #define NUM_IDS_IN_LIST(list_p)     CPU_COUNT_S(list_p->bytes, list_p->set_p)
+ #define ADD_ID_TO_LIST(k, list_p)  CPU_SET_S(k, list_p->bytes, list_p->set_p)
+ #define CLR_ID_IN_LIST(k, list_p)  CPU_CLR_S(k, list_p->bytes, list_p->set_p)
+@@ -272,6 +295,25 @@ typedef struct id_list {
+ #define  OR_LISTS( or_list_p, list_1_p, list_2_p)  CPU_OR_S( or_list_p->bytes,  or_list_p->set_p, list_1_p->set_p, list_2_p->set_p)
+ #define XOR_LISTS(xor_list_p, list_1_p, list_2_p) CPU_XOR_S(xor_list_p->bytes, xor_list_p->set_p, list_1_p->set_p, list_2_p->set_p)
+
++int negate_cpu_list(id_list_p list_p) {
++    if (list_p == NULL) {
++        numad_log(LOG_CRIT, "Cannot negate a NULL list\n");
++        exit(EXIT_FAILURE);
++    }
++    if (num_cpus < 1) {
++        numad_log(LOG_CRIT, "No CPUs to negate in list!\n");
++        exit(EXIT_FAILURE);
++    }
++    for (int ix = 0;  (ix < num_cpus);  ix++) {
++        if (ID_IS_IN_LIST(ix, list_p)) {
++            CLR_ID_IN_LIST(ix, list_p);
++        } else {
++            ADD_ID_TO_LIST(ix, list_p);
++        }
++    }
++    return NUM_IDS_IN_LIST(list_p);
++}
++
+ int add_ids_to_list_from_str(id_list_p list_p, char *s) {
+     if (list_p == NULL) {
+         numad_log(LOG_CRIT, "Cannot add to NULL list\n");
+@@ -352,9 +394,21 @@ typedef struct node_data {
+     uint8_t *distance;
+     id_list_p cpu_list_p;
+ } node_data_t, *node_data_p;
+-
+ node_data_p node = NULL;
+
++int min_node_CPUs_free_ix = -1;
++int min_node_MBs_free_ix = -1;
++long min_node_CPUs_free = MAXINT;
++long min_node_MBs_free = MAXINT;
++long max_node_CPUs_free = 0;
++long max_node_MBs_free = 0;
++long avg_node_CPUs_free = 0;
++long avg_node_MBs_free = 0;
++double stddev_node_CPUs_free = 0.0;
++double stddev_node_MBs_free = 0.0;
++
++
++
+ // RING_BUF_SIZE must be a power of two
+ #define RING_BUF_SIZE 8
+
+@@ -366,14 +420,15 @@ typedef struct process_data {
+     uint64_t data_time_stamp; // hundredths of seconds
+     uint64_t bind_time_stamp;
+     uint64_t num_threads;
++    uint64_t MBs_size;
+     uint64_t MBs_used;
+     uint64_t cpu_util;
+     uint64_t CPUs_used;  // scaled * ONE_HUNDRED
+     uint64_t CPUs_used_ring_buf[RING_BUF_SIZE];
+     int ring_buf_ix;
+-    int dup_bind_count;
+     char *comm;
+-    char *cpuset_name;
++    id_list_p node_list_p;
++    uint64_t *process_MBs;
+ } process_data_t, *process_data_p;
+
+
+@@ -433,7 +488,8 @@ int process_hash_insert(int pid) {
+ }
+
+ int process_hash_update(process_data_p newp) {
+-    // This updates hash table stats for processes we are monitoring
++    // This updates hash table stats for processes we are monitoring. Only the
++    // scalar resource consumption stats need to be updated here.
+     int new_hash_table_entry = 1;
+     int ix = process_hash_insert(newp->pid);
+     if (ix >= 0) {
+@@ -460,6 +516,7 @@ int process_hash_update(process_data_p n
+             }
+             p->comm = strdup(newp->comm);
+         }
++        p->MBs_size = newp->MBs_size;
+         p->MBs_used = newp->MBs_used;
+         p->cpu_util = newp->cpu_util;
+         p->num_threads = newp->num_threads;
+@@ -468,6 +525,11 @@ int process_hash_update(process_data_p n
+     return new_hash_table_entry;
+ }
+
++void process_hash_clear_all_bind_time_stamps() {
++    for (int ix = 0;  (ix < process_hash_table_size);  ix++) {
++        process_hash_table[ix].bind_time_stamp = 0;
++    }
++}
+
+ int process_hash_rehash(int old_ix) {
+     // Given the index of a table entry that would otherwise be orphaned by
+@@ -489,7 +551,8 @@ int process_hash_remove(int pid) {
+         // remove the target
+         process_data_p dp = &process_hash_table[ix];
+         if (dp->comm) { free(dp->comm); }
+-        if (dp->cpuset_name) { free(dp->cpuset_name); }
++        if (dp->process_MBs) { free(dp->process_MBs); }
++        FREE_LIST(dp->node_list_p);
+         memset(dp, 0, sizeof(process_data_t));
+         // bubble up the collision chain and rehash if neeeded
+         for (;;) {
+@@ -543,15 +606,15 @@ void process_hash_table_dump() {
+         process_data_p p = &process_hash_table[ix];
+         if (p->pid) {
+             numad_log(LOG_DEBUG,
+-                "ix: %d  PID: %d %s  Thds: %d  CPU %ld  MBs: %ld Data TS: %ld  Bind TS: %ld\n",
++                "ix: %d  PID: %d %s  Thds: %d  CPU %ld  MBs: %ld/%ld Data TS: %ld  Bind TS: %ld\n",
+                 ix, p->pid, ((p->comm != NULL) ? p->comm : "(Null)"), p->num_threads,
+-                p->CPUs_used, p->MBs_used, p->data_time_stamp, p->bind_time_stamp);
++                p->CPUs_used, p->MBs_used, p->MBs_size, p->data_time_stamp, p->bind_time_stamp);
++            // FIXME: make this dump every field, but this is not even currently used
+         }
+     }
+ }
+
+ void process_hash_table_cleanup(uint64_t update_time) {
+-    int cpusets_removed = 0;
+     int num_hash_entries_used = 0;
+     for (int ix = 0;  (ix < process_hash_table_size);  ix++) {
+         process_data_p p = &process_hash_table[ix];
+@@ -562,34 +625,14 @@ void process_hash_table_cleanup(uint64_t
+                 p->data_time_stamp = 0;
+                 p->CPUs_used = 0;
+                 // Check for dead pids and remove them...
+-                char fname[FNAME_SIZE];
+-                snprintf(fname, FNAME_SIZE, "/proc/%d", p->pid);
+-                if (access(fname, F_OK) < 0) {
+-                    // Seems dead.  Forget this pid -- after first checking
+-                    // and removing obsolete numad.PID cpuset directories.
+-                    snprintf(fname, FNAME_SIZE, "%s/numad.%d", cpuset_dir, p->pid);
+-                    if (access(fname, F_OK) == 0) {
+-                        numad_log(LOG_NOTICE, "Removing obsolete cpuset: %s\n", fname);
+-                        int rc = rmdir(fname);
+-                        if (rc >= 0) {
+-                            cpusets_removed += 1;
+-                        } else {
+-                            numad_log(LOG_ERR, "bad cpuset rmdir\n");
+-                            // exit(EXIT_FAILURE);
+-                        }
+-                    }
++                if ((kill(p->pid, 0) == -1) && (errno == ESRCH)) {
++                    // Seems dead.  Forget this pid
+                     process_hash_remove(p->pid);
+                     num_hash_entries_used -= 1;
+                 }
+             }
+         }
+     }
+-    if (cpusets_removed > 0) {
+-        // Expire all the duplicate bind counts so things will be re-evaluated sooner.
+-        for (int ix = 0;  (ix < process_hash_table_size);  ix++) {
+-            process_hash_table[ix].dup_bind_count = 0;
+-        }
+-    }
+     // Keep hash table approximately half empty
+     if ((num_hash_entries_used * 7) / 4 > process_hash_table_size) {
+         process_hash_table_expand();
+@@ -610,9 +653,7 @@ pid_list_p insert_pid_into_pid_list(pid_
+     if (process_hash_table != NULL) {
+         int hash_ix = process_hash_lookup(pid);
+         if ((hash_ix >= 0) && (list_ptr == include_pid_list)) {
+-            // Clear dup_bind_count and interleaved flag,
+-            // in case user wants it to be re-evaluated soon
+-            process_hash_table[hash_ix].dup_bind_count = 0;
++            // Clear interleaved flag, in case user wants it to be re-evaluated
+             process_hash_table[hash_ix].flags &= ~PROCESS_FLAG_INTERLEAVED;
+         }
+     }
+@@ -678,18 +719,23 @@ void print_version_and_exit(char *prog_n
+
+ void print_usage_and_exit(char *prog_name) {
+     fprintf(stderr, "Usage: %s <options> ...\n", prog_name);
++    fprintf(stderr, "-C 1  to count inactive file cache as available memory (default 1)\n");
++    fprintf(stderr, "-C 0  to count inactive file cache memory as unavailable (default 1)\n");
+     fprintf(stderr, "-d for debug logging (same effect as '-l 7')\n");
+-    fprintf(stderr, "-D <CGROUP_MOUNT_POINT> to specify cgroup mount point\n");
+     fprintf(stderr, "-h to print this usage info\n");
++    fprintf(stderr, "-H <N> to set THP scan_sleep_ms (default %d)\n", DEFAULT_THP_SCAN_SLEEP_MS);
+     fprintf(stderr, "-i [<MIN>:]<MAX> to specify interval seconds\n");
+-    fprintf(stderr, "-K 1  to keep interleaved memory spread across nodes\n");
+-    fprintf(stderr, "-K 0  to merge interleaved memory to local NUMA nodes\n");
+-    fprintf(stderr, "-l <N> to specify logging level (usually 5, 6, or 7)\n");
++    fprintf(stderr, "-K 1  to keep interleaved memory spread across nodes (default 0)\n");
++    fprintf(stderr, "-K 0  to merge interleaved memory to local NUMA nodes (default 0)\n");
++    fprintf(stderr, "-l <N> to specify logging level (usually 5, 6, or 7 -- default 5)\n");
++    fprintf(stderr, "-m <N> to specify memory locality target percent (default %d)\n", DEFAULT_MEMLOCALITY_PERCENT);
+     fprintf(stderr, "-p <PID> to add PID to inclusion pid list\n");
+     fprintf(stderr, "-r <PID> to remove PID from explicit pid lists\n");
+-    fprintf(stderr, "-S 1  to scan all processes\n");
+-    fprintf(stderr, "-S 0  to scan only explicit PID list processes\n");
+-    fprintf(stderr, "-u <N> to specify target utilization percent (default 85)\n");
++    fprintf(stderr, "-R <CPU_LIST> to reserve some CPUs for non-numad use\n");
++    fprintf(stderr, "-S 1  to scan all processes (default 1)\n");
++    fprintf(stderr, "-S 0  to scan only explicit PID list processes (default 1)\n");
++    fprintf(stderr, "-t <N> to specify thread / logical CPU valuation percent (default %d)\n", DEFAULT_HTT_PERCENT);
++    fprintf(stderr, "-u <N> to specify utilization target percent (default %d)\n", DEFAULT_UTILIZATION_PERCENT);
+     fprintf(stderr, "-v for verbose  (same effect as '-l 6')\n");
+     fprintf(stderr, "-V to show version info\n");
+     fprintf(stderr, "-w <CPUs>[:<MBs>] for NUMA node suggestions\n");
+@@ -698,62 +744,35 @@ void print_usage_and_exit(char *prog_nam
+ }
+
+
+-void check_prereqs(char *prog_name) {
+-    // Verify cpusets are available on this system.
+-    char **dir = &cpuset_dir_list[0];
+-    if (*dir == NULL) { dir++; }
+-    while (*dir != NULL) {
+-        cpuset_dir = *dir;
+-        char fname[FNAME_SIZE];
+-        snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", cpuset_dir);
+-        if (access(fname, F_OK) == 0) {
+-            break;
+-        }
+-        dir++;
+-    }
+-    if (*dir == NULL) {
+-        fprintf(stderr, "\n");
+-        fprintf(stderr, "Are CPUSETs enabled on this system?\n");
+-        fprintf(stderr, "They are required for %s to function.\n\n", prog_name);
+-        fprintf(stderr, "Check manpage CPUSET(7). You might need to do something like:\n");
+-        fprintf(stderr, "    # mkdir <DIRECTORY_MOUNT_POINT>\n");
+-        fprintf(stderr, "    # mount cgroup -t cgroup -o cpuset <DIRECTORY_MOUNT_POINT>\n");
+-        fprintf(stderr, "    where <DIRECTORY_MOUNT_POINT> is something like:\n");
+-        dir = &cpuset_dir_list[0];
+-        if (*dir == NULL) { dir++; }
+-        while (*dir != NULL) {
+-            fprintf(stderr, "      - %s\n", *dir);
+-            dir++;
+-        }
+-        fprintf(stderr, "and then try again...\n");
+-        fprintf(stderr, "Or, use '-D <DIRECTORY_MOUNT_POINT>' to specify the correct mount point\n");
+-        fprintf(stderr, "\n");
+-        exit(EXIT_FAILURE);
++void set_thp_scan_sleep_ms(int new_ms) {
++    if (new_ms < 1) {
++        // 0 means do not change the system default
++        return;
+     }
+-    // Check on THP scan sleep time.
+-    char *thp_scan_fname = "/sys/kernel/mm/redhat_transparent_hugepage/khugepaged/scan_sleep_millisecs";
+-    int fd = open(thp_scan_fname, O_RDONLY, 0);
++    char *thp_scan_fname = "/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs";
++    int fd = open(thp_scan_fname, O_RDWR, 0);
+     if (fd >= 0) {
+-        int ms;
+         char buf[BUF_SIZE];
+         int bytes = read(fd, buf, BUF_SIZE);
+-        close(fd);
+         if (bytes > 0) {
++            buf[(bytes < BUF_SIZE) ? bytes : (BUF_SIZE - 1)] = '\0';  // read() may fill buf completely; keep the terminator in bounds
++            int cur_ms;
+             char *p = buf;
+-            CONVERT_DIGITS_TO_NUM(p, ms);
+-            if (ms > 150) {
+-                fprintf(stderr, "\n");
+-                numad_log(LOG_NOTICE, "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms);
+-                fprintf(stderr,       "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms);
+-                fprintf(stderr, "Consider increasing the frequency of THP scanning,\n");
+-                fprintf(stderr, "by echoing a smaller number (e.g. 100) to %s\n", thp_scan_fname);
+-                fprintf(stderr, "to more aggressively (re)construct THPs.  For example:\n");
+-                fprintf(stderr, "# echo 100 > /sys/kernel/mm/redhat_transparent_hugepage/khugepaged/scan_sleep_millisecs\n");
+-                fprintf(stderr, "\n");
++            CONVERT_DIGITS_TO_NUM(p, cur_ms);
++            if (cur_ms != new_ms) {
++                lseek(fd, 0, SEEK_SET);
++                numad_log(LOG_NOTICE, "Changing THP scan time in %s from %d to %d ms.\n", thp_scan_fname, cur_ms, new_ms);
++                sprintf(buf, "%d\n", new_ms);
++                write(fd, buf, strlen(buf));
+             }
+         }
++        close(fd);
+     }
+-    // FIXME: ?? check for enabled ksmd, and recommend disabling ksm?
++}
++
++void check_prereqs(char *prog_name) {
++    // Adjust kernel tunable to scan for THP more frequently...
++    set_thp_scan_sleep_ms(thp_scan_sleep_ms);
+ }
+
+
+@@ -785,7 +804,6 @@ int get_daemon_pid() {
+     return pid;
+ }
+
+-
+ int register_numad_pid() {
+     int pid;
+     char buf[BUF_SIZE];
+@@ -831,6 +849,43 @@ fail_numad_run_file:
+ }
+
+
++int count_set_bits_in_hex_list_file(char *fname) {
++    int sum = 0;
++    int fd = open(fname, O_RDONLY, 0);
++    if (fd >= 0) {
++        char buf[BUF_SIZE];
++        int bytes = read(fd, buf, BUF_SIZE);
++        close(fd);
++        for (int ix = 0;  (ix < bytes);  ix++) {
++            char c = tolower(buf[ix]);
++            switch (c) {
++                case '0'  : sum += 0; break;
++                case '1'  : sum += 1; break;
++                case '2'  : sum += 1; break;
++                case '3'  : sum += 2; break;
++                case '4'  : sum += 1; break;
++                case '5'  : sum += 2; break;
++                case '6'  : sum += 2; break;
++                case '7'  : sum += 3; break;
++                case '8'  : sum += 1; break;
++                case '9'  : sum += 2; break;
++                case 'a'  : sum += 2; break;
++                case 'b'  : sum += 3; break;
++                case 'c'  : sum += 2; break;
++                case 'd'  : sum += 3; break;
++                case 'e'  : sum += 3; break;
++                case 'f'  : sum += 4; break;
++                case ' '  : sum += 0; break;
++                case ','  : sum += 0; break;
++                case '\n' : sum += 0; break;
++                default : numad_log(LOG_CRIT, "Unexpected character in list\n"); exit(EXIT_FAILURE);
++            }
++        }
++    }
++    return sum;
++}
++
++
+ int get_num_cpus() {
+     int n1 = sysconf(_SC_NPROCESSORS_CONF);
+     int n2 = sysconf(_SC_NPROCESSORS_ONLN);
+@@ -848,7 +903,7 @@ int get_num_cpus() {
+ int get_num_kvm_vcpu_threads(int pid) {
+     // Try to return the number of vCPU threads for this VM guest,
+     // excluding the IO threads.  All failures return MAXINT.
+-    // FIXME: figure out some better way to do this...
++    // FIXME: someday figure out some better way to do this...
+     char fname[FNAME_SIZE];
+     snprintf(fname, FNAME_SIZE, "/proc/%d/cmdline", pid);
+     int fd = open(fname, O_RDONLY, 0);
+@@ -876,8 +931,8 @@ int get_num_kvm_vcpu_threads(int pid) {
+ }
+
+
+-int get_huge_page_size_in_bytes() {
+-    int huge_page_size = 0;;
++uint64_t get_huge_page_size_in_bytes() {
++    uint64_t huge_page_size = 0;;
+     FILE *fs = fopen("/proc/meminfo", "r");
+     if (!fs) {
+         numad_log(LOG_CRIT, "Can't open /proc/meminfo\n");
+@@ -890,7 +945,7 @@ int get_huge_page_size_in_bytes() {
+             while ((!isdigit(*p)) && (p < buf + BUF_SIZE)) {
+                 p++;
+             }
+-            huge_page_size = atoi(p);
++            huge_page_size = atol(p);
+             break;
+         }
+     }
+@@ -916,143 +971,134 @@ static int name_starts_with_digit(const
+ }
+
+
+-int bind_process_and_migrate_memory(int pid, char *cpuset_name, id_list_p node_list_p, id_list_p cpu_list_p) {
+-    // Check basic parameter validity.
+-    if (pid <= 0) {
++
++#define BITS_IN_LONG (CHAR_BIT * sizeof(unsigned long))  // bits per mask word; the 1UL below must be as wide as one word
++#define   SET_BIT(i,a)   (a)[(i) / BITS_IN_LONG] |=  (1UL << ((i) % BITS_IN_LONG))
++#define  TEST_BIT(i,a) (((a)[(i) / BITS_IN_LONG] &   (1UL << ((i) % BITS_IN_LONG))) != 0)
++#define CLEAR_BIT(i,a)   (a)[(i) / BITS_IN_LONG] &= ~(1UL << ((i) % BITS_IN_LONG))
++
++int bind_process_and_migrate_memory(process_data_p p) {
++    uint64_t t0 = get_time_stamp();
++    // Parameter p is a pointer to an element in the hash table
++    if ((!p) || (p->pid < 1)) {
+         numad_log(LOG_CRIT, "Bad PID to bind\n");
+         exit(EXIT_FAILURE);
+     }
+-    if ((cpuset_name == NULL) || (strlen(cpuset_name) == 0)) {
+-        numad_log(LOG_CRIT, "Bad cpuset name to bind\n");
+-        exit(EXIT_FAILURE);
+-    }
+-    int nodes;
+-    if ((node_list_p == NULL) || ((nodes = NUM_IDS_IN_LIST(node_list_p)) == 0)) {
+-        numad_log(LOG_CRIT, "Cannot bind to unspecified node\n");
++    if (!p->node_list_p) {
++        numad_log(LOG_CRIT, "Cannot bind to unspecified node(s)\n");
+         exit(EXIT_FAILURE);
+     }
+-    // Cpu_list_p is optional and may be NULL...
+-    // Generate CPU id list from the specified node list if necessary
+-    if (cpu_list_p == NULL) {
+-        static id_list_p tmp_cpu_list_p;
+-        CLEAR_LIST(tmp_cpu_list_p);
+-        int node_id = 0;
+-        while (nodes) {
+-            if (ID_IS_IN_LIST(node_id, node_list_p)) {
+-                OR_LISTS(tmp_cpu_list_p, tmp_cpu_list_p, node[node_id].cpu_list_p);
+-                nodes -= 1;
+-            }
+-            node_id += 1;
+-        }
+-        cpu_list_p = tmp_cpu_list_p;
+-    }
+-    // Make the cpuset directory if necessary
+-    char cpuset_name_buf[FNAME_SIZE];
+-    snprintf(cpuset_name_buf, FNAME_SIZE, "%s%s", cpuset_dir, cpuset_name);
+-    char *p = &cpuset_name_buf[strlen(cpuset_dir)];
+-    if (!strcmp(p, "/")) {
+-        // Make a cpuset directory for this process
+-        snprintf(cpuset_name_buf, FNAME_SIZE, "%s/numad.%d", cpuset_dir, pid);
+-        numad_log(LOG_NOTICE, "Making new cpuset: %s\n", cpuset_name_buf);
+-        int rc = mkdir(cpuset_name_buf, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+-        if (rc == -1) {
+-            numad_log(LOG_CRIT, "Bad cpuset mkdir -- errno: %d\n", errno);
+-            return 0;
++    // Generate CPU list derived from target node list.
++    static id_list_p cpu_bind_list_p;
++    CLEAR_CPU_LIST(cpu_bind_list_p);
++    int nodes = NUM_IDS_IN_LIST(p->node_list_p);
++    int node_id = 0;
++    while (nodes) {
++        if (ID_IS_IN_LIST(node_id, p->node_list_p)) {
++            OR_LISTS(cpu_bind_list_p, cpu_bind_list_p, node[node_id].cpu_list_p);
++            nodes -= 1;
+         }
++        node_id += 1;
+     }
+-    cpuset_name = cpuset_name_buf;
+-    // Now that we have a cpuset for pid and a populated cpulist,
+-    // start the actual binding and migration.
+-    uint64_t t0 = get_time_stamp();
+-
+-    // Write "1" out to cpuset.memory_migrate file
+     char fname[FNAME_SIZE];
+-    snprintf(fname, FNAME_SIZE, "%s/cpuset.memory_migrate", cpuset_name);
+-    int fd = open(fname, O_WRONLY | O_TRUNC, 0);
+-    if (fd == -1) {
+-        numad_log(LOG_CRIT, "Could not open cpuset.memory_migrate -- errno: %d\n", errno);
+-        return 0;
+-    }
+-    write(fd, "1", 1);
+-    close(fd);
+-
+-    // Write node IDs out to cpuset.mems file
+-    char node_list_buf[BUF_SIZE];
+-    snprintf(fname, FNAME_SIZE, "%s/cpuset.mems", cpuset_name);
+-    fd = open(fname, O_WRONLY | O_TRUNC, 0);
+-    if (fd == -1) {
+-        numad_log(LOG_CRIT, "Could not open cpuset.mems -- errno: %d\n", errno);
+-        return 0;
+-    }
+-    int len = str_from_id_list(node_list_buf, BUF_SIZE, node_list_p);
+-    write(fd, node_list_buf, len);
+-    close(fd);
+-
+-    // Write CPU IDs out to cpuset.cpus file
+-    char cpu_list_buf[BUF_SIZE];
+-    snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", cpuset_name);
+-    fd = open(fname, O_WRONLY | O_TRUNC, 0);
+-    if (fd == -1) {
+-        numad_log(LOG_CRIT, "Could not open cpuset.cpus -- errno: %d\n", errno);
+-        return 0;
+-    }
+-    len = str_from_id_list(cpu_list_buf, BUF_SIZE, cpu_list_p);
+-    write(fd, cpu_list_buf, len);
+-    close(fd);
+-
+-    // Copy pid tasks one at a time to tasks file
+-    snprintf(fname, FNAME_SIZE, "%s/tasks", cpuset_name);
+-    fd = open(fname, O_WRONLY | O_TRUNC, 0);
+-    if (fd == -1) {
+-        numad_log(LOG_CRIT, "Could not open tasks -- errno: %d\n", errno);
+-        return 0;
+-    }
+-    snprintf(fname, FNAME_SIZE, "/proc/%d/task", pid);
+     struct dirent **namelist;
+-    int files = scandir(fname, &namelist, name_starts_with_digit, NULL);
+-    if (files < 0) {
+-        numad_log(LOG_WARNING, "Could not scandir task list\n");
++    snprintf(fname, FNAME_SIZE, "/proc/%d/task", p->pid);
++    int num_tasks = scandir(fname, &namelist, name_starts_with_digit, NULL);
++    if (num_tasks <= 0) {
++        numad_log(LOG_WARNING, "Could not scandir task list for PID: %d\n", p->pid);
+         return 0;  // Assume the process terminated
+     }
+-    for (int ix = 0;  (ix < files);  ix++) {
+-        // copy pid tasks, one at a time
+-        numad_log(LOG_NOTICE, "Including task: %s\n", namelist[ix]->d_name);
+-        write(fd, namelist[ix]->d_name, strlen(namelist[ix]->d_name));
+-        free(namelist[ix]);
++    // Set the affinity of each task in the process...
++    for (int namelist_ix = 0;  (namelist_ix < num_tasks);  namelist_ix++) {
++        int tid = atoi(namelist[namelist_ix]->d_name);
++        int rc = sched_setaffinity(tid, ID_LIST_BYTES(cpu_bind_list_p), ID_LIST_SET_P(cpu_bind_list_p));
++        if (rc < 0) {
++            // Check errno
++            if (errno == ESRCH) {
++                numad_log(LOG_WARNING, "Tried to move PID %d, TID %d, but it apparently went away.\n", p->pid, tid);
++            }
++            numad_log(LOG_ERR, "Bad sched_setaffinity() on PID %d, TID %d -- errno: %d\n", p->pid, tid, errno);
++        }
++        free(namelist[namelist_ix]);
+     }
+     free(namelist);
+-    close(fd);
+-
+-    uint64_t t1 = get_time_stamp();
++    // Now move the memory to the target nodes....
++    static unsigned long *dest_mask;
++    static unsigned long *from_mask;
++    static int allocated_bytes_in_masks;
++    // Lie about num_nodes being one bigger because of kernel bug...
++    int num_bytes_in_masks = (1 + ((num_nodes + 1) / BITS_IN_LONG)) * sizeof(unsigned long);
++    if (allocated_bytes_in_masks < num_bytes_in_masks) {
++        allocated_bytes_in_masks = num_bytes_in_masks;
++        dest_mask = realloc(dest_mask, num_bytes_in_masks);
++        from_mask = realloc(from_mask, num_bytes_in_masks);
++        if ((dest_mask == NULL) || (from_mask == NULL)) {
++            numad_log(LOG_CRIT, "bit mask malloc failed\n");
++            exit(EXIT_FAILURE);
++        }
++    }
++    // In an effort to put semi-balanced memory in each target node, move the
++    // contents from the source node with the max amount of memory to the
++    // destination node with the least amount of memory.  Repeat until done.
++    int prev_from_node_id = -1;
++    for (;;) {
++        int min_dest_node_id = -1;
++        int max_from_node_id = -1;
++        for (int node_ix = 0;  (node_ix < num_nodes);  node_ix++) {
++            node_id = node[node_ix].node_id;
++            if (ID_IS_IN_LIST(node_id, p->node_list_p)) {
++                if ((min_dest_node_id < 0) || (p->process_MBs[min_dest_node_id] >= p->process_MBs[node_id])) {
++                    // The ">=" above is intentional, so we tend to move memory to higher numbered nodes
++                    min_dest_node_id = node_id;
++                }
++            } else {
++                if ((max_from_node_id < 0) || (p->process_MBs[max_from_node_id] < p->process_MBs[node_id])) {
++                    max_from_node_id = node_id;
++                }
++            }
++        }
++        if ((max_from_node_id < 0) || (min_dest_node_id < 0) || (p->process_MBs[max_from_node_id] == 0) || (max_from_node_id == prev_from_node_id)) {  // -1 means no source (or no destination) node exists; never index process_MBs with it
++            break;
++        }
++        memset(dest_mask, 0, num_bytes_in_masks);
++        memset(from_mask, 0, num_bytes_in_masks);
++        SET_BIT(max_from_node_id, from_mask);
++        SET_BIT(min_dest_node_id, dest_mask);
++        numad_log(LOG_DEBUG, "Moving memory from node: %d to node %d\n", max_from_node_id, min_dest_node_id);
++        // Lie about num_nodes being one bigger because of kernel bug...
++        int rc = syscall(__NR_migrate_pages, p->pid, num_nodes + 1, from_mask, dest_mask);
++        if (rc > 2) {
++            // rc == the number of pages that could not be moved.
++            // A couple pages not moving is probably not a problem, hence ignoring rc == 1 or 2.
++            numad_log(LOG_WARNING, "Tried to move PID %d, but %d pages would not move.\n", p->pid, rc);
++        } else if (rc < 0) {
++            // Check errno
++            if (errno == ESRCH) {
++                numad_log(LOG_WARNING, "Tried to move PID %d, but it apparently went away.\n", p->pid);
++                return 0;  // Assume the process terminated
++            }
++        }
++        // Assume memory did move for current accounting purposes...
++        p->process_MBs[min_dest_node_id] += p->process_MBs[max_from_node_id];
++        p->process_MBs[max_from_node_id] = 0;
++        prev_from_node_id = max_from_node_id;
++    }
+     // Check pid still active
+-    snprintf(fname, FNAME_SIZE, "/proc/%d", pid);
++    snprintf(fname, FNAME_SIZE, "/proc/%d", p->pid);
+     if (access(fname, F_OK) < 0) {
+-        numad_log(LOG_WARNING, "Could not migrate pid\n");
+-        return 0;  // Assume the process terminated
++        numad_log(LOG_WARNING, "Could not migrate pid %d.  Apparently it went away.\n", p->pid);
++        return 0;
++    } else {
++        uint64_t t1 = get_time_stamp();
++        p->bind_time_stamp = t1;
++        char node_list_str[BUF_SIZE];
++        str_from_id_list(node_list_str, BUF_SIZE, p->node_list_p);
++        numad_log(LOG_NOTICE, "PID %d moved to node(s) %s in %d.%02d seconds\n", p->pid, node_list_str, (int)((t1-t0)/100), (int)((t1-t0)%100));  // time stamps are in 1/100 s; cast to match %d and zero-pad the hundredths
++        return 1;
+     }
+-    numad_log(LOG_NOTICE, "PID %d moved to node(s) %s in %d.%d seconds\n", pid, node_list_buf, (t1-t0)/100, (t1-t0)%100);
+-    return 1;
+ }
+
+
+-void show_nodes() {
+-    time_t ts = time(NULL);
+-    fprintf(log_fs, "%s", ctime(&ts));
+-    fprintf(log_fs, "Nodes: %d\n", num_nodes);
+-    for (int ix = 0;  (ix < num_nodes);  ix++) {
+-        fprintf(log_fs, "Node %d: MBs_total %ld, MBs_free %6ld, CPUs_total %ld, CPUs_free %4ld,  Distance: ",
+-            ix, node[ix].MBs_total, node[ix].MBs_free, node[ix].CPUs_total, node[ix].CPUs_free);
+-        for (int d = 0;  (d < num_nodes);  d++) {
+-            fprintf(log_fs, "%d ", node[ix].distance[d]);
+-        }
+-        char buf[BUF_SIZE];
+-        str_from_id_list(buf, BUF_SIZE, node[ix].cpu_list_p);
+-        fprintf(log_fs, " CPUs: %s\n", buf);
+-    }
+-    fprintf(log_fs, "\n");
+-    fflush(log_fs);
+-}
+-
+
+ typedef struct cpu_data {
+     uint64_t time_stamp;
+@@ -1062,10 +1108,9 @@ typedef struct cpu_data {
+ cpu_data_t cpu_data_buf[2];  // Two sets, to calc deltas
+ int cur_cpu_data_buf = 0;
+
+-
+ void update_cpu_data() {
+     // Parse idle percents from CPU stats in /proc/stat cpu<N> lines
+-    static FILE *fs = NULL;
++    static FILE *fs;
+     if (fs != NULL) {
+         rewind(fs);
+     } else {
+@@ -1107,14 +1152,14 @@ void update_cpu_data() {
+             while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; }  // skip nice
+             while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; }  // skip system
+             while (!isdigit(*p)) { p++; }
+-            uint64_t idle = *p++ - '0'; while (isdigit(*p)) { idle *= 10; idle += (*p++ - '0'); }
++            uint64_t idle;
++            CONVERT_DIGITS_TO_NUM(p, idle);
+             cpu_data_buf[new].idle[cpu_id] = idle;
+         }
+     }
+     cur_cpu_data_buf = new;
+ }
+
+-
+ int node_and_digits(const struct dirent *dptr) {
+     char *p = (char *)(dptr->d_name);
+     if (*p++ != 'n') return 0;
+@@ -1129,10 +1174,31 @@ int node_and_digits(const struct dirent
+ }
+
+
++uint64_t node_info_time_stamp = 0;
+ id_list_p all_cpus_list_p = NULL;
+ id_list_p all_nodes_list_p = NULL;
+-uint64_t node_info_time_stamp = 0;
++id_list_p reserved_cpu_mask_list_p = NULL;
++char *reserved_cpu_str = NULL;
+
++void show_nodes() {
++    fprintf(log_fs, "\n");
++    numad_log(LOG_INFO, "Nodes: %d\n", num_nodes);
++    fprintf(log_fs, "Min CPUs free: %ld, Max CPUs: %ld, Avg CPUs: %ld, StdDev: %lg\n",
++        min_node_CPUs_free, max_node_CPUs_free, avg_node_CPUs_free, stddev_node_CPUs_free);
++    fprintf(log_fs, "Min MBs free: %ld, Max MBs: %ld, Avg MBs: %ld, StdDev: %lg\n",
++        min_node_MBs_free, max_node_MBs_free, avg_node_MBs_free, stddev_node_MBs_free);
++    for (int ix = 0;  (ix < num_nodes);  ix++) {
++        fprintf(log_fs, "Node %d: MBs_total %ld, MBs_free %6ld, CPUs_total %ld, CPUs_free %4ld,  Distance: ",
++            ix, node[ix].MBs_total, node[ix].MBs_free, node[ix].CPUs_total, node[ix].CPUs_free);
++        for (int d = 0;  (d < num_nodes);  d++) {
++            fprintf(log_fs, "%d ", node[ix].distance[d]);
++        }
++        char buf[BUF_SIZE];
++        str_from_id_list(buf, BUF_SIZE, node[ix].cpu_list_p);
++        fprintf(log_fs, " CPUs: %s\n", buf);
++    }
++    fflush(log_fs);
++}
+
+ int update_nodes() {
+     char fname[FNAME_SIZE];
+@@ -1141,6 +1207,7 @@ int update_nodes() {
+     uint64_t time_stamp = get_time_stamp();
+ #define STATIC_NODE_INFO_DELAY (600 * ONE_HUNDRED)
+     if ((num_nodes == 0) || (node_info_time_stamp + STATIC_NODE_INFO_DELAY < time_stamp)) {
++        node_info_time_stamp = time_stamp;
+         // Count directory names of the form: /sys/devices/system/node/node<N>
+         struct dirent **namelist;
+         int num_files = scandir ("/sys/devices/system/node", &namelist, node_and_digits, NULL);
+@@ -1167,8 +1234,15 @@ int update_nodes() {
+             }
+             num_nodes = num_files;
+         }
+-        CLEAR_LIST(all_cpus_list_p);
+-        CLEAR_LIST(all_nodes_list_p);
++        sum_CPUs_total = 0;
++        CLEAR_CPU_LIST(all_cpus_list_p);
++        CLEAR_NODE_LIST(all_nodes_list_p);
++        // Figure out how many threads per core there are (for later discounting of hyper-threads)
++        threads_per_core = count_set_bits_in_hex_list_file("/sys/devices/system/cpu/cpu0/topology/thread_siblings");
++        if (threads_per_core < 1) {
++            numad_log(LOG_CRIT, "Could not count threads per core\n");
++            exit(EXIT_FAILURE);
++        }
+         // For each "node<N>" filename present, save <N> in node[ix].node_id
+         // Note that the node id might not necessarily match the node ix.
+         // Also populate the cpu lists and distance vectors for this node.
+@@ -1184,11 +1258,24 @@ int update_nodes() {
+             snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/cpulist", node_id);
+             int fd = open(fname, O_RDONLY, 0);
+             if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) {
++                buf[BIG_BUF_SIZE - 1] = '\0';
+                 // get cpulist from the cpulist string
+-                CLEAR_LIST(node[node_ix].cpu_list_p);
++                CLEAR_CPU_LIST(node[node_ix].cpu_list_p);
+                 int n = add_ids_to_list_from_str(node[node_ix].cpu_list_p, buf);
++                if (reserved_cpu_str != NULL) {
++                    AND_LISTS(node[node_ix].cpu_list_p, node[node_ix].cpu_list_p, reserved_cpu_mask_list_p);
++                    n = NUM_IDS_IN_LIST(node[node_ix].cpu_list_p);
++                }
+                 OR_LISTS(all_cpus_list_p, all_cpus_list_p, node[node_ix].cpu_list_p);
+-                node[node_ix].CPUs_total = n * ONE_HUNDRED;
++                // Calculate total CPUs, but possibly discount hyper-threads
++                if ((threads_per_core == 1) || (htt_percent >= 100)) {
++                    node[node_ix].CPUs_total = n * ONE_HUNDRED;
++                } else {
++                    n /= threads_per_core;
++                    node[node_ix].CPUs_total = n * ONE_HUNDRED;
++                    node[node_ix].CPUs_total += n * (threads_per_core - 1) * htt_percent;
++                }
++                sum_CPUs_total += node[node_ix].CPUs_total;
+                 close(fd);
+             } else {
+                 numad_log(LOG_CRIT, "Could not get node cpu list\n");
+@@ -1220,15 +1307,30 @@ int update_nodes() {
+         }
+         free(namelist);
+     }
+-    // Second, get the dynamic free memory and available CPU capacity
++    // Second, update the dynamic free memory and available CPU capacity
++    while (cpu_data_buf[cur_cpu_data_buf].time_stamp + 7 >= time_stamp) {
++        // Make sure at least 7/100 of a second has passed.
++        // Otherwise sleep for 1/10 second.
++	struct timespec ts = { 0, 100000000 };
++	nanosleep(&ts, &ts);
++	time_stamp = get_time_stamp();
++    }
+     update_cpu_data();
++    max_node_MBs_free = 0;
++    max_node_CPUs_free = 0;
++    min_node_MBs_free = MAXINT;
++    min_node_CPUs_free = MAXINT;
++    uint64_t sum_of_node_MBs_free = 0;
++    uint64_t sum_of_node_CPUs_free = 0;
+     for (int node_ix = 0;  (node_ix < num_nodes);  node_ix++) {
+         int node_id = node[node_ix].node_id;
+         // Get available memory info from node<N>/meminfo file
+         snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/meminfo", node_id);
+         int fd = open(fname, O_RDONLY, 0);
+         if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) {
++            close(fd);
+             uint64_t KB;
++            buf[BIG_BUF_SIZE - 1] = '\0';
+             char *p = strstr(buf, "MemTotal:");
+             if (p != NULL) {
+                 p += 9;
+@@ -1238,7 +1340,11 @@ int update_nodes() {
+             }
+             while (!isdigit(*p)) { p++; }
+             CONVERT_DIGITS_TO_NUM(p, KB);
+-            node[node_ix].MBs_total = KB / KILOBYTE;
++            node[node_ix].MBs_total = (KB / KILOBYTE);
++            if (node[node_ix].MBs_total < 1) {
++                // If a node has zero memory, remove it from the all_nodes_list...
++                CLR_ID_IN_LIST(node_id, all_nodes_list_p);
++            }
+             p = strstr(p, "MemFree:");
+             if (p != NULL) {
+                 p += 8;
+@@ -1248,8 +1354,28 @@ int update_nodes() {
+             }
+             while (!isdigit(*p)) { p++; }
+             CONVERT_DIGITS_TO_NUM(p, KB);
+-            node[node_ix].MBs_free = KB / KILOBYTE;
+-            close(fd);
++            node[node_ix].MBs_free = (KB / KILOBYTE);
++            if (use_inactive_file_cache) {
++                // Add inactive file cache quantity to "free" memory
++                p = strstr(p, "Inactive(file):");
++                if (p != NULL) {
++                    p += 15;
++                } else {
++                    numad_log(LOG_CRIT, "Could not get node Inactive(file)\n");
++                    exit(EXIT_FAILURE);
++                }
++                while (!isdigit(*p)) { p++; }
++                CONVERT_DIGITS_TO_NUM(p, KB);
++                node[node_ix].MBs_free += (KB / KILOBYTE);
++            }
++            sum_of_node_MBs_free += node[node_ix].MBs_free;
++            if (min_node_MBs_free > node[node_ix].MBs_free) {
++                min_node_MBs_free = node[node_ix].MBs_free;
++                min_node_MBs_free_ix = node[node_ix].node_id;
++            }
++            if (max_node_MBs_free < node[node_ix].MBs_free) {
++                max_node_MBs_free = node[node_ix].MBs_free;
++            }
+         } else {
+             numad_log(LOG_CRIT, "Could not get node meminfo\n");
+             exit(EXIT_FAILURE);
+@@ -1260,7 +1386,8 @@ int update_nodes() {
+         if (cpu_data_buf[old_cpu_data_buf].time_stamp > 0) {
+             uint64_t idle_ticks = 0;
+             int cpu = 0;
+-            int num_cpus_to_process = node[node_ix].CPUs_total / ONE_HUNDRED;
++            int num_lcpus = NUM_IDS_IN_LIST(node[node_ix].cpu_list_p);
++            int num_cpus_to_process = num_lcpus;
+             while (num_cpus_to_process) {
+                 if (ID_IS_IN_LIST(cpu, node[node_ix].cpu_list_p)) {
+                     idle_ticks += cpu_data_buf[cur_cpu_data_buf].idle[cpu]
+@@ -1274,15 +1401,46 @@ int update_nodes() {
+             // printf("Node: %d   CPUs: %ld   time diff %ld   Idle ticks %ld\n", node_id, node[node_ix].CPUs_total, time_diff, idle_ticks);
+             // assert(time_diff > 0);
+             node[node_ix].CPUs_free = (idle_ticks * ONE_HUNDRED) / time_diff;
++            // Possibly discount hyper-threads
++            if ((threads_per_core > 1) && (htt_percent < 100)) {
++                uint64_t htt_discount = (num_lcpus - (num_lcpus / threads_per_core)) * (100 - htt_percent);
++                if (node[node_ix].CPUs_free > htt_discount) {
++                    node[node_ix].CPUs_free -= htt_discount;
++                } else {
++                    node[node_ix].CPUs_free = 0;
++                }
++            }
+             if (node[node_ix].CPUs_free > node[node_ix].CPUs_total) {
+                 node[node_ix].CPUs_free = node[node_ix].CPUs_total;
+             }
++            sum_of_node_CPUs_free += node[node_ix].CPUs_free;
++            if (min_node_CPUs_free > node[node_ix].CPUs_free) {
++                min_node_CPUs_free = node[node_ix].CPUs_free;
++                min_node_CPUs_free_ix = node[node_ix].node_id;
++            }
++            if (max_node_CPUs_free < node[node_ix].CPUs_free) {
++                max_node_CPUs_free = node[node_ix].CPUs_free;
++            }
+             node[node_ix].magnitude = node[node_ix].CPUs_free * node[node_ix].MBs_free;
+         } else {
+             node[node_ix].CPUs_free = 0;
+             node[node_ix].magnitude = 0;
+         }
+     }
++    avg_node_MBs_free = sum_of_node_MBs_free / num_nodes;
++    avg_node_CPUs_free = sum_of_node_CPUs_free / num_nodes;
++    double MBs_variance_sum = 0.0;
++    double CPUs_variance_sum = 0.0;
++    for (int node_ix = 0;  (node_ix < num_nodes);  node_ix++) {
++        double MBs_diff = (double)node[node_ix].MBs_free - (double)avg_node_MBs_free;
++        double CPUs_diff = (double)node[node_ix].CPUs_free - (double)avg_node_CPUs_free;
++        MBs_variance_sum += MBs_diff * MBs_diff;
++        CPUs_variance_sum += CPUs_diff * CPUs_diff;
++    }
++    double MBs_variance = MBs_variance_sum / (num_nodes);
++    double CPUs_variance = CPUs_variance_sum / (num_nodes);
++    stddev_node_MBs_free = sqrt(MBs_variance);
++    stddev_node_CPUs_free = sqrt(CPUs_variance);
+     if (log_level >= LOG_INFO) {
+         show_nodes();
+     }
+@@ -1316,7 +1474,7 @@ typedef struct stat_data {
+     int64_t num_threads;  // 19
+     int64_t itrealvalue;
+     uint64_t starttime;
+-    uint64_t vsize;
++    uint64_t vsize;       // 22
+     int64_t rss;          // 23
+     uint64_t rsslim;
+     uint64_t startcode;
+@@ -1356,15 +1514,16 @@ process_data_p get_stat_data_for_pid(int
+     }
+     static char buf[BUF_SIZE];
+     int bytes = read(fd, buf, BUF_SIZE);
++    close(fd);
+     if (bytes < 50) {
+         numad_log(LOG_WARNING, "Could not read stat file: %s\n", fname);
+         return NULL;
+     }
+-    close(fd);
++    uint64_t val;
+     char *p = buf;
+     static process_data_t data;
+     // Get PID from field 0
+-    uint64_t val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
++    CONVERT_DIGITS_TO_NUM(p, val);
+     data.pid = val;
+     // Copy comm from field 1
+     while (*p == ' ') { p++; }
+@@ -1373,23 +1532,27 @@ process_data_p get_stat_data_for_pid(int
+     // Skip fields 2 through 12
+     for (int ix = 0;  (ix < 11);  ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
+     // Get utime from field 13 for cpu_util
+-    val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
++    CONVERT_DIGITS_TO_NUM(p, val);
+     data.cpu_util = val;
+     // Get stime from field 14 to add on to cpu_util (which already has utime)
+     while (*p == ' ') { p++; }
+-    val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
++    CONVERT_DIGITS_TO_NUM(p, val);
+     data.cpu_util += val;
+     // Skip fields 15 through 18
+     while (*p == ' ') { p++; }
+     for (int ix = 0;  (ix < 4);  ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
+     // Get num_threads from field 19
+-    val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
++    CONVERT_DIGITS_TO_NUM(p, val);
+     data.num_threads = val;
+-    // Skip fields 20 through 22
++    // Skip fields 20 through 21
+     while (*p == ' ') { p++; }
+-    for (int ix = 0;  (ix < 3);  ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
++    for (int ix = 0;  (ix < 2);  ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
++    // Get vsize from field 22 to compute MBs_size
++    CONVERT_DIGITS_TO_NUM(p, val);
++    data.MBs_size = val / MEGABYTE;
+     // Get rss from field 23 to compute MBs_used
+-    val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
++    while (*p == ' ') { p++; }
++    CONVERT_DIGITS_TO_NUM(p, val);
+     data.MBs_used = (val * page_size_in_bytes) / MEGABYTE;
+     // Return pointer to data
+     return &data;
+@@ -1471,446 +1634,409 @@ int update_processes() {
+ }
+
+
++// Load p->node_list_p from the "Mems_allowed_list:" line of /proc/<pid>/status.
++// Parameter p is a pointer to an element in the hash table.  Returns whatever
++// add_ids_to_list_from_str() returns, or 0 if the status file cannot be opened
++// or read.  Exits on a bad PID or when no Mems_allowed_list entry is present.
++int initialize_mem_node_list(process_data_p p) {
++    if ((!p) || (p->pid < 1)) {
++        numad_log(LOG_CRIT, "Cannot initialize mem node lists with bad PID\n");
++        exit(EXIT_FAILURE);
++    }
++    char fname[FNAME_SIZE];
++    char buf[BIG_BUF_SIZE];
++    p->process_MBs = NULL;
++    CLEAR_NODE_LIST(p->node_list_p);
++    snprintf(fname, FNAME_SIZE, "/proc/%d/status", p->pid);
++    int fd = open(fname, O_RDONLY, 0);
++    if (fd < 0) {
++        numad_log(LOG_WARNING, "Tried to research PID %d, but it apparently went away.\n", p->pid);
++        return 0;  // Assume the process terminated
++    }
++    int bytes = read(fd, buf, BIG_BUF_SIZE);
++    close(fd);
++    if (bytes <= 0) {
++        numad_log(LOG_WARNING, "Tried to research PID %d, but cannot read status file.\n", p->pid);
++        return 0;  // Assume the process terminated
++    }
++    buf[(bytes >= BIG_BUF_SIZE) ? (BIG_BUF_SIZE - 1) : bytes] = '\0';  // Terminate text; a full buffer loses its last byte
++    char *list_str_p = strstr(buf, "Mems_allowed_list:");
++    if (!list_str_p) {
++        numad_log(LOG_CRIT, "Could not get node Mems_allowed_list\n");
++        exit(EXIT_FAILURE);
++    }
++    list_str_p += 18;  // Length of the "Mems_allowed_list:" label
++    // Skip to the first digit, but stop at the terminator so that an empty or
++    while ((*list_str_p) && (!isdigit((unsigned char)*list_str_p))) { list_str_p++; }  // truncated list is never over-read
++    int n = add_ids_to_list_from_str(p->node_list_p, list_str_p);
++    if (n < num_nodes) {
++        // If process already bound to a subset of nodes when we discover it,
++        // set initial bind_time_stamp to 30 minutes ago...
++        p->bind_time_stamp = get_time_stamp() - (1800 * ONE_HUNDRED);
++    }
++    return n;
++}
+
+-id_list_p pick_numa_nodes(int pid, int cpus, int mbs) {
+-    char buf[BUF_SIZE];
+-    char buf2[BUF_SIZE];
++
++// Score one node for a request of mbs / cpus against the node's MBs_free and
++// CPUs_free.  For each resource, the part of the free amount that the request
++// would consume is weighted more heavily than the surplus beyond it; the score
++// is the product of the weighted memory and weighted CPU figures.  Parameter ix
++// is used only to label the debug log line.
++uint64_t combined_value_of_weighted_resources(int ix, int mbs, int cpus, uint64_t MBs_free, uint64_t CPUs_free) {
++    // Split free memory into the part the request needs and the leftover
++    int mem_is_ample = (MBs_free > mbs);
++    int64_t mem_needed = mem_is_ample ? (int64_t)mbs : (int64_t)MBs_free;
++    int64_t mem_spare = 0;
++    if (mem_is_ample) {
++        mem_spare = MBs_free - mbs;
++    }
++    // Same split for free CPU
++    int cpu_is_ample = (CPUs_free > cpus);
++    int64_t cpu_needed = cpu_is_ample ? (int64_t)cpus : (int64_t)CPUs_free;
++    int64_t cpu_spare = 0;
++    if (cpu_is_ample) {
++        cpu_spare = CPUs_free - cpus;
++    }
++    // Weights: memory 10 (needed) and 4 (excess); CPU 6 (needed) and 1 (excess)
++    int64_t weighted_mem = (mem_needed * 10) + (mem_spare * 4);
++    int64_t weighted_cpu = (cpu_needed * 6) + cpu_spare;
++    numad_log(LOG_DEBUG, "    Node[%d]: mem: %ld  cpu: %ld\n", ix, weighted_mem, weighted_cpu);
++    return (weighted_mem * weighted_cpu);
++}
++
++
++id_list_p pick_numa_nodes(int pid, int cpus, int mbs, int assume_enough_cpus) {
+     if (log_level >= LOG_DEBUG) {
+         numad_log(LOG_DEBUG, "PICK NODES FOR:  PID: %d,  CPUs %d,  MBs %d\n", pid, cpus, mbs);
+     }
+-    int num_existing_mems = 0;
+-    static id_list_p existing_mems_list_p;
+-    CLEAR_LIST(existing_mems_list_p);
+-    uint64_t time_stamp = get_time_stamp();
+-    static node_data_p tmp_node;
+-    static uint64_t *process_MBs;
+-    static uint64_t *saved_magnitude_for_node;
+-    static int process_MBs_num_nodes;
+-    // See if dynamic structures need to grow.
+-    if (process_MBs_num_nodes < num_nodes + 1) {
+-        process_MBs_num_nodes = num_nodes + 1;
+-        // The "+1 node" is for accumulating interleaved memory
+-        process_MBs = realloc(process_MBs, process_MBs_num_nodes * sizeof(uint64_t));
+-        tmp_node = realloc(tmp_node, num_nodes * sizeof(node_data_t) );
+-        saved_magnitude_for_node = realloc(saved_magnitude_for_node, num_nodes * sizeof(uint64_t));
+-        if ((process_MBs == NULL) || (tmp_node == NULL) || (saved_magnitude_for_node == NULL)) {
+-            numad_log(LOG_CRIT, "process_MBs realloc failed\n");
+-            exit(EXIT_FAILURE);
+-        }
+-    }
++    char buf[BUF_SIZE];
++    uint64_t proc_avg_node_CPUs_free = 0;
+     // For existing processes, get miscellaneous process specific details
+     int pid_ix;
+     process_data_p p = NULL;
+     if ((pid > 0) && ((pid_ix = process_hash_lookup(pid)) >= 0)) {
+         p = &process_hash_table[pid_ix];
+-        // Quick rejection if this process has interleaved memory, but recheck it once an hour...
+-#define MIN_DELAY_FOR_INTERLEAVE (3600 * ONE_HUNDRED)
+-        if (((p->flags & PROCESS_FLAG_INTERLEAVED) > 0)
+-          && (p->bind_time_stamp + MIN_DELAY_FOR_INTERLEAVE > time_stamp)) {
+-            if (log_level >= LOG_DEBUG) {
+-                numad_log(LOG_DEBUG, "Skipping evaluation because of interleaved memory.\n");
+-            }
+-            return NULL;
+-        }
+-        // Get cpuset name for this process, and existing mems binding, if any.
++        // Add up per-node memory in use by this process.
++        // This scanning is expensive and should be minimized.
+         char fname[FNAME_SIZE];
+-        snprintf(fname, FNAME_SIZE, "/proc/%d/cpuset", pid);
+-        FILE *fs = fopen(fname, "r");
+-        if (!fs) {
+-            numad_log(LOG_WARNING, "Tried to research PID %d cpuset, but it apparently went away.\n", p->pid);
+-            return NULL;  // Assume the process terminated?
+-        }
+-        if (!fgets(buf, BUF_SIZE, fs)) {
+-            numad_log(LOG_WARNING, "Tried to research PID %d cpuset, but it apparently went away.\n", p->pid);
+-            fclose(fs);
+-            return NULL;  // Assume the process terminated?
+-        }
+-        fclose(fs);
+-        ELIM_NEW_LINE(buf);
+-        if ((!p->cpuset_name) || (strcmp(p->cpuset_name, buf))) {
+-            if (p->cpuset_name != NULL) {
+-                free(p->cpuset_name);
+-            }
+-            p->cpuset_name = strdup(buf);
+-        }
+-        if (log_level >= LOG_DEBUG) {
+-            numad_log(LOG_DEBUG, "CPUSET_NAME: %s\n", p->cpuset_name);
+-        }
+-        snprintf(fname, FNAME_SIZE, "%s%s/cpuset.mems", cpuset_dir, p->cpuset_name);
+-        fs = fopen(fname, "r");
+-        if ((fs) && (fgets(buf, BUF_SIZE, fs))) {
+-            fclose(fs);
+-            num_existing_mems = add_ids_to_list_from_str(existing_mems_list_p, buf);
+-            if (log_level >= LOG_DEBUG) {
+-                str_from_id_list(buf, BUF_SIZE, existing_mems_list_p);
+-                numad_log(LOG_DEBUG, "EXISTING CPUSET NODE LIST: %s\n", buf);
+-            }
+-        }
+-        // If this process was just recently bound, enforce a minimum delay
+-        // period between repeated attempts to potentially move the memory.
+-        // FIXME: ?? might this retard appropriate process expansion too much?
+-#define MIN_DELAY_FOR_REEVALUATION (30 * ONE_HUNDRED)
+-        if (p->bind_time_stamp + MIN_DELAY_FOR_REEVALUATION > time_stamp) {
+-            // Skip re-evaluation because we just did it recently.
+-            if (log_level >= LOG_DEBUG) {
+-                numad_log(LOG_DEBUG, "Skipping evaluation because done too recently.\n");
+-            }
+-            return NULL;
+-        }
+-        // Look for short cut because of duplicate bindings.  If we have bound
+-        // this process to the same nodes multiple times already, and the load
+-        // on those nodes still seems acceptable, skip the rest of this and
+-        // just return NULL to indicate no change needed.  FIXME: should figure
+-        // out what can change that would make a rebinding desirable (e.g. (1)
+-        // some process gets sub-optimal allocation on busy machine which
+-        // subsequently becomes less busy leaving disadvantaged process. (2)
+-        // node load imbalance, (3) any process split across nodes which should
+-        // fit within a single node.) For now, just expire the dup_bid_count
+-        // occasionally, which is a reasonably good mitigation.
+-        // So, check to see if we should decay the dup_bind_count...
+-#define DUP_BIND_TIME_OUT (300 * ONE_HUNDRED)
+-        if ((p->dup_bind_count > 0) && (p->bind_time_stamp + DUP_BIND_TIME_OUT < time_stamp)) {
+-            p->dup_bind_count -= 1;
+-        }
+-        // Now, look for short cut because of duplicate bindings
+-        if (p->dup_bind_count > 0) {
+-            int node_id = 0;
+-            int nodes_have_cpu = 1;
+-            int nodes_have_ram = 1;
+-            int n = num_existing_mems;
+-            int min_resource_pct = 100 - target_utilization;
+-            if (min_resource_pct < 5) {
+-                min_resource_pct = 5;
+-            }
+-            while (n) {
+-                if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
+-                    nodes_have_cpu &= ((100 * node[node_id].CPUs_free / node[node_id].CPUs_total) >= (min_resource_pct));
+-                    nodes_have_ram &= ((100 * node[node_id].MBs_free  / node[node_id].MBs_total)  >= (min_resource_pct));
+-                    n -= 1;
+-                }
+-                node_id += 1;
+-            }
+-            if ((nodes_have_cpu) && (nodes_have_ram)) {
+-                if (log_level >= LOG_DEBUG) {
+-                    numad_log(LOG_DEBUG, "Skipping evaluation because of repeat binding\n");
+-                }
+-                return NULL;
+-            }
+-            if (log_level >= LOG_DEBUG) {
+-                numad_log(LOG_DEBUG, "Evaluated for skipping by repeat binding, but CPUS: %d, RAM: %d\n", nodes_have_cpu, nodes_have_ram);
+-            }
+-        }
+-        // Fourth, add up per-node memory in use by this process. This scanning
+-        // is expensive and should be minimized.  Also, old kernels dismantle
+-        // transparent huge pages while producing the numa_maps memory
+-        // information!
+-        memset(process_MBs, 0, process_MBs_num_nodes * sizeof(uint64_t));
+         snprintf(fname, FNAME_SIZE, "/proc/%d/numa_maps", pid);
+-        fs = fopen(fname, "r");
++        FILE *fs = fopen(fname, "r");
+         if (!fs) {
+             numad_log(LOG_WARNING, "Tried to research PID %d numamaps, but it apparently went away.\n", p->pid);
+             return NULL;  // Assume the process terminated
+         }
++        // Allocate and zero per node memory array.
++        // The "+1 node" is for accumulating interleaved memory
++        p->process_MBs = realloc(p->process_MBs, (num_nodes + 1) * sizeof(uint64_t));
++        if (p->process_MBs == NULL) {
++            numad_log(LOG_CRIT, "p->process_MBs realloc failed\n");
++            exit(EXIT_FAILURE);
++        }
++        memset(p->process_MBs, 0, (num_nodes + 1) * sizeof(uint64_t));
+         int process_has_interleaved_memory = 0;
+         while (fgets(buf, BUF_SIZE, fs)) {
+             int interleaved_memory = 0;
+             uint64_t page_size = page_size_in_bytes;
+             const char *delimiters = " \n";
+-            char *p = strtok(buf, delimiters);
+-            while (p) {
+-                if (!strncmp(p, "interleave", 10)) {
++            char *str_p = strtok(buf, delimiters);
++            while (str_p) {
++                if (!strncmp(str_p, "interleave", 10)) {
+                     interleaved_memory = 1;
+                     process_has_interleaved_memory = 1;
+-                } else if (!strcmp(p, "huge")) {
++                } else if (!strcmp(str_p, "huge")) {
+                     page_size = huge_page_size_in_bytes;
+-                } else if (*p++ == 'N') {
++                } else if (*str_p++ == 'N') {
+                     int node;
+                     uint64_t pages;
+-                    CONVERT_DIGITS_TO_NUM(p, node);
+-                    if (*p++ != '=') {
++                    CONVERT_DIGITS_TO_NUM(str_p, node);
++                    if (*str_p++ != '=') {
+                         numad_log(LOG_CRIT, "numa_maps node number parse error\n");
+                         exit(EXIT_FAILURE);
+                     }
+-                    CONVERT_DIGITS_TO_NUM(p, pages);
+-                    process_MBs[node] += (pages * page_size);
++                    CONVERT_DIGITS_TO_NUM(str_p, pages);
++                    p->process_MBs[node] += (pages * page_size);
+                     if (interleaved_memory) {
+                         // sum interleaved quantity in "extra node"
+-                        process_MBs[num_nodes] += (pages * page_size);
++                        p->process_MBs[num_nodes] += (pages * page_size);
+                     }
+                 }
+                 // Get next token on the line
+-                p = strtok(NULL, delimiters);
++                str_p = strtok(NULL, delimiters);
+             }
+         }
+         fclose(fs);
++        proc_avg_node_CPUs_free = p->CPUs_used;
+         for (int ix = 0;  (ix <= num_nodes);  ix++) {
+-            process_MBs[ix] /= MEGABYTE;
+-            if (log_level >= LOG_DEBUG) {
+-                numad_log(LOG_DEBUG, "PROCESS_MBs[%d]: %ld\n", ix, process_MBs[ix]);
++            p->process_MBs[ix] /= MEGABYTE;
++            if ((log_level >= LOG_DEBUG) && (p->process_MBs[ix] > 0)) {
++                if (ix == num_nodes) {
++                    numad_log(LOG_DEBUG, "Interleaved MBs: %ld\n", ix, p->process_MBs[ix]);
++                } else {
++                    numad_log(LOG_DEBUG, "PROCESS_MBs[%d]: %ld\n", ix, p->process_MBs[ix]);
++                }
++            }
++            if (ID_IS_IN_LIST(ix, p->node_list_p)) {
++                proc_avg_node_CPUs_free += node[ix].CPUs_free;
+             }
+         }
++        proc_avg_node_CPUs_free /= NUM_IDS_IN_LIST(p->node_list_p);
+         if ((process_has_interleaved_memory) && (keep_interleaved_memory)) {
+             // Mark this process as having interleaved memory so we do not
+-            // merge the interleaved memory.  Time stamp it as done.
++            // merge the interleaved memory.  Time stamp it as done and return.
+             p->flags |= PROCESS_FLAG_INTERLEAVED;
+             p->bind_time_stamp = get_time_stamp();
+             if (log_level >= LOG_DEBUG) {
+-                numad_log(LOG_DEBUG, "Skipping evaluation because of interleaved memory.\n");
++                numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because of interleaved memory.\n", p->pid);
+             }
+             return NULL;
+         }
+     }  // end of existing PID conditional
+     // Make a copy of node available resources array.  Add in info specific to
+     // this process to equalize available resource quantities wrt locations of
+-    // resources already in use by this process.  Inflate the value of already
+-    // assigned memory by approximately 3/2, because moving memory is
+-    // expensive.  Average the amount of CPUs_free across the existing nodes
+-    // used, because the threads are free to move around in that domain.  After
+-    // calculating combined magnitude of available resources, bias the values
+-    // towards existing locations for this process.
+-    int target_using_all_nodes = 0;
+-    uint64_t node_CPUs_free_for_this_process = 0;
+-    memcpy(tmp_node, node, num_nodes * sizeof(node_data_t) );
+-    if (num_existing_mems > 0) {
+-        node_CPUs_free_for_this_process = cpus; // ?? Correct for utilization target inflation?
+-        int node_id = 0;
+-        int n = num_existing_mems;
+-        while (n) {
+-            if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
+-                node_CPUs_free_for_this_process += tmp_node[node_id].CPUs_free;
+-                n -= 1;
+-            }
+-            node_id += 1;
+-        }
+-        // Divide to get average CPUs_free for the nodes in use by process
+-        node_CPUs_free_for_this_process /= num_existing_mems;
++    // resources already in use by this process.
++    static node_data_p tmp_node;
++    tmp_node = realloc(tmp_node, num_nodes * sizeof(node_data_t) );
++    if (tmp_node == NULL) {
++        numad_log(LOG_CRIT, "tmp_node realloc failed\n");
++        exit(EXIT_FAILURE);
+     }
++    memcpy(tmp_node, node, num_nodes * sizeof(node_data_t) );
++    uint64_t sum_of_node_CPUs_free = 0;
+     for (int ix = 0;  (ix < num_nodes);  ix++) {
+         if (pid > 0) {
+-            tmp_node[ix].MBs_free  += ((process_MBs[ix] * 12) / 8);
+-        }
+-        if ((num_existing_mems > 0) && (ID_IS_IN_LIST(ix, existing_mems_list_p))) {
+-            tmp_node[ix].CPUs_free = node_CPUs_free_for_this_process;
+-        }
+-        if (tmp_node[ix].CPUs_free > tmp_node[ix].CPUs_total) {
+-            tmp_node[ix].CPUs_free = tmp_node[ix].CPUs_total;
+-        }
+-        if (log_level >= LOG_DEBUG) {
+-            numad_log(LOG_DEBUG, "PROCESS_CPUs[%d]: %ld\n", ix, tmp_node[ix].CPUs_free);
++            if (NUM_IDS_IN_LIST(p->node_list_p) >= num_nodes) {
++                // Process not yet bound to a subset of nodes.
++                // Add back memory used by this process on this node.
++                tmp_node[ix].MBs_free += ((p->process_MBs[ix] * 17) / 16);  // Apply light mem bias
++                // Add back CPU used by this process in proportion to the memory used on this node.
++                tmp_node[ix].CPUs_free += ((p->CPUs_used * p->process_MBs[ix]) / p->MBs_used);
++            } else {
++                // If the process is currently running on less than all the
++                // nodes, first add back (biased) memory already used by this
++                // process on this node, then assign average process CPU / node
++                // for this process iff the process is present on this node.
++                tmp_node[ix].MBs_free += ((p->process_MBs[ix] * 5) / 4);  // Apply heavy mem bias
++                if (ID_IS_IN_LIST(ix, p->node_list_p)) {
++                    tmp_node[ix].CPUs_free = proc_avg_node_CPUs_free;
++                }
++            }
++            sum_of_node_CPUs_free += tmp_node[ix].CPUs_free;
++            if (tmp_node[ix].CPUs_free > tmp_node[ix].CPUs_total) {
++                tmp_node[ix].CPUs_free = tmp_node[ix].CPUs_total;
++            }
++            if (tmp_node[ix].MBs_free > tmp_node[ix].MBs_total) {
++                tmp_node[ix].MBs_free = tmp_node[ix].MBs_total;
++            }
+         }
+-        // Calculate magnitude as product of available CPUs and available MBs
+-        tmp_node[ix].magnitude = tmp_node[ix].CPUs_free * tmp_node[ix].MBs_free;
+-        // Bias combined magnitude towards already assigned nodes
+-        if (ID_IS_IN_LIST(ix, existing_mems_list_p)) {
+-            tmp_node[ix].magnitude *= 9;
+-            tmp_node[ix].magnitude /= 8;
++        // Enforce 1/100th CPU minimum
++        if (tmp_node[ix].CPUs_free < 1) {
++            tmp_node[ix].CPUs_free = 1;
+         }
+-        // Save the current magnitudes
+-        saved_magnitude_for_node[ix] = tmp_node[ix].magnitude;
++        // numad_log(LOG_DEBUG, "Raw Node[%d]: mem: %ld  cpu: %ld\n", ix, tmp_node[ix].MBs_free, tmp_node[ix].CPUs_free);
++        tmp_node[ix].magnitude = combined_value_of_weighted_resources(ix, mbs, cpus, tmp_node[ix].MBs_free, tmp_node[ix].CPUs_free);
+     }
+-    // OK, figure out where to get resources for this request.
++    // Now figure out where to get resources for this request....
+     static id_list_p target_node_list_p;
+-    CLEAR_LIST(target_node_list_p);
+-    int prev_node_used = -1;
+-    // Continue to allocate more resources until request are met.
+-    // OK if not not quite all the CPU request is met.
+-    // FIXME: ?? Is half of the utilization margin a good amount of CPU flexing?
+-    int cpu_flex = ((100 - target_utilization) * tmp_node[0].CPUs_total) / 200;
+-    if (pid <= 0) {
+-        // If trying to find resources for pre-placement advice request, do not
+-        // underestimate the amount of CPUs needed.  Instead, err on the side
+-        // of providing too many resources.  So, no flexing here...
+-        cpu_flex = 0;
+-    }
+-    while ((mbs > 0) || (cpus > cpu_flex)) {
+-        if (log_level >= LOG_DEBUG) {
+-            numad_log(LOG_DEBUG, "MBs: %d,  CPUs: %d\n", mbs, cpus);
++    CLEAR_NODE_LIST(target_node_list_p);
++    if ((pid > 0) && (cpus > sum_of_node_CPUs_free)) {
++        // System CPUs might be oversubscribed, but...
++        assume_enough_cpus = 1;
++        // and rely on available memory for placement.
++    }
++    // Establish a CPU flex fudge factor, on the presumption it is OK if not
++    // quite all the CPU request is met.  However, if trying to find resources
++    // for pre-placement advice request, do not underestimate the amount of
++    // CPUs needed.  Instead, err on the side of providing too many resources.
++    int cpu_flex = 0;
++    if ((pid > 0) && (target_utilization < 100)) {
++        // FIXME: Is half of the utilization margin a good amount of CPU flexing?
++        cpu_flex = ((100 - target_utilization) * node[0].CPUs_total) / 200;
++    }
++    // Figure out minimum number of nodes required
++    int mem_req_nodes = ceil((double)mbs  / (double)node[0].MBs_total);
++    int cpu_req_nodes = ceil((double)(cpus - cpu_flex) / (double)node[0].CPUs_total);
++    int min_req_nodes = mem_req_nodes;
++    if (min_req_nodes < cpu_req_nodes) {
++        min_req_nodes = cpu_req_nodes;
++    }
++    if (min_req_nodes > num_nodes) {
++        min_req_nodes = num_nodes;
++    }
++    // Use an index to sort NUMA connected resource chain for each node
++    int index[num_nodes];
++    uint64_t totmag[num_nodes];
++    for (int ix = 0;  (ix < num_nodes);  ix++) {
++        // Reset the index each time
++        for (int n = 0;  (n < num_nodes);  n++) {
++            index[n] = n;
+         }
+-        // Sort nodes by magnitude of available resources.  Note that
+-        // inter-node distances (to the previous node used) are factored into
+-        // the sort.
++        // Sort by minimum relative NUMA distance from node[ix],
++        // breaking distance ties with magnitude of available resources
+         for (int ij = 0;  (ij < num_nodes);  ij++) {
+-            int big_ix = ij;
++            int best_ix = ij;
+             for (int ik = ij + 1;  (ik < num_nodes);  ik++) {
+-                uint64_t ik_dist = 1;
+-                uint64_t big_ix_dist = 1;
+-                if (prev_node_used >= 0) {
+-                    ik_dist = tmp_node[ik].distance[prev_node_used];
+-                    big_ix_dist = tmp_node[big_ix].distance[prev_node_used];
+-                }
+-                // Scale magnitude comparison by distances to previous node used...
+-                if ((tmp_node[big_ix].magnitude / big_ix_dist) < (tmp_node[ik].magnitude / ik_dist)) {
+-                    big_ix = ik;
+-                }
+-            }
+-            if (big_ix != ij) {
+-                node_data_t tmp;
+-                memcpy((void *)&tmp, (void *)&tmp_node[ij], sizeof(node_data_t) );
+-                memcpy((void *)&tmp_node[ij], (void *)&tmp_node[big_ix], sizeof(node_data_t) );
+-                memcpy((void *)&tmp_node[big_ix], (void *)&tmp, sizeof(node_data_t) );
++                int ik_dist = tmp_node[index[ik]].distance[ix];
++                int best_ix_dist = tmp_node[index[best_ix]].distance[ix];
++                if (best_ix_dist > ik_dist) {
++                    best_ix = ik;
++                } else if (best_ix_dist == ik_dist) {
++                    if (tmp_node[index[best_ix]].magnitude < tmp_node[index[ik]].magnitude ) {
++                        best_ix = ik;
++                    }
++                }
++            }
++            if (best_ix != ij) {
++                int tmp = index[ij];
++                index[ij] = index[best_ix];
++                index[best_ix] = tmp;
+             }
+         }
++#if 0
+         if (log_level >= LOG_DEBUG) {
+-            for (int ix = 0;  (ix < num_nodes);  ix++) {
+-                numad_log(LOG_DEBUG, "Sorted magnitude[%d]: %ld\n", tmp_node[ix].node_id, tmp_node[ix].magnitude);
++            for (int iq = 0;  (iq < num_nodes);  iq++) {
++                numad_log(LOG_DEBUG, "Node: %d  Dist: %d  Magnitude: %ld\n",
++                    tmp_node[index[iq]].node_id, tmp_node[index[iq]].distance[ix], tmp_node[index[iq]].magnitude);
++            }
++        }
++#endif
++        // Save the totmag[] sum of the magnitudes of expected needed nodes,
++        // "normalized" by NUMA distance (by dividing each magnitude by the
++        // relative distance squared).
++        totmag[ix] = 0;
++        for (int ij = 0;  (ij < min_req_nodes);  ij++) {
++            int dist = tmp_node[index[ij]].distance[ix];
++            totmag[ix] += (tmp_node[index[ij]].magnitude / (dist * dist));
++        }
++        numad_log(LOG_DEBUG, "Totmag[%d]: %ld\n", ix, totmag[ix]);
++    }
++    // Now find the best NUMA node based on the normalized sum of node
++    // magnitudes expected to be used.
++    int best_node_ix = 0;
++    for (int ix = 0;  (ix < num_nodes);  ix++) {
++        if (totmag[best_node_ix] < totmag[ix]) {
++            best_node_ix = ix;
++        }
++    }
++    numad_log(LOG_DEBUG, "best_node_ix: %d\n", best_node_ix);
++    // Reset sorting index again
++    for (int n = 0;  (n < num_nodes);  n++) {
++        index[n] = n;
++    }
++    // Sort index by distance from node[best_node_ix],
++    // breaking distance ties with magnitude
++    for (int ij = 0;  (ij < num_nodes);  ij++) {
++        int best_ix = ij;
++        for (int ik = ij + 1;  (ik < num_nodes);  ik++) {
++            int ik_dist = tmp_node[index[ik]].distance[best_node_ix];
++            int best_ix_dist = tmp_node[index[best_ix]].distance[best_node_ix];
++            if (best_ix_dist > ik_dist) {
++                best_ix = ik;
++            } else if (best_ix_dist == ik_dist) {
++                if (tmp_node[index[best_ix]].magnitude < tmp_node[index[ik]].magnitude ) {
++                    best_ix = ik;
++                }
+             }
+         }
+-        if (tmp_node[0].node_id == prev_node_used) {
+-            // Hmmm.  Looks like the best node for more resources, is also the
+-            // last one we used.  This is not going to make progress...  So
+-            // just punt and use everything.
+-            OR_LISTS(target_node_list_p, target_node_list_p, all_nodes_list_p);
+-            target_using_all_nodes = 1;
+-            break;
++        if (best_ix != ij) {
++            int tmp = index[ij];
++            index[ij] = index[best_ix];
++            index[best_ix] = tmp;
++        }
++    }
++    if (log_level >= LOG_DEBUG) {
++        for (int iq = 0;  (iq < num_nodes);  iq++) {
++            numad_log(LOG_DEBUG, "Node: %d  Dist: %d  Magnitude: %ld\n",
++                tmp_node[index[iq]].node_id, tmp_node[index[iq]].distance[best_node_ix], tmp_node[index[iq]].magnitude);
+         }
+-        prev_node_used = tmp_node[0].node_id;
+-        ADD_ID_TO_LIST(tmp_node[0].node_id, target_node_list_p);
++    }
++    // Allocate more resources until request is met.
++    best_node_ix = 0;
++    while ((min_req_nodes > 0) || (mbs > 0) || ((cpus > cpu_flex) && (!assume_enough_cpus))) {
+         if (log_level >= LOG_DEBUG) {
+-            str_from_id_list(buf,  BUF_SIZE, existing_mems_list_p);
+-            str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
+-            numad_log(LOG_DEBUG, "Existing nodes: %s  Target nodes: %s\n", buf, buf2);
++            numad_log(LOG_DEBUG, "MBs: %d,  CPUs: %d\n", mbs, cpus);
+         }
++        numad_log(LOG_DEBUG, "Assigning resources from node %d\n", index[best_node_ix]);
++        ADD_ID_TO_LIST(tmp_node[index[best_node_ix]].node_id, target_node_list_p);
++        min_req_nodes -= 1;
+         if (EQUAL_LISTS(target_node_list_p, all_nodes_list_p)) {
+             // Apparently we must use all resource nodes...
+-            target_using_all_nodes = 1;
+             break;
+         }
+-#define MBS_MARGIN 10
+-        if (tmp_node[0].MBs_free >= (mbs + MBS_MARGIN)) {
+-            tmp_node[0].MBs_free -= mbs;
++        // "Consume" the resources on this node
++#define CPUS_MARGIN 0
++#define MBS_MARGIN 100
++        if (tmp_node[index[best_node_ix]].MBs_free >= (mbs + MBS_MARGIN)) {
++            tmp_node[index[best_node_ix]].MBs_free -= mbs;
+             mbs = 0;
+         } else {
+-            mbs -= (tmp_node[0].MBs_free - MBS_MARGIN);
+-            tmp_node[0].MBs_free = MBS_MARGIN;
++            mbs -= (tmp_node[index[best_node_ix]].MBs_free - MBS_MARGIN);
++            tmp_node[index[best_node_ix]].MBs_free = MBS_MARGIN;
+         }
+-#define CPUS_MARGIN 0
+-        if (tmp_node[0].CPUs_free >= (cpus + CPUS_MARGIN)) {
+-            tmp_node[0].CPUs_free -= cpus;
++        if (tmp_node[index[best_node_ix]].CPUs_free >= (cpus + CPUS_MARGIN)) {
++            tmp_node[index[best_node_ix]].CPUs_free -= cpus;
+             cpus = 0;
+         } else {
+-            cpus -= (tmp_node[0].CPUs_free - CPUS_MARGIN);
+-            tmp_node[0].CPUs_free = CPUS_MARGIN;
+-        }
+-        tmp_node[0].magnitude = tmp_node[0].CPUs_free * tmp_node[0].MBs_free;
+-    }
+-    // If this existing process is already located where we want it, and almost
+-    // all memory is already moved to those nodes, then return NULL indicating
+-    // no need to change binding this time.
+-    if ((pid > 0) && (EQUAL_LISTS(target_node_list_p, existing_mems_list_p))) {
+-        // May not need to change binding.  However, if there is any significant
+-        // memory still on non-target nodes, advise the bind anyway because
+-        // there are some scenarios when the kernel will not move it all the
+-        // first time.
+-        if (!target_using_all_nodes) {
+-            p->dup_bind_count += 1;
+-            for (int ix = 0;  (ix < num_nodes);  ix++) {
+-                if ((process_MBs[ix] > 10) && (!ID_IS_IN_LIST(ix, target_node_list_p))) {
+-                    goto try_memory_move_again;
+-                }
+-            }
+-            // We will accept these memory locations.  Stamp it as done.
+-            p->bind_time_stamp = get_time_stamp();
+-        }
+-        // Skip rebinding either because practically all memory is in the
+-        // target nodes, or because we are stuck using all the nodes.
+-        if (log_level >= LOG_DEBUG) {
+-            numad_log(LOG_DEBUG, "Skipping evaluation because memory is reasonably situated.\n");
++            cpus -= (tmp_node[index[best_node_ix]].CPUs_free - CPUS_MARGIN);
++            tmp_node[index[best_node_ix]].CPUs_free = CPUS_MARGIN;
+         }
+-        return NULL;
+-    } else {
+-        // Either a non-existing process, or a new binding for an existing process.
+-        if (p != NULL) {
+-            // Must be a new binding for an existing process, so reset dup_bind_count.
+-            p->dup_bind_count = 0;
+-        }
+-    }
+-    // See if this proposed move will make a significant difference.
+-    // If not, return null instead of advising the move.
+-    uint64_t target_magnitude = 0;
+-    uint64_t existing_magnitude = 0;
+-    int num_target_nodes   = NUM_IDS_IN_LIST(target_node_list_p);
+-    int num_existing_nodes = NUM_IDS_IN_LIST(existing_mems_list_p);
+-    /* FIXME: this expansion seems to cause excessive growth
+-     * So calculate the improvement before hastily expanding nodes.
+-    if (num_target_nodes > num_existing_nodes) { goto try_memory_move_again; }
+-    */
+-    int node_id = 0;
+-    int n = num_existing_nodes + num_target_nodes;
+-    while (n) {
+-        if (ID_IS_IN_LIST(node_id, target_node_list_p)) {
+-            target_magnitude += saved_magnitude_for_node[node_id];
+-            n -= 1;
+-        }
+-        if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
+-            existing_magnitude += saved_magnitude_for_node[node_id];
+-            n -= 1;
+-        }
+-        node_id += 1;
+-    }
+-    if (existing_magnitude > 0) {
+-        uint64_t magnitude_change = ((target_magnitude - existing_magnitude) * 100) / existing_magnitude;
+-        if (magnitude_change < 0) {
+-            magnitude_change = -(magnitude_change);
+-        }
+-        if (magnitude_change <= IMPROVEMENT_THRESHOLD_PERCENT) {
+-            // Not significant enough percentage change to do rebind
++        // Next line optional, since we will not look at that node again
++        tmp_node[index[best_node_ix]].magnitude = combined_value_of_weighted_resources(0, mbs, cpus, tmp_node[index[best_node_ix]].MBs_free, tmp_node[index[best_node_ix]].CPUs_free);
++        best_node_ix += 1;
++    }
++    // For existing processes, calculate the non-local memory percent to see if
++    // process is already in the right place.
++    if ((pid > 0) && (p != NULL)) {
++        uint64_t nonlocal_memory = 0;
++        for (int ix = 0;  (ix < num_nodes);  ix++) {
++            if (!ID_IS_IN_LIST(ix, target_node_list_p)) {
++                // Accumulate total of nonlocal memory
++                nonlocal_memory += p->process_MBs[ix];
++            }
++        }
++        int disp_percent = (100 * nonlocal_memory) / p->MBs_used;
++        // If this existing process is already located where we want it, then just
++        // return NULL indicating no need to change binding this time.  Check the
++        // amount of nonlocal memory against the target_memlocality percent.
++        if ((disp_percent <= (100 - target_memlocality)) && (p->bind_time_stamp) && (EQUAL_LISTS(target_node_list_p, p->node_list_p))) {
++            // Already bound to targets, and enough of the memory is located where we want it, so no need to rebind
+             if (log_level >= LOG_DEBUG) {
+-                str_from_id_list(buf,  BUF_SIZE, existing_mems_list_p);
+-                str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
+-                numad_log(LOG_DEBUG, "Moving pid %d from nodes (%s) to nodes (%s) skipped as insignificant improvement: %ld percent.\n",
+-                    pid, buf, buf2, magnitude_change);
++                numad_log(LOG_DEBUG, "Process %d already %d percent localized to target nodes.\n", p->pid, 100 - disp_percent);
+             }
+-            // We decided this is almost good enough.  Stamp it as done.
+             p->bind_time_stamp = get_time_stamp();
+             return NULL;
+         }
+     }
+-    if ((pid <= 0) && (num_target_nodes <= 0)) {
+-        // Always provide at least one node for pre-placement advice
++    // Must always provide at least one node for pre-placement advice
++    // FIXME: verify this can happen only if no resources requested...
++    if ((pid <= 0) && (NUM_IDS_IN_LIST(target_node_list_p) <= 0)) {
+         ADD_ID_TO_LIST(node[0].node_id, target_node_list_p);
+     }
+-try_memory_move_again:
+-    str_from_id_list(buf,  BUF_SIZE, existing_mems_list_p);
++    // Log advice, and return target node list
++    if ((pid > 0) && (p->bind_time_stamp)) {
++        str_from_id_list(buf,  BUF_SIZE, p->node_list_p);
++    } else {
++        str_from_id_list(buf,  BUF_SIZE, all_nodes_list_p);
++    }
++    char buf2[BUF_SIZE];
+     str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
+     char *cmd_name = "(unknown)";
+     if ((p) && (p->comm)) {
+         cmd_name = p->comm;
+     }
+     numad_log(LOG_NOTICE, "Advising pid %d %s move from nodes (%s) to nodes (%s)\n", pid, cmd_name, buf, buf2);
++    if (pid > 0) {
++        COPY_LIST(target_node_list_p, p->node_list_p);
++    }
+     return target_node_list_p;
+ }
+
+
+-
+-void show_processes(process_data_p *ptr, int nprocs) {
+-    time_t ts = time(NULL);
+-    fprintf(log_fs, "%s", ctime(&ts));
+-    fprintf(log_fs, "Candidates: %d\n", nprocs);
+-    for (int ix = 0;  (ix < nprocs);  ix++) {
+-        process_data_p p = ptr[ix];
+-        char buf[BUF_SIZE];
+-        snprintf(buf, BUF_SIZE, "%s%s/cpuset.mems", cpuset_dir, p->cpuset_name);
+-        FILE *fs = fopen(buf, "r");
+-        buf[0] = '\0';
+-        if (fs) {
+-            if (fgets(buf, BUF_SIZE, fs)) {
+-                ELIM_NEW_LINE(buf);
+-            }
+-            fclose(fs);
+-        }
+-        fprintf(log_fs, "%ld: PID %d: %s, Threads %2ld, MBs_used %6ld, CPUs_used %4ld, Magnitude %6ld, Nodes: %s\n",
+-            p->data_time_stamp, p->pid, p->comm, p->num_threads, p->MBs_used, p->CPUs_used, p->MBs_used * p->CPUs_used, buf);
+-        }
+-    fprintf(log_fs, "\n");
+-    fflush(log_fs);
+-}
+-
+-
+-
+ int manage_loads() {
++    uint64_t time_stamp = get_time_stamp();
+     // Use temporary index to access and sort hash table entries
+-    static process_data_p *pindex;
+     static int pindex_size;
++    static process_data_p *pindex;
+     if (pindex_size < process_hash_table_size) {
+         pindex_size = process_hash_table_size;
+         pindex = realloc(pindex, pindex_size * sizeof(process_data_p));
+@@ -1923,19 +2049,54 @@ int manage_loads() {
+         return min_interval / 2;
+     }
+     memset(pindex, 0, pindex_size * sizeof(process_data_p));
+-    // Copy live candidate pointers to the index for sorting, etc
++    // Copy live candidate pointers to the index for sorting
++    // if they meet the threshold for memory usage and CPU usage.
+     int nprocs = 0;
++    long sum_CPUs_used = 0;
+     for (int ix = 0;  (ix < process_hash_table_size);  ix++) {
+         process_data_p p = &process_hash_table[ix];
+-        if (p->pid) {
++        if ((p->pid) && (p->CPUs_used > CPU_THRESHOLD) && (p->MBs_used > MEMORY_THRESHOLD)) {
+             pindex[nprocs++] = p;
++            sum_CPUs_used += p->CPUs_used;
++            // Initialize node list, if not already done for this process.
++            if (p->node_list_p == NULL) {
++                initialize_mem_node_list(p);
++            }
+         }
+     }
+-    // Sort index by amount of CPU used * amount of memory used.  Not expecting
+-    // a long list here.  Use a simple sort -- however, sort into bins,
+-    // treating values within 10% as aquivalent.  Within bins, order by
+-    // bind_time_stamp so oldest bound will be higher priority to evaluate.
++    // Order candidate considerations using timestamps and magnitude: amount of
++    // CPU used * amount of memory used.  Not expecting a long list here.  Use
++    // a simplistic sort -- however move all not yet bound to front of list and
++    // order by decreasing magnitude.  Previously bound processes follow in
++    // bins of increasing magnitude treating values within 20% as equivalent.
++    // Within bins, order by bind_time_stamp so oldest bound will be higher
++    // priority to evaluate.  Start by moving all unbound to beginning.
++    int num_unbound = 0;
+     for (int ij = 0;  (ij < nprocs);  ij++) {
++        if (pindex[ij]->bind_time_stamp == 0) {
++            process_data_p tmp = pindex[num_unbound];
++            pindex[num_unbound++] = pindex[ij];
++            pindex[ij] = tmp;
++        }
++    }
++    // Sort all unbound so biggest magnitude comes first
++    for (int ij = 0;  (ij < num_unbound);  ij++) {
++        int best = ij;
++        for (int ik = ij + 1;  (ik < num_unbound);  ik++) {
++            uint64_t   ik_mag = (pindex[  ik]->CPUs_used * pindex[  ik]->MBs_used);
++            uint64_t best_mag = (pindex[best]->CPUs_used * pindex[best]->MBs_used);
++            if (ik_mag <= best_mag) continue;
++            best = ik;
++        }
++        if (best != ij) {
++            process_data_p tmp = pindex[ij];
++            pindex[ij] = pindex[best];
++            pindex[best] = tmp;
++        }
++    }
++    // Sort the remaining candidates into bins of increasing magnitude, and by
++    // timestamp within bins.
++    for (int ij = num_unbound;  (ij < nprocs);  ij++) {
+         int best = ij;
+         for (int ik = ij + 1;  (ik < nprocs);  ik++) {
+             uint64_t   ik_mag = (pindex[  ik]->CPUs_used * pindex[  ik]->MBs_used);
+@@ -1946,11 +2107,11 @@ int manage_loads() {
+                 diff_mag = -(diff_mag);
+                 min_mag = best_mag;
+             }
+-            if ((diff_mag > 0) && (min_mag / diff_mag < 10)) {
+-                // difference > 10 percent.  Use strict ordering
++            if ((diff_mag > 0) && (min_mag / diff_mag < 5)) {
++                // difference > 20 percent.  Use magnitude ordering
+                 if (ik_mag <= best_mag) continue;
+             } else {
+-                // difference within 10 percent.  Sort these by bind_time_stamp.
++                // difference within 20 percent.  Sort these by bind_time_stamp.
+                 if (pindex[ik]->bind_time_stamp > pindex[best]->bind_time_stamp) continue;
+             }
+             best = ik;
+@@ -1961,23 +2122,57 @@ int manage_loads() {
+             pindex[best] = tmp;
+         }
+     }
++    // Show the candidate processes in the log file
+     if ((log_level >= LOG_INFO) && (nprocs > 0)) {
+-        show_processes(pindex, nprocs);
++        numad_log(LOG_INFO, "Candidates: %d\n", nprocs);
++        for (int ix = 0;  (ix < nprocs);  ix++) {
++            process_data_p p = pindex[ix];
++            char buf[BUF_SIZE];
++            str_from_id_list(buf, BUF_SIZE, p->node_list_p);
++            fprintf(log_fs, "%ld: PID %d: %s, Threads %2ld, MBs_size %6ld, MBs_used %6ld, CPUs_used %4ld, Magnitude %6ld, Nodes: %s\n",
++                p->data_time_stamp, p->pid, p->comm, p->num_threads, p->MBs_size, p->MBs_used, p->CPUs_used, p->MBs_used * p->CPUs_used, buf);
++            }
++        fflush(log_fs);
+     }
+-    // Estimate desired size and make resource requests for each significant process
++    // Estimate desired size (+ margin capacity) and
++    // make resource requests for each candidate process
+     for (int ix = 0;  (ix < nprocs);  ix++) {
+         process_data_p p = pindex[ix];
+-        if (p->CPUs_used * p->MBs_used < CPU_THRESHOLD * MEMORY_THRESHOLD) {
+-            break; // No more significant processes worth worrying about...
++        // If this process has interleaved memory, recheck it only every 30 minutes...
++#define MIN_DELAY_FOR_INTERLEAVE (1800 * ONE_HUNDRED)
++        if (((p->flags & PROCESS_FLAG_INTERLEAVED) > 0)
++          && (p->bind_time_stamp + MIN_DELAY_FOR_INTERLEAVE > time_stamp)) {
++            if (log_level >= LOG_DEBUG) {
++                numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because of interleaved memory.\n", p->pid);
++            }
++            continue;
++        }
++        // Expand resources needed estimate using target_utilization factor.
++        // Start with the CPUs actually used (capped by number of threads) for
++        // CPUs required, and the RSS MBs actually used for the MBs
++        // requirement,
++        int mem_target_utilization = target_utilization;
++        int cpu_target_utilization = target_utilization;
++        // Cap memory utilization at 100 percent (but allow CPUs to oversubscribe)
++        if (mem_target_utilization > 100) {
++            mem_target_utilization = 100;
++        }
++        // If the process virtual memory size is bigger than one node, and it
++        // is already using more than 80 percent of a node, then request MBs
++        // based on the virtual size rather than on the current amount in use.
++        int mb_request;
++        if ((p->MBs_size > node[0].MBs_total) && ((p->MBs_used * 5 / 4) > node[0].MBs_total)) {
++            mb_request = (p->MBs_size * 100) / mem_target_utilization;
++        } else {
++            mb_request = (p->MBs_used * 100) / mem_target_utilization;
+         }
+-        int mb_request  =  (p->MBs_used * 100) / target_utilization;
+-        int cpu_request = (p->CPUs_used * 100) / target_utilization;
+-        // Do not give a process more CPUs than it has threads!
+-        // FIXME: For guest VMs, should limit max to VCPU threads. Will
+-        // need to do something more intelligent with guest IO threads
+-        // when eventually considering devices and IRQs.
++        int cpu_request = (p->CPUs_used * 100) / cpu_target_utilization;
++        // But do not give a process more CPUs than it has threads!
+         int thread_limit = p->num_threads;
+-        // If process looks like a KVM guest, try to limit to number of vCPU threads
++        // If process looks like a KVM guest, try to limit thread count to the
++        // number of vCPU threads.  FIXME: Will need to do something more
++        // intelligent than this with guest IO threads when eventually
++        // considering devices and IRQs.
+         if ((p->comm) && (p->comm[0] == '(') && (p->comm[1] == 'q') && (strcmp(p->comm, "(qemu-kvm)") == 0)) {
+             int kvm_vcpu_threads = get_num_kvm_vcpu_threads(p->pid);
+             if (thread_limit > kvm_vcpu_threads) {
+@@ -1988,23 +2183,51 @@ int manage_loads() {
+         if (cpu_request > thread_limit) {
+             cpu_request = thread_limit;
+         }
++        // If this process was recently bound, enforce a five-minute minimum
++        // delay between repeated attempts to potentially move the process.
++#define MIN_DELAY_FOR_REEVALUATION (300 * ONE_HUNDRED)
++        if (p->bind_time_stamp + MIN_DELAY_FOR_REEVALUATION > time_stamp) {
++            // Skip re-evaluation because we just did it recently, but check
++            // first for node utilization balance to see if we should
++            // re-evaluate this particular process right now.  If this process
++            // is running on one of the busiest nodes, go ahead and re-evaluate
++            // it if it looks like it should have a better place with
++            // sufficient resources.  FIXME: this is currently implemented for
++            // only smallish processes that will fit in a single node.
++            if ( ( ID_IS_IN_LIST(min_node_CPUs_free_ix, p->node_list_p) || ID_IS_IN_LIST(min_node_MBs_free_ix, p->node_list_p))
++                && (cpu_request < node[0].CPUs_total) && (mb_request < node[0].MBs_total)
++                && (abs(min_node_CPUs_free + p->CPUs_used - avg_node_CPUs_free)
++                    + abs((max_node_CPUs_free - p->CPUs_used) - avg_node_CPUs_free)
++                    < (max_node_CPUs_free - min_node_CPUs_free) - CPU_THRESHOLD)  // CPU slop
++                && (abs(min_node_MBs_free + p->MBs_used - avg_node_MBs_free)
++                    + abs((max_node_MBs_free - p->MBs_used) - avg_node_MBs_free)
++                    < (max_node_MBs_free - min_node_MBs_free)) ) {
++                if (log_level >= LOG_DEBUG) {
++                    numad_log(LOG_DEBUG, "Bypassing delay for %d because it looks like it can do better.\n", p->pid);
++                }
++            } else {
++                if (log_level >= LOG_DEBUG) {
++                    numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because done too recently.\n", p->pid);
++                }
++                continue;
++            }
++        }
++        // OK, now pick NUMA nodes for this process and bind it!
+         pthread_mutex_lock(&node_info_mutex);
+-        id_list_p node_list_p = pick_numa_nodes(p->pid, cpu_request, mb_request);
+-        // FIXME: ?? copy node_list_p to shorten mutex region?
+-        if ((node_list_p != NULL) && (bind_process_and_migrate_memory(p->pid, p->cpuset_name, node_list_p, NULL))) {
+-            // Shorten interval if actively moving processes
++        int assume_enough_cpus = (sum_CPUs_used <= sum_CPUs_total);
++        id_list_p node_list_p = pick_numa_nodes(p->pid, cpu_request, mb_request, assume_enough_cpus);
++        if ((node_list_p != NULL) && (bind_process_and_migrate_memory(p))) {
+             pthread_mutex_unlock(&node_info_mutex);
+-            p->bind_time_stamp = get_time_stamp();
++            // Return minimum interval when actively moving processes
+             return min_interval;
+         }
+         pthread_mutex_unlock(&node_info_mutex);
+     }
+-    // Return maximum interval if no process movement
++    // Return maximum interval when no process movement
+     return max_interval;
+ }
+
+
+-
+ void *set_dynamic_options(void *arg) {
+     // int arg_value = *(int *)arg;
+     char buf[BUF_SIZE];
+@@ -2013,6 +2236,18 @@ void *set_dynamic_options(void *arg) {
+         msg_t msg;
+         recv_msg(&msg);
+         switch (msg.body.cmd) {
++        case 'C':
++            use_inactive_file_cache = (msg.body.arg1 != 0);
++            if (use_inactive_file_cache) {
++                numad_log(LOG_NOTICE, "Counting inactive file cache as available\n");
++            } else {
++                numad_log(LOG_NOTICE, "Counting inactive file cache as unavailable\n");
++            }
++            break;
++        case 'H':
++            thp_scan_sleep_ms = msg.body.arg1;
++            set_thp_scan_sleep_ms(thp_scan_sleep_ms);
++            break;
+         case 'i':
+             min_interval = msg.body.arg1;
+             max_interval = msg.body.arg2;
+@@ -2033,6 +2268,10 @@ void *set_dynamic_options(void *arg) {
+             numad_log(LOG_NOTICE, "Changing log level to %d\n", msg.body.arg1);
+             log_level = msg.body.arg1;
+             break;
++        case 'm':
++            numad_log(LOG_NOTICE, "Changing target memory locality to %d\n", msg.body.arg1);
++            target_memlocality = msg.body.arg1;
++            break;
+         case 'p':
+             numad_log(LOG_NOTICE, "Adding PID %d to inclusion PID list\n", msg.body.arg1);
+             pthread_mutex_lock(&pid_list_mutex);
+@@ -2055,6 +2294,11 @@ void *set_dynamic_options(void *arg) {
+                 numad_log(LOG_NOTICE, "Scanning only explicit PID list processes\n");
+             }
+             break;
++        case 't':
++            numad_log(LOG_NOTICE, "Changing logical CPU thread percent to %d\n", msg.body.arg1);
++            htt_percent = msg.body.arg1;
++            node_info_time_stamp = 0; // to force rescan of nodes/cpus soon
++            break;
+         case 'u':
+             numad_log(LOG_NOTICE, "Changing target utilization to %d\n", msg.body.arg1);
+             target_utilization = msg.body.arg1;
+@@ -2064,7 +2308,7 @@ void *set_dynamic_options(void *arg) {
+                                     msg.body.arg1, msg.body.arg2);
+             pthread_mutex_lock(&node_info_mutex);
+             update_nodes();
+-            id_list_p node_list_p = pick_numa_nodes(-1, msg.body.arg1, msg.body.arg2);
++            id_list_p node_list_p = pick_numa_nodes(-1, msg.body.arg1, msg.body.arg2, 0);
+             str_from_id_list(buf, BUF_SIZE, node_list_p);
+             pthread_mutex_unlock(&node_info_mutex);
+             send_msg(msg.body.src_pid, 'w', 0, 0, buf);
+@@ -2134,30 +2378,50 @@ void parse_two_arg_values(char *p, int *
+
+ int main(int argc, char *argv[]) {
+     int opt;
++    int C_flag = 0;
+     int d_flag = 0;
++    int H_flag = 0;
+     int i_flag = 0;
+     int K_flag = 0;
+     int l_flag = 0;
++    int m_flag = 0;
+     int p_flag = 0;
+     int r_flag = 0;
+     int S_flag = 0;
++    int t_flag = 0;
+     int u_flag = 0;
+     int v_flag = 0;
+     int w_flag = 0;
+     int x_flag = 0;
++    int tmp_int = 0;
+     long list_pid = 0;
+-    while ((opt = getopt(argc, argv, "dD:hi:K:l:p:r:S:u:vVw:x:")) != -1) {
++    while ((opt = getopt(argc, argv, "C:dD:hH:i:K:l:p:r:R:S:t:u:vVw:x:")) != -1) {
+         switch (opt) {
++        case 'C':
++            C_flag = 1;
++            use_inactive_file_cache = (atoi(optarg) != 0);
++            break;
+         case 'd':
+             d_flag = 1;
+             log_level = LOG_DEBUG;
+             break;
+         case 'D':
+-            cpuset_dir_list[0] = strdup(optarg);
++            // obsoleted
+             break;
+         case 'h':
+             print_usage_and_exit(argv[0]);
+             break;
++        case 'H':
++            tmp_int = atoi(optarg);
++            if ((tmp_int == 0) || ((tmp_int > 9) && (tmp_int < 1000001))) {
++                // 0 means do not change the system default value
++                H_flag = 1;
++                thp_scan_sleep_ms = tmp_int;
++            } else {
++		fprintf(stderr, "THP scan_sleep_ms must be > 9 and < 1000001\n");
++		exit(EXIT_FAILURE);
++	    }
++            break;
+         case 'i':
+             i_flag = 1;
+             parse_two_arg_values(optarg, &min_interval, &max_interval, 1, 0);
+@@ -2170,6 +2434,13 @@ int main(int argc, char *argv[]) {
+             l_flag = 1;
+             log_level = atoi(optarg);
+             break;
++        case 'm':
++            tmp_int = atoi(optarg);
++            if ((tmp_int >= 50) && (tmp_int <= 100)) {
++                m_flag = 1;
++                target_memlocality = tmp_int;
++            }
++            break;
+         case 'p':
+             p_flag = 1;
+             list_pid = atol(optarg);
+@@ -2183,13 +2454,26 @@ int main(int argc, char *argv[]) {
+             include_pid_list = remove_pid_from_pid_list(include_pid_list, list_pid);
+             exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, list_pid);
+             break;
++        case 'R':
++            reserved_cpu_str = strdup(optarg);
++            break;
+         case 'S':
+             S_flag = 1;
+             scan_all_processes = (atoi(optarg) != 0);
+             break;
++        case 't':
++            tmp_int = atoi(optarg);
++            if ((tmp_int >= 0) && (tmp_int <= 100)) {
++                t_flag = 1;
++                htt_percent = tmp_int;
++            }
++            break;
+         case 'u':
+-            u_flag = 1;
+-            target_utilization = atoi(optarg);
++            tmp_int = atoi(optarg);
++            if ((tmp_int >= 10) && (tmp_int <= 130)) {
++                u_flag = 1;
++                target_utilization = tmp_int;
++            }
+             break;
+         case 'v':
+             v_flag = 1;
+@@ -2234,6 +2518,12 @@ int main(int argc, char *argv[]) {
+         // Daemon is already running.  So send dynamic options to persistant
+         // thread to handle requests, get the response (if any), and finish.
+         msg_t msg;
++        if (C_flag) {
++            send_msg(daemon_pid, 'C', use_inactive_file_cache, 0, "");
++        }
++        if (H_flag) {
++            send_msg(daemon_pid, 'H', thp_scan_sleep_ms, 0, "");
++        }
+         if (i_flag) {
+             send_msg(daemon_pid, 'i', min_interval, max_interval, "");
+         }
+@@ -2243,6 +2533,9 @@ int main(int argc, char *argv[]) {
+         if (d_flag || l_flag || v_flag) {
+             send_msg(daemon_pid, 'l', log_level, 0, "");
+         }
++        if (m_flag) {
++            send_msg(daemon_pid, 'm', target_memlocality, 0, "");
++        }
+         if (p_flag) {
+             send_msg(daemon_pid, 'p', list_pid, 0, "");
+         }
+@@ -2252,6 +2545,9 @@ int main(int argc, char *argv[]) {
+         if (S_flag) {
+             send_msg(daemon_pid, 'S', scan_all_processes, 0, "");
+         }
++        if (t_flag) {
++            send_msg(daemon_pid, 't', htt_percent, 0, "");
++        }
+         if (u_flag) {
+             send_msg(daemon_pid, 'u', target_utilization, 0, "");
+         }
+@@ -2263,14 +2559,30 @@ int main(int argc, char *argv[]) {
+         if (x_flag) {
+             send_msg(daemon_pid, 'x', list_pid, 0, "");
+         }
+-    } else if (w_flag) {
+-        // Get pre-placement NUMA advice without starting daemon
++        close_log_file();
++        exit(EXIT_SUCCESS);
++    }
++    // No numad daemon running yet.
++    // First, make note of any reserved CPUs....
++    if (reserved_cpu_str != NULL) {
++        CLEAR_CPU_LIST(reserved_cpu_mask_list_p);
++        int n = add_ids_to_list_from_str(reserved_cpu_mask_list_p, reserved_cpu_str);
+         char buf[BUF_SIZE];
++        str_from_id_list(buf, BUF_SIZE, reserved_cpu_mask_list_p);
++        numad_log(LOG_NOTICE, "Reserving %d CPUs (%s) for non-numad use\n", n, buf);
++        // turn reserved list into a negated mask for later ANDing use...
++        negate_cpu_list(reserved_cpu_mask_list_p);
++    }
++    // If it is a "-w" pre-placement request, handle that without starting
++    // the daemon.  Otherwise start the numad daemon.
++    if (w_flag) {
++        // Get pre-placement NUMA advice without starting daemon
+         update_nodes();
+         sleep(2);
+         update_nodes();
+         numad_log(LOG_NOTICE, "Getting NUMA pre-placement advice for %d CPUs and %d MBs\n", requested_cpus, requested_mbs);
+-        id_list_p node_list_p = pick_numa_nodes(-1, requested_cpus, requested_mbs);
++        id_list_p node_list_p = pick_numa_nodes(-1, requested_cpus, requested_mbs, 0);
++        char buf[BUF_SIZE];
+         str_from_id_list(buf, BUF_SIZE, node_list_p);
+         fprintf(stdout, "%s\n", buf);
+         close_log_file();
+@@ -2278,6 +2590,7 @@ int main(int argc, char *argv[]) {
+     } else if (max_interval > 0) {
+         // Start the numad daemon...
+         check_prereqs(argv[0]);
++#if (!NO_DAEMON)
+         // Daemonize self...
+         daemon_pid = fork();
+         if (daemon_pid < 0) { numad_log(LOG_CRIT, "fork() failed\n"); exit(EXIT_FAILURE); }
+@@ -2298,9 +2611,20 @@ int main(int argc, char *argv[]) {
+         if (log_fs != stderr) {
+             fclose(stderr);
+         }
++#endif
++        // Set up signal handlers
++        struct sigaction sa;
++        memset(&sa, 0, sizeof(sa));
++        sa.sa_handler = sig_handler;
++        if (sigaction(SIGHUP, &sa, NULL)
++            || sigaction(SIGTERM, &sa, NULL)
++            || sigaction(SIGQUIT, &sa, NULL)) {
++            numad_log(LOG_CRIT, "sigaction does not work?\n");
++            exit(EXIT_FAILURE);
++        }
+         // Allocate initial process hash table
+         process_hash_table_expand();
+-        // Spawn thread to handle messages from subsequent invocation requests
++        // Spawn a thread to handle messages from subsequent invocation requests
+         pthread_mutex_init(&pid_list_mutex, NULL);
+         pthread_mutex_init(&node_info_mutex, NULL);
+         pthread_attr_t attr;
+@@ -2310,7 +2634,7 @@ int main(int argc, char *argv[]) {
+         }
+         pthread_t tid;
+         if (pthread_create(&tid, &attr, &set_dynamic_options, &tid) != 0) {
+-            numad_log(LOG_CRIT, "pthread_create failure\n");
++            numad_log(LOG_CRIT, "pthread_create failure: setting thread\n");
+             exit(EXIT_FAILURE);
+         }
+         // Loop here forwever...
+@@ -2322,16 +2646,26 @@ int main(int argc, char *argv[]) {
+             if (nodes > 1) {
+                 update_processes();
+                 interval = manage_loads();
++                if (interval < max_interval) {
++                    // Update node info since we moved something
++                    nodes = update_nodes();
++                }
+             }
+             sleep(interval);
++            if (got_sigterm | got_sigquit) {
++                shut_down_numad();
++            }
++            if (got_sighup) {
++                got_sighup = 0;
++                close_log_file();
++                open_log_file();
++            }
+         }
+         if (pthread_attr_destroy(&attr) != 0) {
+             numad_log(LOG_WARNING, "pthread_attr_destroy failure\n");
+         }
+         pthread_mutex_destroy(&pid_list_mutex);
+         pthread_mutex_destroy(&node_info_mutex);
+-    } else {
+-        shut_down_numad();
+     }
+     exit(EXIT_SUCCESS);
+ }
+diff -rup numad-0.5git/numad.init numad-0.5git-new/numad.init
+--- numad-0.5git/numad.init	2012-12-03 15:40:40.000000000 +0100
++++ numad-0.5git-new/numad.init	2016-08-30 08:45:19.000000000 +0200
+@@ -37,7 +37,7 @@ start() {
+     [ -f $config ] || exit 6
+     echo -n $"Starting $prog: "
+     . $config
+-    daemon "$exec -i $INTERVAL"
++    daemon $exec -i $INTERVAL
+     retval=$?
+     echo
+     [ $retval -eq 0 ] && touch $lockfile
diff --git a/SOURCES/numad.logrotate b/SOURCES/numad.logrotate
new file mode 100644
index 00000000..9ea1ab4b
--- /dev/null
+++ b/SOURCES/numad.logrotate
@@ -0,0 +1,8 @@
+/var/log/numad.log {
+    compress
+    copytruncate
+    maxage 60
+    missingok
+    rotate 5
+    size 1M
+}
diff --git a/SPECS/numad.spec b/SPECS/numad.spec
new file mode 100644
index 00000000..76554827
--- /dev/null
+++ b/SPECS/numad.spec
@@ -0,0 +1,141 @@
+# Build with the distribution's hardening flags (see changelog, #1092544).
+%global _hardened_build 1
+
+Name: numad
+Version: 0.5
+Release: 18.20150602git%{?dist}
+Summary: NUMA user daemon
+
+License: LGPLv2
+Group: System Environment/Daemons
+URL: http://git.fedorahosted.org/git/?p=numad.git
+# The source for this package was pulled from upstream's vcs.  Use the
+# following commands to generate the tarball:
+#   git clone git://git.fedorahosted.org/numad.git numad-0.5git
+#   tar --exclude-vcs -cJf numad-0.5git.tar.xz numad-0.5git/
+Source0: %{name}-%{version}git.tar.xz
+Source1: %{name}.logrotate
+# Patch0 drops the libraries from the Makefile's numad target; they are passed
+# through LDFLAGS in the build section instead.  Its paths are bare, hence -p0.
+Patch0: numad-0.5git-pthread.patch
+Patch1: numad-0.5git-version.patch
+# Patch2 adds the missing "m:" to the getopt string so that -m is recognized.
+Patch2: numad-0.5git-m-option.patch
+
+Requires: systemd-units, initscripts
+Requires(post): systemd-units, initscripts
+Requires(preun): systemd-units, initscripts
+BuildRequires: systemd-units
+
+ExcludeArch: s390 s390x %{arm}
+
+%description
+Numad, a daemon for NUMA (Non-Uniform Memory Architecture) systems,
+that monitors NUMA characteristics and manages placement of processes
+and memory to minimize memory latency and thus provide optimum performance.
+
+%prep
+%setup -q -n %{name}-%{version}git
+%patch0 -p0
+%patch1 -p1
+%patch2 -p1
+
+%build
+# The Makefile no longer names its libraries (Patch0), so link them explicitly.
+make CFLAGS="$RPM_OPT_FLAGS -std=gnu99" LDFLAGS="$RPM_LD_FLAGS -lpthread -lrt -lm"
+
+%install
+mkdir -p %{buildroot}%{_bindir}
+mkdir -p %{buildroot}%{_sysconfdir}/logrotate.d
+mkdir -p %{buildroot}%{_unitdir}
+mkdir -p %{buildroot}%{_mandir}/man8/
+install -p -m 644 numad.conf %{buildroot}%{_sysconfdir}/
+install -p -m 644 numad.service %{buildroot}%{_unitdir}/
+install -p -m 644 %SOURCE1 %{buildroot}%{_sysconfdir}/logrotate.d/%{name}
+# "make install" supplies the binary and the man page listed under files.
+make install prefix=%{buildroot}%{_prefix}
+
+%files
+%{_bindir}/numad
+%{_unitdir}/numad.service
+%config(noreplace) %{_sysconfdir}/numad.conf
+%config(noreplace) %{_sysconfdir}/logrotate.d/numad
+# Glob the man page so the entry does not depend on the compression suffix.
+%{_mandir}/man8/numad.8*
+
+%post
+%systemd_post numad.service
+
+%preun
+%systemd_preun numad.service
+
+%postun
+%systemd_postun numad.service
+
+%changelog
+* Mon Oct 30 2017 Jan Synáček <jsynacek@redhat.com> - 0.5-18.20150602git
+- Fix -m option (#1506477)
+
+* Tue Aug 30 2016 Jan Synáček <jsynacek@redhat.com> - 0.5-17.20150602git
+- Fix the version patch (#1281711)
+
+* Mon Jul 11 2016 Jan Synáček <jsynacek@redhat.com> - 0.5-16.20150602git
+- Version update (#1281711 #1238614 #1235164)
+
+* Thu May 26 2016 Jan Synáček <jsynacek@redhat.com> - 0.5-15.20140620git
+- Harden the build (#1092544)
+
+* Fri Sep  5 2014 Jan Synáček <jsynacek@redhat.com> - 0.5-14.20140620git
+- Version update
+- Resolves: #1112109
+
+* Wed Mar 26 2014 Jan Synáček <jsynacek@redhat.com> - 0.5-13.20140225git
+- Build with $RPM_OPT_FLAGS and $RPM_LD_FLAGS
+- Resolves: #1070781
+
+* Fri Feb 28 2014 Jan Synáček <jsynacek@redhat.com> - 0.5-12.20140225git
+- Update source (20140225) and manpage
+- Add logrotate config
+- Resolves: #853232
+
+* Fri Dec 27 2013 Daniel Mach <dmach@redhat.com> - 0.5-11.20121130git
+- Mass rebuild 2013-12-27
+
+* Thu Feb 14 2013 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 0.5-10.20121130git
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_19_Mass_Rebuild
+
+* Tue Dec 11 2012 Jan Synáček <jsynacek@redhat.com> - 0.5-9.20121130git
+- Update and comment the Makefile patch
+- Related: #825153
+
+* Mon Dec 03 2012 Jan Synáček <jsynacek@redhat.com> - 0.5-8.20121130git
+- Update to 20121130
+- Update spec: fix command to generate tarball
+
+* Tue Oct 16 2012 Jan Synáček <jsynacek@redhat.com> - 0.5-7.20121015git
+- Update to 20121015
+- Add Makefile patch
+- Update spec: update command to generate tarball
+
+* Wed Aug 22 2012 Jan Synáček <jsynacek@redhat.com> - 0.5-6.20120522git
+- add systemd-rpm macros
+- Resolves: #850236
+
+* Fri Jul 20 2012 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 0.5-5.20120522git
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_18_Mass_Rebuild
+
+* Wed May 23 2012 Jan Synáček <jsynacek@redhat.com> - 0.5-4.20120522git
+- update source (20120522) and manpage
+
+* Tue Mar 06 2012 Jan Synáček <jsynacek@redhat.com> 0.5-3.20120221git
+- update source
+- drop the patch
+
+* Fri Feb 24 2012 Jan Synáček <jsynacek@redhat.com> 0.5-2.20120221git
+- add BuildRequires: systemd-units
+
+* Wed Feb 15 2012 Jan Synáček <jsynacek@redhat.com> 0.5-1.20120221git
+- spec update
+
+* Fri Feb 10 2012 Bill Burns <bburns@redhat.com> 0.5-1
+- initial version