You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
2256 lines
97 KiB
2256 lines
97 KiB
7 years ago
|
--- numad-0.5git/numad.c 2012-12-03 15:40:40.000000000 +0100
|
||
|
+++ new-rhel7/numad.c 2014-02-27 10:02:58.000000000 +0100
|
||
|
@@ -19,7 +19,7 @@ Inc., 59 Temple Place, Suite 330, Boston
|
||
|
*/
|
||
|
|
||
|
|
||
|
-// Compile with: gcc -O -std=gnu99 -Wall -pthread -o numad numad.c -lrt
|
||
|
+// Compile with: gcc -std=gnu99 -g -Wall -pthread -o numad numad.c -lrt -lm
|
||
|
|
||
|
|
||
|
#define _GNU_SOURCE
|
||
|
@@ -54,7 +54,7 @@ Inc., 59 Temple Place, Suite 330, Boston
|
||
|
#include <values.h>
|
||
|
|
||
|
|
||
|
-#define VERSION_STRING "20121130"
|
||
|
+#define VERSION_STRING "20140225"
|
||
|
|
||
|
|
||
|
#define VAR_RUN_FILE "/var/run/numad.pid"
|
||
|
@@ -86,15 +86,11 @@ char *cpuset_dir_list[] = {
|
||
|
#define MAX_INTERVAL 15
|
||
|
#define CPU_THRESHOLD 50
|
||
|
#define MEMORY_THRESHOLD 300
|
||
|
+#define THP_SCAN_SLEEP_MS 1000
|
||
|
#define TARGET_UTILIZATION_PERCENT 85
|
||
|
-#define IMPROVEMENT_THRESHOLD_PERCENT 5
|
||
|
+#define DEFAULT_HTT_PERCENT 20
|
||
|
|
||
|
|
||
|
-#define ELIM_NEW_LINE(s) \
|
||
|
- if (s[strlen(s) - 1] == '\n') { \
|
||
|
- s[strlen(s) - 1] = '\0'; \
|
||
|
- }
|
||
|
-
|
||
|
#define CONVERT_DIGITS_TO_NUM(p, n) \
|
||
|
n = *p++ - '0'; \
|
||
|
while (isdigit(*p)) { \
|
||
|
@@ -105,19 +101,36 @@ char *cpuset_dir_list[] = {
|
||
|
|
||
|
int num_cpus = 0;
|
||
|
int num_nodes = 0;
|
||
|
+int threads_per_core = 0;
|
||
|
int page_size_in_bytes = 0;
|
||
|
int huge_page_size_in_bytes = 0;
|
||
|
+int thp_scan_sleep_ms = THP_SCAN_SLEEP_MS;
|
||
|
|
||
|
int min_interval = MIN_INTERVAL;
|
||
|
int max_interval = MAX_INTERVAL;
|
||
|
+int htt_percent = DEFAULT_HTT_PERCENT;
|
||
|
int target_utilization = TARGET_UTILIZATION_PERCENT;
|
||
|
int scan_all_processes = 1;
|
||
|
int keep_interleaved_memory = 0;
|
||
|
+int use_inactive_file_cache = 1;
|
||
|
|
||
|
pthread_mutex_t pid_list_mutex;
|
||
|
pthread_mutex_t node_info_mutex;
|
||
|
+long sum_CPUs_total = 0;
|
||
|
int requested_mbs = 0;
|
||
|
int requested_cpus = 0;
|
||
|
+int got_sighup = 0;
|
||
|
+int got_sigterm = 0;
|
||
|
+int got_sigquit = 0;
|
||
|
+
|
||
|
+
|
||
|
+void sig_handler(int signum) {
|
||
|
+ switch (signum) {
|
||
|
+ case SIGHUP: got_sighup = 1; break;
|
||
|
+ case SIGTERM: got_sigterm = 1; break;
|
||
|
+ case SIGQUIT: got_sigquit = 1; break;
|
||
|
+ }
|
||
|
+}
|
||
|
|
||
|
|
||
|
|
||
|
@@ -161,7 +174,9 @@ void open_log_file() {
|
||
|
|
||
|
void close_log_file() {
|
||
|
if (log_fs != NULL) {
|
||
|
- fclose(log_fs);
|
||
|
+ if (log_fs != stderr) {
|
||
|
+ fclose(log_fs);
|
||
|
+ }
|
||
|
log_fs = NULL;
|
||
|
}
|
||
|
}
|
||
|
@@ -233,7 +248,6 @@ void send_msg(long dst_pid, long cmd, lo
|
||
|
}
|
||
|
|
||
|
|
||
|
-
|
||
|
typedef struct id_list {
|
||
|
// Use CPU_SET(3) <sched.h> cpuset bitmasks,
|
||
|
// but bundle size and pointer together
|
||
|
@@ -242,16 +256,22 @@ typedef struct id_list {
|
||
|
size_t bytes;
|
||
|
} id_list_t, *id_list_p;
|
||
|
|
||
|
-#define INIT_ID_LIST(list_p) \
|
||
|
+#define INIT_ID_LIST(list_p, num_elements) \
|
||
|
list_p = malloc(sizeof(id_list_t)); \
|
||
|
if (list_p == NULL) { numad_log(LOG_CRIT, "INIT_ID_LIST malloc failed\n"); exit(EXIT_FAILURE); } \
|
||
|
- list_p->set_p = CPU_ALLOC(num_cpus); \
|
||
|
+ list_p->set_p = CPU_ALLOC(num_elements); \
|
||
|
if (list_p->set_p == NULL) { numad_log(LOG_CRIT, "CPU_ALLOC failed\n"); exit(EXIT_FAILURE); } \
|
||
|
- list_p->bytes = CPU_ALLOC_SIZE(num_cpus);
|
||
|
+ list_p->bytes = CPU_ALLOC_SIZE(num_elements);
|
||
|
+
|
||
|
+#define CLEAR_CPU_LIST(list_p) \
|
||
|
+ if (list_p == NULL) { \
|
||
|
+ INIT_ID_LIST(list_p, num_cpus); \
|
||
|
+ } \
|
||
|
+ CPU_ZERO_S(list_p->bytes, list_p->set_p)
|
||
|
|
||
|
-#define CLEAR_LIST(list_p) \
|
||
|
+#define CLEAR_NODE_LIST(list_p) \
|
||
|
if (list_p == NULL) { \
|
||
|
- INIT_ID_LIST(list_p); \
|
||
|
+ INIT_ID_LIST(list_p, num_nodes); \
|
||
|
} \
|
||
|
CPU_ZERO_S(list_p->bytes, list_p->set_p)
|
||
|
|
||
|
@@ -262,6 +282,9 @@ typedef struct id_list {
|
||
|
list_p = NULL; \
|
||
|
}
|
||
|
|
||
|
+#define COPY_LIST(orig_list_p, copy_list_p) \
|
||
|
+ memcpy(copy_list_p->set_p, orig_list_p->set_p, orig_list_p->bytes)
|
||
|
+
|
||
|
#define NUM_IDS_IN_LIST(list_p) CPU_COUNT_S(list_p->bytes, list_p->set_p)
|
||
|
#define ADD_ID_TO_LIST(k, list_p) CPU_SET_S(k, list_p->bytes, list_p->set_p)
|
||
|
#define CLR_ID_IN_LIST(k, list_p) CPU_CLR_S(k, list_p->bytes, list_p->set_p)
|
||
|
@@ -272,6 +295,25 @@ typedef struct id_list {
|
||
|
#define OR_LISTS( or_list_p, list_1_p, list_2_p) CPU_OR_S( or_list_p->bytes, or_list_p->set_p, list_1_p->set_p, list_2_p->set_p)
|
||
|
#define XOR_LISTS(xor_list_p, list_1_p, list_2_p) CPU_XOR_S(xor_list_p->bytes, xor_list_p->set_p, list_1_p->set_p, list_2_p->set_p)
|
||
|
|
||
|
+int negate_list(id_list_p list_p) {
|
||
|
+ if (list_p == NULL) {
|
||
|
+ numad_log(LOG_CRIT, "Cannot negate a NULL list\n");
|
||
|
+ exit(EXIT_FAILURE);
|
||
|
+ }
|
||
|
+ if (num_cpus < 1) {
|
||
|
+ numad_log(LOG_CRIT, "No CPUs to negate in list!\n");
|
||
|
+ exit(EXIT_FAILURE);
|
||
|
+ }
|
||
|
+ for (int ix = 0; (ix < num_cpus); ix++) {
|
||
|
+ if (ID_IS_IN_LIST(ix, list_p)) {
|
||
|
+ CLR_ID_IN_LIST(ix, list_p);
|
||
|
+ } else {
|
||
|
+ ADD_ID_TO_LIST(ix, list_p);
|
||
|
+ }
|
||
|
+ }
|
||
|
+ return NUM_IDS_IN_LIST(list_p);
|
||
|
+}
|
||
|
+
|
||
|
int add_ids_to_list_from_str(id_list_p list_p, char *s) {
|
||
|
if (list_p == NULL) {
|
||
|
numad_log(LOG_CRIT, "Cannot add to NULL list\n");
|
||
|
@@ -340,6 +382,25 @@ terminate_string:
|
||
|
return (p - str_p);
|
||
|
}
|
||
|
|
||
|
+id_list_p all_cpus_list_p = NULL;
|
||
|
+id_list_p all_nodes_list_p = NULL;
|
||
|
+char *reserved_cpu_str = NULL;
|
||
|
+id_list_p reserved_cpu_mask_list_p = NULL;
|
||
|
+uint64_t node_info_time_stamp = 0;
|
||
|
+
|
||
|
+
|
||
|
+int read_one_line(char *buf, int buf_size, char *fname) {
|
||
|
+ int fd = open(fname, O_RDONLY, 0);
|
||
|
+ if (fd < 0) {
|
||
|
+ return fd;
|
||
|
+ }
|
||
|
+ int bytes = read(fd, buf, buf_size);
|
||
|
+ if (buf[bytes - 1] == '\n') {
|
||
|
+ buf[bytes - 1] = '\0';
|
||
|
+ }
|
||
|
+ close(fd);
|
||
|
+ return bytes;
|
||
|
+}
|
||
|
|
||
|
|
||
|
typedef struct node_data {
|
||
|
@@ -355,6 +416,16 @@ typedef struct node_data {
|
||
|
|
||
|
node_data_p node = NULL;
|
||
|
|
||
|
+uint64_t min_node_CPUs_free = MAXINT;
|
||
|
+uint64_t min_node_MBs_free = MAXINT;
|
||
|
+uint64_t max_node_CPUs_free = 0;
|
||
|
+uint64_t max_node_MBs_free = 0;
|
||
|
+uint64_t avg_node_CPUs_free = 0;
|
||
|
+uint64_t avg_node_MBs_free = 0;
|
||
|
+double stddev_node_CPUs_free = 0.0;
|
||
|
+double stddev_node_MBs_free = 0.0;
|
||
|
+
|
||
|
+
|
||
|
// RING_BUF_SIZE must be a power of two
|
||
|
#define RING_BUF_SIZE 8
|
||
|
|
||
|
@@ -366,14 +437,14 @@ typedef struct process_data {
|
||
|
uint64_t data_time_stamp; // hundredths of seconds
|
||
|
uint64_t bind_time_stamp;
|
||
|
uint64_t num_threads;
|
||
|
+ uint64_t MBs_size;
|
||
|
uint64_t MBs_used;
|
||
|
uint64_t cpu_util;
|
||
|
uint64_t CPUs_used; // scaled * ONE_HUNDRED
|
||
|
uint64_t CPUs_used_ring_buf[RING_BUF_SIZE];
|
||
|
int ring_buf_ix;
|
||
|
- int dup_bind_count;
|
||
|
char *comm;
|
||
|
- char *cpuset_name;
|
||
|
+ id_list_p node_list_p;
|
||
|
} process_data_t, *process_data_p;
|
||
|
|
||
|
|
||
|
@@ -454,12 +525,15 @@ int process_hash_update(process_data_p n
|
||
|
}
|
||
|
p->CPUs_used = max_CPUs_used;
|
||
|
}
|
||
|
+// FIXME: seems like this comm check should not be necessary every update
|
||
|
+// But it does happen only for candidates that cross the memory threshold...
|
||
|
if ((!p->comm) || (strcmp(p->comm, newp->comm))) {
|
||
|
if (p->comm) {
|
||
|
free(p->comm);
|
||
|
}
|
||
|
p->comm = strdup(newp->comm);
|
||
|
}
|
||
|
+ p->MBs_size = newp->MBs_size;
|
||
|
p->MBs_used = newp->MBs_used;
|
||
|
p->cpu_util = newp->cpu_util;
|
||
|
p->num_threads = newp->num_threads;
|
||
|
@@ -468,6 +542,11 @@ int process_hash_update(process_data_p n
|
||
|
return new_hash_table_entry;
|
||
|
}
|
||
|
|
||
|
+void process_hash_clear_all_bind_time_stamps() {
|
||
|
+ for (int ix = 0; (ix < process_hash_table_size); ix++) {
|
||
|
+ process_hash_table[ix].bind_time_stamp = 0;
|
||
|
+ }
|
||
|
+}
|
||
|
|
||
|
int process_hash_rehash(int old_ix) {
|
||
|
// Given the index of a table entry that would otherwise be orphaned by
|
||
|
@@ -489,7 +568,7 @@ int process_hash_remove(int pid) {
|
||
|
// remove the target
|
||
|
process_data_p dp = &process_hash_table[ix];
|
||
|
if (dp->comm) { free(dp->comm); }
|
||
|
- if (dp->cpuset_name) { free(dp->cpuset_name); }
|
||
|
+ FREE_LIST(dp->node_list_p);
|
||
|
memset(dp, 0, sizeof(process_data_t));
|
||
|
// bubble up the collision chain and rehash if neeeded
|
||
|
for (;;) {
|
||
|
@@ -543,15 +622,29 @@ void process_hash_table_dump() {
|
||
|
process_data_p p = &process_hash_table[ix];
|
||
|
if (p->pid) {
|
||
|
numad_log(LOG_DEBUG,
|
||
|
- "ix: %d PID: %d %s Thds: %d CPU %ld MBs: %ld Data TS: %ld Bind TS: %ld\n",
|
||
|
+ "ix: %d PID: %d %s Thds: %d CPU %ld MBs: %ld/%ld Data TS: %ld Bind TS: %ld\n",
|
||
|
ix, p->pid, ((p->comm != NULL) ? p->comm : "(Null)"), p->num_threads,
|
||
|
- p->CPUs_used, p->MBs_used, p->data_time_stamp, p->bind_time_stamp);
|
||
|
+ p->CPUs_used, p->MBs_used, p->MBs_size, p->data_time_stamp, p->bind_time_stamp);
|
||
|
+ // FIXME: make this dump every field
|
||
|
+ }
|
||
|
+ }
|
||
|
+}
|
||
|
+
|
||
|
+void remove_obsolete_cpuset_if_no_tasks(int pid) {
|
||
|
+ // PID parameter has already been checked via kill(0) and seems dead
|
||
|
+ char buf[BUF_SIZE];
|
||
|
+ char fname[FNAME_SIZE];
|
||
|
+ snprintf(fname, FNAME_SIZE, "%s/numad.%d/tasks", cpuset_dir, pid);
|
||
|
+ if ((access(fname, F_OK) == 0) && (read_one_line(buf, BUF_SIZE, fname) <= 1)) {
|
||
|
+ snprintf(fname, FNAME_SIZE, "%s/numad.%d", cpuset_dir, pid);
|
||
|
+ numad_log(LOG_NOTICE, "Removing obsolete cpuset: %s\n", fname);
|
||
|
+ if (rmdir(fname) < 0) {
|
||
|
+ numad_log(LOG_ERR, "bad cpuset rmdir\n");
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void process_hash_table_cleanup(uint64_t update_time) {
|
||
|
- int cpusets_removed = 0;
|
||
|
int num_hash_entries_used = 0;
|
||
|
for (int ix = 0; (ix < process_hash_table_size); ix++) {
|
||
|
process_data_p p = &process_hash_table[ix];
|
||
|
@@ -562,40 +655,56 @@ void process_hash_table_cleanup(uint64_t
|
||
|
p->data_time_stamp = 0;
|
||
|
p->CPUs_used = 0;
|
||
|
// Check for dead pids and remove them...
|
||
|
- char fname[FNAME_SIZE];
|
||
|
- snprintf(fname, FNAME_SIZE, "/proc/%d", p->pid);
|
||
|
- if (access(fname, F_OK) < 0) {
|
||
|
+ if ((kill(p->pid, 0) == -1) && (errno == ESRCH)) {
|
||
|
// Seems dead. Forget this pid -- after first checking
|
||
|
// and removing obsolete numad.PID cpuset directories.
|
||
|
- snprintf(fname, FNAME_SIZE, "%s/numad.%d", cpuset_dir, p->pid);
|
||
|
- if (access(fname, F_OK) == 0) {
|
||
|
- numad_log(LOG_NOTICE, "Removing obsolete cpuset: %s\n", fname);
|
||
|
- int rc = rmdir(fname);
|
||
|
- if (rc >= 0) {
|
||
|
- cpusets_removed += 1;
|
||
|
- } else {
|
||
|
- numad_log(LOG_ERR, "bad cpuset rmdir\n");
|
||
|
- // exit(EXIT_FAILURE);
|
||
|
- }
|
||
|
- }
|
||
|
+ remove_obsolete_cpuset_if_no_tasks(p->pid);
|
||
|
process_hash_remove(p->pid);
|
||
|
num_hash_entries_used -= 1;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
- if (cpusets_removed > 0) {
|
||
|
- // Expire all the duplicate bind counts so things will be re-evaluated sooner.
|
||
|
- for (int ix = 0; (ix < process_hash_table_size); ix++) {
|
||
|
- process_hash_table[ix].dup_bind_count = 0;
|
||
|
- }
|
||
|
- }
|
||
|
// Keep hash table approximately half empty
|
||
|
if ((num_hash_entries_used * 7) / 4 > process_hash_table_size) {
|
||
|
process_hash_table_expand();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
+static int name_starts_with_numad(const struct dirent *dptr) {
|
||
|
+ return (strncmp(dptr->d_name, "numad.", 6) == 0);
|
||
|
+}
|
||
|
+
|
||
|
+void *clean_obsolete_cpusets(void *arg) {
|
||
|
+ // int arg_value = *(int *)arg;
|
||
|
+ for (;;) {
|
||
|
+ // Loop here forever (slowly) cleaning obsolete cpusets
|
||
|
+ sleep(571); // Arbitrary number a little less than ten minutes
|
||
|
+ struct dirent **namelist;
|
||
|
+ int files = scandir(cpuset_dir, &namelist, name_starts_with_numad, NULL);
|
||
|
+ if (files < 0) {
|
||
|
+ numad_log(LOG_ERR, "Troubled scanning for obsolete cpusets\n");
|
||
|
+ continue;
|
||
|
+ }
|
||
|
+ for (int ix = 0; (ix < files); ix++) {
|
||
|
+ char *p = &(namelist[ix]->d_name[6]);
|
||
|
+ if (isdigit(*p)) {
|
||
|
+ int pid;
|
||
|
+ CONVERT_DIGITS_TO_NUM(p, pid);
|
||
|
+ // If it seems like a valid PID -- that is NOT in the hash
|
||
|
+ // table -- and the process appears to be dead, then try to
|
||
|
+ // delete the cpuset directory. (Dead PIDs we know about in
|
||
|
+ // the hash table will be cleaned separately.)
|
||
|
+ if ((pid > 10) && (process_hash_lookup(pid) < 0)
|
||
|
+ && (kill(pid, 0) == -1) && (errno == ESRCH)) {
|
||
|
+ remove_obsolete_cpuset_if_no_tasks(pid);
|
||
|
+ }
|
||
|
+ }
|
||
|
+ free(namelist[ix]);
|
||
|
+ }
|
||
|
+ free(namelist);
|
||
|
+ }
|
||
|
+}
|
||
|
|
||
|
|
||
|
typedef struct pid_list {
|
||
|
@@ -610,9 +719,7 @@ pid_list_p insert_pid_into_pid_list(pid_
|
||
|
if (process_hash_table != NULL) {
|
||
|
int hash_ix = process_hash_lookup(pid);
|
||
|
if ((hash_ix >= 0) && (list_ptr == include_pid_list)) {
|
||
|
- // Clear dup_bind_count and interleaved flag,
|
||
|
- // in case user wants it to be re-evaluated soon
|
||
|
- process_hash_table[hash_ix].dup_bind_count = 0;
|
||
|
+ // Clear interleaved flag, in case user wants it to be re-evaluated
|
||
|
process_hash_table[hash_ix].flags &= ~PROCESS_FLAG_INTERLEAVED;
|
||
|
}
|
||
|
}
|
||
|
@@ -678,17 +785,22 @@ void print_version_and_exit(char *prog_n
|
||
|
|
||
|
void print_usage_and_exit(char *prog_name) {
|
||
|
fprintf(stderr, "Usage: %s <options> ...\n", prog_name);
|
||
|
+ fprintf(stderr, "-C 1 to count inactive file cache as available memory (default 1)\n");
|
||
|
+ fprintf(stderr, "-C 0 to count inactive file cache memory as unavailable (default 1)\n");
|
||
|
fprintf(stderr, "-d for debug logging (same effect as '-l 7')\n");
|
||
|
fprintf(stderr, "-D <CGROUP_MOUNT_POINT> to specify cgroup mount point\n");
|
||
|
fprintf(stderr, "-h to print this usage info\n");
|
||
|
+ fprintf(stderr, "-H <N> to set THP scan_sleep_ms (default 1000)\n");
|
||
|
fprintf(stderr, "-i [<MIN>:]<MAX> to specify interval seconds\n");
|
||
|
- fprintf(stderr, "-K 1 to keep interleaved memory spread across nodes\n");
|
||
|
- fprintf(stderr, "-K 0 to merge interleaved memory to local NUMA nodes\n");
|
||
|
- fprintf(stderr, "-l <N> to specify logging level (usually 5, 6, or 7)\n");
|
||
|
+ fprintf(stderr, "-K 1 to keep interleaved memory spread across nodes (default 0)\n");
|
||
|
+ fprintf(stderr, "-K 0 to merge interleaved memory to local NUMA nodes (default 0)\n");
|
||
|
+ fprintf(stderr, "-l <N> to specify logging level (usually 5, 6, or 7 -- default 5)\n");
|
||
|
fprintf(stderr, "-p <PID> to add PID to inclusion pid list\n");
|
||
|
fprintf(stderr, "-r <PID> to remove PID from explicit pid lists\n");
|
||
|
- fprintf(stderr, "-S 1 to scan all processes\n");
|
||
|
- fprintf(stderr, "-S 0 to scan only explicit PID list processes\n");
|
||
|
+ fprintf(stderr, "-R <CPU_LIST> to reserve some CPUs for non-numad use\n");
|
||
|
+ fprintf(stderr, "-S 1 to scan all processes (default 1)\n");
|
||
|
+ fprintf(stderr, "-S 0 to scan only explicit PID list processes (default 1)\n");
|
||
|
+ fprintf(stderr, "-t <N> to specify thread / logical CPU percent (default 20)\n");
|
||
|
fprintf(stderr, "-u <N> to specify target utilization percent (default 85)\n");
|
||
|
fprintf(stderr, "-v for verbose (same effect as '-l 6')\n");
|
||
|
fprintf(stderr, "-V to show version info\n");
|
||
|
@@ -698,6 +810,32 @@ void print_usage_and_exit(char *prog_nam
|
||
|
}
|
||
|
|
||
|
|
||
|
+void set_thp_scan_sleep_ms(int new_ms) {
|
||
|
+ if (new_ms < 1) {
|
||
|
+ // 0 means do not change the system default
|
||
|
+ return;
|
||
|
+ }
|
||
|
+ char *thp_scan_fname = "/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs";
|
||
|
+ int fd = open(thp_scan_fname, O_RDWR, 0);
|
||
|
+ if (fd >= 0) {
|
||
|
+ char buf[BUF_SIZE];
|
||
|
+ int bytes = read(fd, buf, BUF_SIZE);
|
||
|
+ if (bytes > 0) {
|
||
|
+ int cur_ms;
|
||
|
+ char *p = buf;
|
||
|
+ CONVERT_DIGITS_TO_NUM(p, cur_ms);
|
||
|
+ if (cur_ms != new_ms) {
|
||
|
+ lseek(fd, 0, SEEK_SET);
|
||
|
+ numad_log(LOG_NOTICE, "Changing THP scan time in %s from %d to %d ms.\n", thp_scan_fname, cur_ms, new_ms);
|
||
|
+ sprintf(buf, "%d\n", new_ms);
|
||
|
+ write(fd, buf, strlen(buf));
|
||
|
+ }
|
||
|
+ }
|
||
|
+ close(fd);
|
||
|
+ }
|
||
|
+}
|
||
|
+
|
||
|
+
|
||
|
void check_prereqs(char *prog_name) {
|
||
|
// Verify cpusets are available on this system.
|
||
|
char **dir = &cpuset_dir_list[0];
|
||
|
@@ -730,30 +868,8 @@ void check_prereqs(char *prog_name) {
|
||
|
fprintf(stderr, "\n");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
- // Check on THP scan sleep time.
|
||
|
- char *thp_scan_fname = "/sys/kernel/mm/redhat_transparent_hugepage/khugepaged/scan_sleep_millisecs";
|
||
|
- int fd = open(thp_scan_fname, O_RDONLY, 0);
|
||
|
- if (fd >= 0) {
|
||
|
- int ms;
|
||
|
- char buf[BUF_SIZE];
|
||
|
- int bytes = read(fd, buf, BUF_SIZE);
|
||
|
- close(fd);
|
||
|
- if (bytes > 0) {
|
||
|
- char *p = buf;
|
||
|
- CONVERT_DIGITS_TO_NUM(p, ms);
|
||
|
- if (ms > 150) {
|
||
|
- fprintf(stderr, "\n");
|
||
|
- numad_log(LOG_NOTICE, "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms);
|
||
|
- fprintf(stderr, "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms);
|
||
|
- fprintf(stderr, "Consider increasing the frequency of THP scanning,\n");
|
||
|
- fprintf(stderr, "by echoing a smaller number (e.g. 100) to %s\n", thp_scan_fname);
|
||
|
- fprintf(stderr, "to more aggressively (re)construct THPs. For example:\n");
|
||
|
- fprintf(stderr, "# echo 100 > /sys/kernel/mm/redhat_transparent_hugepage/khugepaged/scan_sleep_millisecs\n");
|
||
|
- fprintf(stderr, "\n");
|
||
|
- }
|
||
|
- }
|
||
|
- }
|
||
|
- // FIXME: ?? check for enabled ksmd, and recommend disabling ksm?
|
||
|
+ // Adjust kernel tunable to scan for THP more frequently...
|
||
|
+ set_thp_scan_sleep_ms(thp_scan_sleep_ms);
|
||
|
}
|
||
|
|
||
|
|
||
|
@@ -831,6 +947,43 @@ fail_numad_run_file:
|
||
|
}
|
||
|
|
||
|
|
||
|
+int count_set_bits_in_hex_list_file(char *fname) {
|
||
|
+ int sum = 0;
|
||
|
+ int fd = open(fname, O_RDONLY, 0);
|
||
|
+ if (fd >= 0) {
|
||
|
+ char buf[BUF_SIZE];
|
||
|
+ int bytes = read(fd, buf, BUF_SIZE);
|
||
|
+ close(fd);
|
||
|
+ for (int ix = 0; (ix < bytes); ix++) {
|
||
|
+ char c = tolower(buf[ix]);
|
||
|
+ switch (c) {
|
||
|
+ case '0' : sum += 0; break;
|
||
|
+ case '1' : sum += 1; break;
|
||
|
+ case '2' : sum += 1; break;
|
||
|
+ case '3' : sum += 2; break;
|
||
|
+ case '4' : sum += 1; break;
|
||
|
+ case '5' : sum += 2; break;
|
||
|
+ case '6' : sum += 2; break;
|
||
|
+ case '7' : sum += 3; break;
|
||
|
+ case '8' : sum += 1; break;
|
||
|
+ case '9' : sum += 2; break;
|
||
|
+ case 'a' : sum += 2; break;
|
||
|
+ case 'b' : sum += 3; break;
|
||
|
+ case 'c' : sum += 2; break;
|
||
|
+ case 'd' : sum += 3; break;
|
||
|
+ case 'e' : sum += 3; break;
|
||
|
+ case 'f' : sum += 4; break;
|
||
|
+ case ' ' : sum += 0; break;
|
||
|
+ case ',' : sum += 0; break;
|
||
|
+ case '\n' : sum += 0; break;
|
||
|
+ default : numad_log(LOG_CRIT, "Unexpected character in list\n"); exit(EXIT_FAILURE);
|
||
|
+ }
|
||
|
+ }
|
||
|
+ }
|
||
|
+ return sum;
|
||
|
+}
|
||
|
+
|
||
|
+
|
||
|
int get_num_cpus() {
|
||
|
int n1 = sysconf(_SC_NPROCESSORS_CONF);
|
||
|
int n2 = sysconf(_SC_NPROCESSORS_ONLN);
|
||
|
@@ -916,129 +1069,244 @@ static int name_starts_with_digit(const
|
||
|
}
|
||
|
|
||
|
|
||
|
-int bind_process_and_migrate_memory(int pid, char *cpuset_name, id_list_p node_list_p, id_list_p cpu_list_p) {
|
||
|
- // Check basic parameter validity.
|
||
|
- if (pid <= 0) {
|
||
|
+int write_to_cpuset_file(char *fname, char *s) {
|
||
|
+ int fd = open(fname, O_WRONLY | O_TRUNC, 0);
|
||
|
+ if (fd == -1) {
|
||
|
+ numad_log(LOG_CRIT, "Could not open %s -- errno: %d\n", fname, errno);
|
||
|
+ return -1;
|
||
|
+ }
|
||
|
+ numad_log(LOG_DEBUG, "Writing %s to: %s\n", s, fname);
|
||
|
+ if (write(fd, s, strlen(s)) <= 0) {
|
||
|
+ numad_log(LOG_CRIT, "Could not write %s to %s -- errno: %d\n", s, fname, errno);
|
||
|
+ return -1;
|
||
|
+ }
|
||
|
+ close(fd);
|
||
|
+ return 0;
|
||
|
+}
|
||
|
+
|
||
|
+int configure_cpuset(char *cpuset_name, char *node_list_str, char *cpu_list_str) {
|
||
|
+ int rc = 0;
|
||
|
+ char fname[FNAME_SIZE];
|
||
|
+ // Write "1" out to cpuset.memory_migrate file
|
||
|
+ snprintf(fname, FNAME_SIZE, "%s/cpuset.memory_migrate", cpuset_name);
|
||
|
+ rc += write_to_cpuset_file(fname, "1");
|
||
|
+ // For memory binding, write node IDs out to cpuset.mems file
|
||
|
+ snprintf(fname, FNAME_SIZE, "%s/cpuset.mems", cpuset_name);
|
||
|
+ rc += write_to_cpuset_file(fname, node_list_str);
|
||
|
+ // For CPU binding, write CPU IDs out to cpuset.cpus file
|
||
|
+ snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", cpuset_name);
|
||
|
+ rc += write_to_cpuset_file(fname, cpu_list_str);
|
||
|
+ return rc;
|
||
|
+}
|
||
|
+
|
||
|
+int bind_process_and_migrate_memory(process_data_p p) {
|
||
|
+ char buf[BUF_SIZE];
|
||
|
+ char fname[FNAME_SIZE];
|
||
|
+ char pid_cpuset_name[FNAME_SIZE];
|
||
|
+ uint64_t t0 = get_time_stamp();
|
||
|
+ // Parameter p is a pointer to an element in the hash table
|
||
|
+ if ((!p) || (p->pid < 1)) {
|
||
|
numad_log(LOG_CRIT, "Bad PID to bind\n");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
- if ((cpuset_name == NULL) || (strlen(cpuset_name) == 0)) {
|
||
|
- numad_log(LOG_CRIT, "Bad cpuset name to bind\n");
|
||
|
+ if (!p->node_list_p) {
|
||
|
+ numad_log(LOG_CRIT, "Cannot bind to unspecified node(s)\n");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
- int nodes;
|
||
|
- if ((node_list_p == NULL) || ((nodes = NUM_IDS_IN_LIST(node_list_p)) == 0)) {
|
||
|
- numad_log(LOG_CRIT, "Cannot bind to unspecified node\n");
|
||
|
- exit(EXIT_FAILURE);
|
||
|
+ // Get cpuset name for this PID, or make a new cpuset if necessary
|
||
|
+ snprintf(fname, FNAME_SIZE, "/proc/%d/cpuset", p->pid);
|
||
|
+ if (read_one_line(buf, BUF_SIZE, fname) <= 0) {
|
||
|
+ numad_log(LOG_WARNING, "Could not get cpuset of PID %d.\n", p->pid);
|
||
|
+ return 0; // Assume the process terminated
|
||
|
}
|
||
|
- // Cpu_list_p is optional and may be NULL...
|
||
|
- // Generate CPU id list from the specified node list if necessary
|
||
|
- if (cpu_list_p == NULL) {
|
||
|
- static id_list_p tmp_cpu_list_p;
|
||
|
- CLEAR_LIST(tmp_cpu_list_p);
|
||
|
- int node_id = 0;
|
||
|
- while (nodes) {
|
||
|
- if (ID_IS_IN_LIST(node_id, node_list_p)) {
|
||
|
- OR_LISTS(tmp_cpu_list_p, tmp_cpu_list_p, node[node_id].cpu_list_p);
|
||
|
- nodes -= 1;
|
||
|
- }
|
||
|
- node_id += 1;
|
||
|
- }
|
||
|
- cpu_list_p = tmp_cpu_list_p;
|
||
|
- }
|
||
|
- // Make the cpuset directory if necessary
|
||
|
- char cpuset_name_buf[FNAME_SIZE];
|
||
|
- snprintf(cpuset_name_buf, FNAME_SIZE, "%s%s", cpuset_dir, cpuset_name);
|
||
|
- char *p = &cpuset_name_buf[strlen(cpuset_dir)];
|
||
|
- if (!strcmp(p, "/")) {
|
||
|
- // Make a cpuset directory for this process
|
||
|
- snprintf(cpuset_name_buf, FNAME_SIZE, "%s/numad.%d", cpuset_dir, pid);
|
||
|
- numad_log(LOG_NOTICE, "Making new cpuset: %s\n", cpuset_name_buf);
|
||
|
- int rc = mkdir(cpuset_name_buf, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
|
||
|
- if (rc == -1) {
|
||
|
+ if (!strcmp(buf, "/")) {
|
||
|
+ // Default cpuset name, so make a new cpuset directory for this PID
|
||
|
+ snprintf(pid_cpuset_name, FNAME_SIZE, "%s/numad.%d", cpuset_dir, p->pid);
|
||
|
+ numad_log(LOG_NOTICE, "Making new cpuset: %s\n", pid_cpuset_name);
|
||
|
+ if (mkdir(pid_cpuset_name, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH) < 0) {
|
||
|
numad_log(LOG_CRIT, "Bad cpuset mkdir -- errno: %d\n", errno);
|
||
|
return 0;
|
||
|
}
|
||
|
+ // Temporarily enable all CPUs for a new cpuset...
|
||
|
+ char all_cpus_list_buf[BUF_SIZE];
|
||
|
+ str_from_id_list(all_cpus_list_buf, BUF_SIZE, all_cpus_list_p);
|
||
|
+ // Write CPU IDs out to cpuset.cpus file for CPU binding of main PID
|
||
|
+ snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", pid_cpuset_name);
|
||
|
+ if (write_to_cpuset_file(fname, all_cpus_list_buf) < 0) {
|
||
|
+ numad_log(LOG_CRIT, "Could not configure cpuset.cpus: %s\n", pid_cpuset_name);
|
||
|
+ return 0; // Assume the process terminated
|
||
|
+ }
|
||
|
+ } else {
|
||
|
+ // Save the existing nondefault cpuset name for this PID
|
||
|
+ snprintf(pid_cpuset_name, FNAME_SIZE, "%s%s", cpuset_dir, buf);
|
||
|
}
|
||
|
- cpuset_name = cpuset_name_buf;
|
||
|
- // Now that we have a cpuset for pid and a populated cpulist,
|
||
|
- // start the actual binding and migration.
|
||
|
- uint64_t t0 = get_time_stamp();
|
||
|
-
|
||
|
+ // Configure the main PID cpuset with desired nodes and memory migrate
|
||
|
+ // flag. Defer the CPU binding for the main PID until after the PID is
|
||
|
+ // actually written to the task file and the memory has been moved.
|
||
|
+ char node_list_buf[BUF_SIZE];
|
||
|
+ str_from_id_list(node_list_buf, BUF_SIZE, p->node_list_p);
|
||
|
// Write "1" out to cpuset.memory_migrate file
|
||
|
- char fname[FNAME_SIZE];
|
||
|
- snprintf(fname, FNAME_SIZE, "%s/cpuset.memory_migrate", cpuset_name);
|
||
|
+ snprintf(fname, FNAME_SIZE, "%s/cpuset.memory_migrate", pid_cpuset_name);
|
||
|
+ if (write_to_cpuset_file(fname, "1") < 0) {
|
||
|
+ numad_log(LOG_CRIT, "Could not configure cpuset: %s\n", pid_cpuset_name);
|
||
|
+ return 0; // Assume the process terminated
|
||
|
+ }
|
||
|
+ // For memory binding, write node IDs out to cpuset.mems file
|
||
|
+ snprintf(fname, FNAME_SIZE, "%s/cpuset.mems", pid_cpuset_name);
|
||
|
+ if (write_to_cpuset_file(fname, node_list_buf) < 0) {
|
||
|
+ numad_log(LOG_CRIT, "Could not configure cpuset: %s\n", pid_cpuset_name);
|
||
|
+ return 0; // Assume the process terminated
|
||
|
+ }
|
||
|
+ // Open the main PID cpuset tasks file and
|
||
|
+ // bind the main PID in the main cpuset now.
|
||
|
+ snprintf(fname, FNAME_SIZE, "%s/tasks", pid_cpuset_name);
|
||
|
int fd = open(fname, O_WRONLY | O_TRUNC, 0);
|
||
|
- if (fd == -1) {
|
||
|
- numad_log(LOG_CRIT, "Could not open cpuset.memory_migrate -- errno: %d\n", errno);
|
||
|
- return 0;
|
||
|
+ if (fd < 0) {
|
||
|
+ numad_log(LOG_CRIT, "Could not open %s -- errno: %d\n", fname, errno);
|
||
|
+ return 0; // Assume the process terminated
|
||
|
}
|
||
|
- write(fd, "1", 1);
|
||
|
- close(fd);
|
||
|
-
|
||
|
- // Write node IDs out to cpuset.mems file
|
||
|
- char node_list_buf[BUF_SIZE];
|
||
|
- snprintf(fname, FNAME_SIZE, "%s/cpuset.mems", cpuset_name);
|
||
|
- fd = open(fname, O_WRONLY | O_TRUNC, 0);
|
||
|
- if (fd == -1) {
|
||
|
- numad_log(LOG_CRIT, "Could not open cpuset.mems -- errno: %d\n", errno);
|
||
|
- return 0;
|
||
|
+ numad_log(LOG_NOTICE, "Including PID: %d in cpuset: %s\n", p->pid, pid_cpuset_name);
|
||
|
+ char pid_str[FNAME_SIZE];
|
||
|
+ snprintf(pid_str, FNAME_SIZE, "%d", p->pid);
|
||
|
+ if (write(fd, pid_str, strlen(pid_str)) <= 0) {
|
||
|
+ numad_log(LOG_CRIT, "Could not write %s to cpuset: %s -- errno: %d\n", pid_str, pid_cpuset_name, errno);
|
||
|
+ close(fd);
|
||
|
+ return 0; // Assume the process terminated
|
||
|
}
|
||
|
- int len = str_from_id_list(node_list_buf, BUF_SIZE, node_list_p);
|
||
|
- write(fd, node_list_buf, len);
|
||
|
- close(fd);
|
||
|
-
|
||
|
- // Write CPU IDs out to cpuset.cpus file
|
||
|
- char cpu_list_buf[BUF_SIZE];
|
||
|
- snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", cpuset_name);
|
||
|
- fd = open(fname, O_WRONLY | O_TRUNC, 0);
|
||
|
- if (fd == -1) {
|
||
|
- numad_log(LOG_CRIT, "Could not open cpuset.cpus -- errno: %d\n", errno);
|
||
|
- return 0;
|
||
|
+ // Generate CPU binding list derived from node bind list.
|
||
|
+ static id_list_p cpu_bind_list_p;
|
||
|
+ CLEAR_CPU_LIST(cpu_bind_list_p);
|
||
|
+ int nodes = NUM_IDS_IN_LIST(p->node_list_p);
|
||
|
+ int node_id = 0;
|
||
|
+ while (nodes) {
|
||
|
+ if (ID_IS_IN_LIST(node_id, p->node_list_p)) {
|
||
|
+ OR_LISTS(cpu_bind_list_p, cpu_bind_list_p, node[node_id].cpu_list_p);
|
||
|
+ nodes -= 1;
|
||
|
+ }
|
||
|
+ node_id += 1;
|
||
|
}
|
||
|
- len = str_from_id_list(cpu_list_buf, BUF_SIZE, cpu_list_p);
|
||
|
- write(fd, cpu_list_buf, len);
|
||
|
- close(fd);
|
||
|
-
|
||
|
- // Copy pid tasks one at a time to tasks file
|
||
|
- snprintf(fname, FNAME_SIZE, "%s/tasks", cpuset_name);
|
||
|
- fd = open(fname, O_WRONLY | O_TRUNC, 0);
|
||
|
- if (fd == -1) {
|
||
|
- numad_log(LOG_CRIT, "Could not open tasks -- errno: %d\n", errno);
|
||
|
- return 0;
|
||
|
+ char cpu_bind_list_buf[BUF_SIZE];
|
||
|
+ str_from_id_list(cpu_bind_list_buf, BUF_SIZE, cpu_bind_list_p);
|
||
|
+ // Write CPU IDs out to cpuset.cpus file for CPU binding of main PID
|
||
|
+ snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", pid_cpuset_name);
|
||
|
+ if (write_to_cpuset_file(fname, cpu_bind_list_buf) < 0) {
|
||
|
+ numad_log(LOG_CRIT, "Could not configure cpuset: %s\n", pid_cpuset_name);
|
||
|
+ return 0; // Assume the process terminated
|
||
|
}
|
||
|
- snprintf(fname, FNAME_SIZE, "/proc/%d/task", pid);
|
||
|
+ // Leave fd open in case process is multithreaded and we need to write more
|
||
|
+ // (sub) task IDs there. In case multithreaded, make sure all the subtasks
|
||
|
+ // for this PID are in a cpuset. If not already in cpuset, put them in the
|
||
|
+ // main cpuset. Start by getting the name list of all tasks for this PID.
|
||
|
struct dirent **namelist;
|
||
|
- int files = scandir(fname, &namelist, name_starts_with_digit, NULL);
|
||
|
- if (files < 0) {
|
||
|
- numad_log(LOG_WARNING, "Could not scandir task list\n");
|
||
|
+ snprintf(fname, FNAME_SIZE, "/proc/%d/task", p->pid);
|
||
|
+ int num_tasks = scandir(fname, &namelist, name_starts_with_digit, NULL);
|
||
|
+ if (num_tasks <= 0) {
|
||
|
+ numad_log(LOG_WARNING, "Could not scandir task list for PID: %d\n", p->pid);
|
||
|
+ close(fd);
|
||
|
return 0; // Assume the process terminated
|
||
|
}
|
||
|
- for (int ix = 0; (ix < files); ix++) {
|
||
|
- // copy pid tasks, one at a time
|
||
|
- numad_log(LOG_NOTICE, "Including task: %s\n", namelist[ix]->d_name);
|
||
|
- write(fd, namelist[ix]->d_name, strlen(namelist[ix]->d_name));
|
||
|
- free(namelist[ix]);
|
||
|
+ if (num_tasks == 1) {
|
||
|
+ // This is the normal nonthreaded case. No sub tasks -- only the
|
||
|
+ // single main PID task, which is already bound above...
|
||
|
+ free(namelist[0]);
|
||
|
+ } else {
|
||
|
+ // Multithreaded so check all of the multiple subtasks. Avoid redundant
|
||
|
+ // subtask cpuset configuration by keeping a list of unique cpusets as
|
||
|
+ // we check each subtask. If the subtasks have only default cpuset
|
||
|
+ // names, bind those subtasks into the main cpuset with the main PID
|
||
|
+ // instead of adding them to the list. (cpuset_list is static so we
|
||
|
+ // can reuse the allocated array of pointers.)
|
||
|
+ int num_names = 0;
|
||
|
+ static char **cpuset_list;
|
||
|
+ static int cpuset_list_size;
|
||
|
+ for (int ix = 0; (ix < num_tasks); ix++) {
|
||
|
+ // Check the cpuset name for each task
|
||
|
+ if (!strcmp(namelist[ix]->d_name, pid_str)) {
|
||
|
+ // This is the main PID task, which is already bound above. Skip it here.
|
||
|
+ free(namelist[ix]);
|
||
|
+ continue;
|
||
|
+ }
|
||
|
+ snprintf(fname, FNAME_SIZE, "/proc/%d/task/%s/cpuset", p->pid, namelist[ix]->d_name);
|
||
|
+ if (read_one_line(buf, BUF_SIZE, fname) <= 0) {
|
||
|
+ numad_log(LOG_WARNING, "Could not open %s. Assuming thread completed.\n", fname);
|
||
|
+ free(namelist[ix]);
|
||
|
+ continue;
|
||
|
+ }
|
||
|
+ if (strcmp(buf, "/")) {
|
||
|
+ // Subtask already has a nondefault cpuset name. Add this
|
||
|
+ // subtask cpuset name to the list of unique cpuset names. Do
|
||
|
+ // sequential search comparisons first to verify uniqueness.
|
||
|
+ snprintf(fname, FNAME_SIZE, "%s%s", cpuset_dir, buf);
|
||
|
+ int iy = 0;
|
||
|
+ while (iy < num_names) {
|
||
|
+ if (!strcmp(fname, cpuset_list[iy])) {
|
||
|
+ break; // because we already have this cpuset name in the list
|
||
|
+ }
|
||
|
+ iy += 1;
|
||
|
+ }
|
||
|
+ if (iy == num_names) {
|
||
|
+ // We got to the end of the cpulist, so this is a new cpuset name not yet in the list
|
||
|
+ if (num_names == cpuset_list_size) {
|
||
|
+ if (cpuset_list_size == 0) {
|
||
|
+ cpuset_list_size = 10;
|
||
|
+ } else {
|
||
|
+ cpuset_list_size *= 2;
|
||
|
+ }
|
||
|
+ cpuset_list = realloc(cpuset_list, (cpuset_list_size * sizeof(char *)));
|
||
|
+ if (cpuset_list == NULL) {
|
||
|
+ numad_log(LOG_CRIT, "realloc failed\n");
|
||
|
+ exit(EXIT_FAILURE);
|
||
|
+ }
|
||
|
+ }
|
||
|
+ // Configure this subtask cpuset and, if successful, save a
|
||
|
+ // copy of the name in the unique cpuset list.
|
||
|
+ if (configure_cpuset(fname, node_list_buf, cpu_bind_list_buf) < 0) {
|
||
|
+ numad_log(LOG_WARNING, "Could not configure cpuset %s. Assuming thread completed.\n", fname);
|
||
|
+ free(namelist[ix]);
|
||
|
+ continue;
|
||
|
+ } else {
|
||
|
+ cpuset_list[num_names++] = strdup(fname);
|
||
|
+ }
|
||
|
+ }
|
||
|
+ } else {
|
||
|
+ // This task ID has the default cpuset name. Just add this task ID to the main PID cpuset.
|
||
|
+ numad_log(LOG_NOTICE, "Including task: %s in cpuset: %s\n", namelist[ix]->d_name, pid_cpuset_name);
|
||
|
+ if (write(fd, namelist[ix]->d_name, strlen(namelist[ix]->d_name)) <= 0) {
|
||
|
+ numad_log(LOG_WARNING, "Could not write to cpuset: %s -- errno: %d\n", pid_cpuset_name, errno);
|
||
|
+ free(namelist[ix]);
|
||
|
+ continue; // Assuming thread completed.
|
||
|
+ }
|
||
|
+ }
|
||
|
+ free(namelist[ix]);
|
||
|
+ }
|
||
|
+ // Done with subtask unique cpuset names for this PID. Free them.
|
||
|
+ for (int ix = 0; (ix < num_names); ix++) {
|
||
|
+ free(cpuset_list[ix]);
|
||
|
+ }
|
||
|
}
|
||
|
free(namelist);
|
||
|
close(fd);
|
||
|
-
|
||
|
- uint64_t t1 = get_time_stamp();
|
||
|
// Check pid still active
|
||
|
- snprintf(fname, FNAME_SIZE, "/proc/%d", pid);
|
||
|
+ snprintf(fname, FNAME_SIZE, "/proc/%d", p->pid);
|
||
|
if (access(fname, F_OK) < 0) {
|
||
|
- numad_log(LOG_WARNING, "Could not migrate pid\n");
|
||
|
- return 0; // Assume the process terminated
|
||
|
+ numad_log(LOG_WARNING, "Could not migrate pid %d\n", p->pid);
|
||
|
+ return 0;
|
||
|
+ } else {
|
||
|
+ uint64_t t1 = get_time_stamp();
|
||
|
+ p->bind_time_stamp = t1;
|
||
|
+ numad_log(LOG_NOTICE, "PID %d moved to node(s) %s in %d.%d seconds\n", p->pid, node_list_buf, (t1-t0)/100, (t1-t0)%100);
|
||
|
+ return 1;
|
||
|
}
|
||
|
- numad_log(LOG_NOTICE, "PID %d moved to node(s) %s in %d.%d seconds\n", pid, node_list_buf, (t1-t0)/100, (t1-t0)%100);
|
||
|
- return 1;
|
||
|
}
|
||
|
|
||
|
|
||
|
void show_nodes() {
|
||
|
- time_t ts = time(NULL);
|
||
|
- fprintf(log_fs, "%s", ctime(&ts));
|
||
|
- fprintf(log_fs, "Nodes: %d\n", num_nodes);
|
||
|
+ fprintf(log_fs, "\n");
|
||
|
+ numad_log(LOG_INFO, "Nodes: %d\n", num_nodes);
|
||
|
+ fprintf(log_fs, "Min CPUs free: %ld, Max CPUs: %ld, Avg CPUs: %ld, StdDev: %lg\n",
|
||
|
+ min_node_CPUs_free, max_node_CPUs_free, avg_node_CPUs_free, stddev_node_CPUs_free);
|
||
|
+ fprintf(log_fs, "Min MBs free: %ld, Max MBs: %ld, Avg MBs: %ld, StdDev: %lg\n",
|
||
|
+ min_node_MBs_free, max_node_MBs_free, avg_node_MBs_free, stddev_node_MBs_free);
|
||
|
for (int ix = 0; (ix < num_nodes); ix++) {
|
||
|
fprintf(log_fs, "Node %d: MBs_total %ld, MBs_free %6ld, CPUs_total %ld, CPUs_free %4ld, Distance: ",
|
||
|
ix, node[ix].MBs_total, node[ix].MBs_free, node[ix].CPUs_total, node[ix].CPUs_free);
|
||
|
@@ -1049,7 +1317,6 @@ void show_nodes() {
|
||
|
str_from_id_list(buf, BUF_SIZE, node[ix].cpu_list_p);
|
||
|
fprintf(log_fs, " CPUs: %s\n", buf);
|
||
|
}
|
||
|
- fprintf(log_fs, "\n");
|
||
|
fflush(log_fs);
|
||
|
}
|
||
|
|
||
|
@@ -1065,7 +1332,7 @@ int cur_cpu_data_buf = 0;
|
||
|
|
||
|
void update_cpu_data() {
|
||
|
// Parse idle percents from CPU stats in /proc/stat cpu<N> lines
|
||
|
- static FILE *fs = NULL;
|
||
|
+ static FILE *fs;
|
||
|
if (fs != NULL) {
|
||
|
rewind(fs);
|
||
|
} else {
|
||
|
@@ -1107,7 +1374,8 @@ void update_cpu_data() {
|
||
|
while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; } // skip nice
|
||
|
while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; } // skip system
|
||
|
while (!isdigit(*p)) { p++; }
|
||
|
- uint64_t idle = *p++ - '0'; while (isdigit(*p)) { idle *= 10; idle += (*p++ - '0'); }
|
||
|
+ uint64_t idle;
|
||
|
+ CONVERT_DIGITS_TO_NUM(p, idle);
|
||
|
cpu_data_buf[new].idle[cpu_id] = idle;
|
||
|
}
|
||
|
}
|
||
|
@@ -1129,10 +1397,6 @@ int node_and_digits(const struct dirent
|
||
|
}
|
||
|
|
||
|
|
||
|
-id_list_p all_cpus_list_p = NULL;
|
||
|
-id_list_p all_nodes_list_p = NULL;
|
||
|
-uint64_t node_info_time_stamp = 0;
|
||
|
-
|
||
|
|
||
|
int update_nodes() {
|
||
|
char fname[FNAME_SIZE];
|
||
|
@@ -1141,6 +1405,7 @@ int update_nodes() {
|
||
|
uint64_t time_stamp = get_time_stamp();
|
||
|
#define STATIC_NODE_INFO_DELAY (600 * ONE_HUNDRED)
|
||
|
if ((num_nodes == 0) || (node_info_time_stamp + STATIC_NODE_INFO_DELAY < time_stamp)) {
|
||
|
+ node_info_time_stamp = time_stamp;
|
||
|
// Count directory names of the form: /sys/devices/system/node/node<N>
|
||
|
struct dirent **namelist;
|
||
|
int num_files = scandir ("/sys/devices/system/node", &namelist, node_and_digits, NULL);
|
||
|
@@ -1167,8 +1432,15 @@ int update_nodes() {
|
||
|
}
|
||
|
num_nodes = num_files;
|
||
|
}
|
||
|
- CLEAR_LIST(all_cpus_list_p);
|
||
|
- CLEAR_LIST(all_nodes_list_p);
|
||
|
+ sum_CPUs_total = 0;
|
||
|
+ CLEAR_CPU_LIST(all_cpus_list_p);
|
||
|
+ CLEAR_NODE_LIST(all_nodes_list_p);
|
||
|
+ // Figure out how many threads per core there are (for later discounting of hyper-threads)
|
||
|
+ threads_per_core = count_set_bits_in_hex_list_file("/sys/devices/system/cpu/cpu0/topology/thread_siblings");
|
||
|
+ if (threads_per_core < 1) {
|
||
|
+ numad_log(LOG_CRIT, "Could not count threads per core\n");
|
||
|
+ exit(EXIT_FAILURE);
|
||
|
+ }
|
||
|
// For each "node<N>" filename present, save <N> in node[ix].node_id
|
||
|
// Note that the node id might not necessarily match the node ix.
|
||
|
// Also populate the cpu lists and distance vectors for this node.
|
||
|
@@ -1185,10 +1457,22 @@ int update_nodes() {
|
||
|
int fd = open(fname, O_RDONLY, 0);
|
||
|
if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) {
|
||
|
// get cpulist from the cpulist string
|
||
|
- CLEAR_LIST(node[node_ix].cpu_list_p);
|
||
|
+ CLEAR_CPU_LIST(node[node_ix].cpu_list_p);
|
||
|
int n = add_ids_to_list_from_str(node[node_ix].cpu_list_p, buf);
|
||
|
+ if (reserved_cpu_str != NULL) {
|
||
|
+ AND_LISTS(node[node_ix].cpu_list_p, node[node_ix].cpu_list_p, reserved_cpu_mask_list_p);
|
||
|
+ n = NUM_IDS_IN_LIST(node[node_ix].cpu_list_p);
|
||
|
+ }
|
||
|
OR_LISTS(all_cpus_list_p, all_cpus_list_p, node[node_ix].cpu_list_p);
|
||
|
- node[node_ix].CPUs_total = n * ONE_HUNDRED;
|
||
|
+ // Calculate total CPUs, but possibly discount hyper-threads
|
||
|
+ if ((threads_per_core == 1) || (htt_percent >= 100)) {
|
||
|
+ node[node_ix].CPUs_total = n * ONE_HUNDRED;
|
||
|
+ } else {
|
||
|
+ n /= threads_per_core;
|
||
|
+ node[node_ix].CPUs_total = n * ONE_HUNDRED;
|
||
|
+ node[node_ix].CPUs_total += n * (threads_per_core - 1) * htt_percent;
|
||
|
+ }
|
||
|
+ sum_CPUs_total += node[node_ix].CPUs_total;
|
||
|
close(fd);
|
||
|
} else {
|
||
|
numad_log(LOG_CRIT, "Could not get node cpu list\n");
|
||
|
@@ -1220,14 +1504,28 @@ int update_nodes() {
|
||
|
}
|
||
|
free(namelist);
|
||
|
}
|
||
|
- // Second, get the dynamic free memory and available CPU capacity
|
||
|
+ // Second, update the dynamic free memory and available CPU capacity
|
||
|
+ while (cpu_data_buf[cur_cpu_data_buf].time_stamp + 7 >= time_stamp) {
|
||
|
+ // Make sure at least 7/100 of a second has passed.
|
||
|
+ // Otherwise sleep for 1/10 second.
|
||
|
+ struct timespec ts = { 0, 100000000 };
|
||
|
+ nanosleep(&ts, &ts);
|
||
|
+ time_stamp = get_time_stamp();
|
||
|
+ }
|
||
|
update_cpu_data();
|
||
|
+ max_node_MBs_free = 0;
|
||
|
+ max_node_CPUs_free = 0;
|
||
|
+ min_node_MBs_free = MAXINT;
|
||
|
+ min_node_CPUs_free = MAXINT;
|
||
|
+ uint64_t sum_of_node_MBs_free = 0;
|
||
|
+ uint64_t sum_of_node_CPUs_free = 0;
|
||
|
for (int node_ix = 0; (node_ix < num_nodes); node_ix++) {
|
||
|
int node_id = node[node_ix].node_id;
|
||
|
// Get available memory info from node<N>/meminfo file
|
||
|
snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/meminfo", node_id);
|
||
|
int fd = open(fname, O_RDONLY, 0);
|
||
|
if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) {
|
||
|
+ close(fd);
|
||
|
uint64_t KB;
|
||
|
char *p = strstr(buf, "MemTotal:");
|
||
|
if (p != NULL) {
|
||
|
@@ -1238,7 +1536,7 @@ int update_nodes() {
|
||
|
}
|
||
|
while (!isdigit(*p)) { p++; }
|
||
|
CONVERT_DIGITS_TO_NUM(p, KB);
|
||
|
- node[node_ix].MBs_total = KB / KILOBYTE;
|
||
|
+ node[node_ix].MBs_total = (KB / KILOBYTE);
|
||
|
p = strstr(p, "MemFree:");
|
||
|
if (p != NULL) {
|
||
|
p += 8;
|
||
|
@@ -1248,8 +1546,27 @@ int update_nodes() {
|
||
|
}
|
||
|
while (!isdigit(*p)) { p++; }
|
||
|
CONVERT_DIGITS_TO_NUM(p, KB);
|
||
|
- node[node_ix].MBs_free = KB / KILOBYTE;
|
||
|
- close(fd);
|
||
|
+ node[node_ix].MBs_free = (KB / KILOBYTE);
|
||
|
+ if (use_inactive_file_cache) {
|
||
|
+ // Add inactive file cache quantity to "free" memory
|
||
|
+ p = strstr(p, "Inactive(file):");
|
||
|
+ if (p != NULL) {
|
||
|
+ p += 15;
|
||
|
+ } else {
|
||
|
+ numad_log(LOG_CRIT, "Could not get node Inactive(file)\n");
|
||
|
+ exit(EXIT_FAILURE);
|
||
|
+ }
|
||
|
+ while (!isdigit(*p)) { p++; }
|
||
|
+ CONVERT_DIGITS_TO_NUM(p, KB);
|
||
|
+ node[node_ix].MBs_free += (KB / KILOBYTE);
|
||
|
+ }
|
||
|
+ sum_of_node_MBs_free += node[node_ix].MBs_free;
|
||
|
+ if (min_node_MBs_free > node[node_ix].MBs_free) {
|
||
|
+ min_node_MBs_free = node[node_ix].MBs_free;
|
||
|
+ }
|
||
|
+ if (max_node_MBs_free < node[node_ix].MBs_free) {
|
||
|
+ max_node_MBs_free = node[node_ix].MBs_free;
|
||
|
+ }
|
||
|
} else {
|
||
|
numad_log(LOG_CRIT, "Could not get node meminfo\n");
|
||
|
exit(EXIT_FAILURE);
|
||
|
@@ -1260,7 +1577,8 @@ int update_nodes() {
|
||
|
if (cpu_data_buf[old_cpu_data_buf].time_stamp > 0) {
|
||
|
uint64_t idle_ticks = 0;
|
||
|
int cpu = 0;
|
||
|
- int num_cpus_to_process = node[node_ix].CPUs_total / ONE_HUNDRED;
|
||
|
+ int num_lcpus = NUM_IDS_IN_LIST(node[node_ix].cpu_list_p);
|
||
|
+ int num_cpus_to_process = num_lcpus;
|
||
|
while (num_cpus_to_process) {
|
||
|
if (ID_IS_IN_LIST(cpu, node[node_ix].cpu_list_p)) {
|
||
|
idle_ticks += cpu_data_buf[cur_cpu_data_buf].idle[cpu]
|
||
|
@@ -1274,15 +1592,45 @@ int update_nodes() {
|
||
|
// printf("Node: %d CPUs: %ld time diff %ld Idle ticks %ld\n", node_id, node[node_ix].CPUs_total, time_diff, idle_ticks);
|
||
|
// assert(time_diff > 0);
|
||
|
node[node_ix].CPUs_free = (idle_ticks * ONE_HUNDRED) / time_diff;
|
||
|
+ // Possibly discount hyper-threads
|
||
|
+ if ((threads_per_core > 1) && (htt_percent < 100)) {
|
||
|
+ uint64_t htt_discount = (num_lcpus - (num_lcpus / threads_per_core)) * (100 - htt_percent);
|
||
|
+ if (node[node_ix].CPUs_free > htt_discount) {
|
||
|
+ node[node_ix].CPUs_free -= htt_discount;
|
||
|
+ } else {
|
||
|
+ node[node_ix].CPUs_free = 0;
|
||
|
+ }
|
||
|
+ }
|
||
|
if (node[node_ix].CPUs_free > node[node_ix].CPUs_total) {
|
||
|
node[node_ix].CPUs_free = node[node_ix].CPUs_total;
|
||
|
}
|
||
|
+ sum_of_node_CPUs_free += node[node_ix].CPUs_free;
|
||
|
+ if (min_node_CPUs_free > node[node_ix].CPUs_free) {
|
||
|
+ min_node_CPUs_free = node[node_ix].CPUs_free;
|
||
|
+ }
|
||
|
+ if (max_node_CPUs_free < node[node_ix].CPUs_free) {
|
||
|
+ max_node_CPUs_free = node[node_ix].CPUs_free;
|
||
|
+ }
|
||
|
node[node_ix].magnitude = node[node_ix].CPUs_free * node[node_ix].MBs_free;
|
||
|
} else {
|
||
|
node[node_ix].CPUs_free = 0;
|
||
|
node[node_ix].magnitude = 0;
|
||
|
}
|
||
|
}
|
||
|
+ avg_node_MBs_free = sum_of_node_MBs_free / num_nodes;
|
||
|
+ avg_node_CPUs_free = sum_of_node_CPUs_free / num_nodes;
|
||
|
+ double MBs_variance_sum = 0.0;
|
||
|
+ double CPUs_variance_sum = 0.0;
|
||
|
+ for (int node_ix = 0; (node_ix < num_nodes); node_ix++) {
|
||
|
+ double MBs_diff = (double)node[node_ix].MBs_free - (double)avg_node_MBs_free;
|
||
|
+ double CPUs_diff = (double)node[node_ix].CPUs_free - (double)avg_node_CPUs_free;
|
||
|
+ MBs_variance_sum += MBs_diff * MBs_diff;
|
||
|
+ CPUs_variance_sum += CPUs_diff * CPUs_diff;
|
||
|
+ }
|
||
|
+ double MBs_variance = MBs_variance_sum / (num_nodes);
|
||
|
+ double CPUs_variance = CPUs_variance_sum / (num_nodes);
|
||
|
+ stddev_node_MBs_free = sqrt(MBs_variance);
|
||
|
+ stddev_node_CPUs_free = sqrt(CPUs_variance);
|
||
|
if (log_level >= LOG_INFO) {
|
||
|
show_nodes();
|
||
|
}
|
||
|
@@ -1316,7 +1664,7 @@ typedef struct stat_data {
|
||
|
int64_t num_threads; // 19
|
||
|
int64_t itrealvalue;
|
||
|
uint64_t starttime;
|
||
|
- uint64_t vsize;
|
||
|
+ uint64_t vsize; // 22
|
||
|
int64_t rss; // 23
|
||
|
uint64_t rsslim;
|
||
|
uint64_t startcode;
|
||
|
@@ -1361,10 +1709,11 @@ process_data_p get_stat_data_for_pid(int
|
||
|
return NULL;
|
||
|
}
|
||
|
close(fd);
|
||
|
+ uint64_t val;
|
||
|
char *p = buf;
|
||
|
static process_data_t data;
|
||
|
// Get PID from field 0
|
||
|
- uint64_t val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
|
||
|
+ CONVERT_DIGITS_TO_NUM(p, val);
|
||
|
data.pid = val;
|
||
|
// Copy comm from field 1
|
||
|
while (*p == ' ') { p++; }
|
||
|
@@ -1373,23 +1722,27 @@ process_data_p get_stat_data_for_pid(int
|
||
|
// Skip fields 2 through 12
|
||
|
for (int ix = 0; (ix < 11); ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
|
||
|
// Get utime from field 13 for cpu_util
|
||
|
- val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
|
||
|
+ CONVERT_DIGITS_TO_NUM(p, val);
|
||
|
data.cpu_util = val;
|
||
|
// Get stime from field 14 to add on to cpu_util (which already has utime)
|
||
|
while (*p == ' ') { p++; }
|
||
|
- val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
|
||
|
+ CONVERT_DIGITS_TO_NUM(p, val);
|
||
|
data.cpu_util += val;
|
||
|
// Skip fields 15 through 18
|
||
|
while (*p == ' ') { p++; }
|
||
|
for (int ix = 0; (ix < 4); ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
|
||
|
// Get num_threads from field 19
|
||
|
- val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
|
||
|
+ CONVERT_DIGITS_TO_NUM(p, val);
|
||
|
data.num_threads = val;
|
||
|
- // Skip fields 20 through 22
|
||
|
+ // Skip fields 20 through 21
|
||
|
while (*p == ' ') { p++; }
|
||
|
- for (int ix = 0; (ix < 3); ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
|
||
|
+ for (int ix = 0; (ix < 2); ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
|
||
|
+ // Get vsize from field 22 to compute MBs_size
|
||
|
+ CONVERT_DIGITS_TO_NUM(p, val);
|
||
|
+ data.MBs_size = val / MEGABYTE;
|
||
|
// Get rss from field 23 to compute MBs_used
|
||
|
- val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
|
||
|
+ while (*p == ' ') { p++; }
|
||
|
+ CONVERT_DIGITS_TO_NUM(p, val);
|
||
|
data.MBs_used = (val * page_size_in_bytes) / MEGABYTE;
|
||
|
// Return pointer to data
|
||
|
return &data;
|
||
|
@@ -1471,20 +1824,79 @@ int update_processes() {
|
||
|
}
|
||
|
|
||
|
|
||
|
+// Populate p->node_list_p from the "Mems_allowed_list:" line of
+// /proc/<pid>/status.
+//
+// Parameter p is a pointer to an element in the hash table.  A NULL p, or a
+// PID below 1, is treated as a fatal error.
+//
+// Returns the count reported by add_ids_to_list_from_str() for the parsed
+// list, or 0 if the status file could not be opened or read (the process is
+// assumed to have terminated).  Exits if the status text contains no
+// "Mems_allowed_list:" entry.
+int initialize_mem_node_list(process_data_p p) {
+    if ((!p) || (p->pid < 1)) {
+        numad_log(LOG_CRIT, "Cannot initialize mem node lists with bad PID\n");
+        exit(EXIT_FAILURE);
+    }
+    int n = 0;
+    char fname[FNAME_SIZE];
+    char buf[BIG_BUF_SIZE];
+    CLEAR_NODE_LIST(p->node_list_p);
+    snprintf(fname, FNAME_SIZE, "/proc/%d/status", p->pid);
+    int fd = open(fname, O_RDONLY, 0);
+    ssize_t bytes = -1;
+    if (fd >= 0) {
+        // Leave room for a terminator: strstr() below needs a C string and
+        // read() does not supply one.
+        bytes = read(fd, buf, BIG_BUF_SIZE - 1);
+        // Close on every path, including a failed or empty read.
+        close(fd);
+    }
+    if (bytes <= 0) {
+        numad_log(LOG_WARNING, "Tried to research PID %d, but it apparently went away.\n", p->pid);
+        return 0;  // Assume the process terminated
+    }
+    buf[bytes] = '\0';
+    char *list_str_p = strstr(buf, "Mems_allowed_list:");
+    if (!list_str_p) {
+        numad_log(LOG_CRIT, "Could not get node Mems_allowed_list\n");
+        exit(EXIT_FAILURE);
+    }
+    list_str_p += 18;  // length of "Mems_allowed_list:"
+    // Advance to the first digit, but never past the end of the text.
+    while ((*list_str_p != '\0') && (!isdigit((unsigned char)*list_str_p))) { list_str_p++; }
+    n = add_ids_to_list_from_str(p->node_list_p, list_str_p);
+    if (n < num_nodes) {
+        // If process already bound to a subset of nodes when we discover it,
+        // set initial bind_time_stamp to 30 minutes ago...
+        p->bind_time_stamp = get_time_stamp() - (1800 * ONE_HUNDRED);
+    }
+    return n;
+}
|
||
|
+
|
||
|
+
|
||
|
|
||
|
-id_list_p pick_numa_nodes(int pid, int cpus, int mbs) {
|
||
|
- char buf[BUF_SIZE];
|
||
|
- char buf2[BUF_SIZE];
|
||
|
+
|
||
|
+// Score a node for a request of `mbs` memory and `cpus` CPU, given the
+// node's free memory (MBs_free) and free CPU (CPUs_free).  Each resource is
+// split into the portion that goes toward the request and the surplus beyond
+// it; the requested portion is weighted more heavily than the surplus
+// (10 vs 3 for memory, 8 vs 1 for CPU).  The return value is the product of
+// the two weighted sums.  ix is used only to label the debug log line.
+uint64_t combined_value_of_weighted_resources(int ix, int mbs, int cpus, uint64_t MBs_free, uint64_t CPUs_free) {
+    // Start from the case where the node cannot cover the request: all of
+    // the free amount counts toward it and nothing is left over.
+    int64_t mem_usable = MBs_free;
+    int64_t mem_surplus = 0;
+    if (MBs_free > mbs) {
+        mem_usable = mbs;
+        mem_surplus = MBs_free - mbs;
+    }
+    int64_t cpu_usable = CPUs_free;
+    int64_t cpu_surplus = 0;
+    if (CPUs_free > cpus) {
+        cpu_usable = cpus;
+        cpu_surplus = CPUs_free - cpus;
+    }
+    // Weight each resource, then combine them as a product.
+    int64_t mem_weighted = (mem_usable * 10) + (mem_surplus * 3);
+    int64_t cpu_weighted = (cpu_usable * 8) + cpu_surplus;
+    numad_log(LOG_DEBUG, " Node[%d]: mem: %ld cpu: %ld\n", ix, mem_weighted, cpu_weighted);
+    return (mem_weighted * cpu_weighted);
+}
|
||
|
+
|
||
|
+
|
||
|
+id_list_p pick_numa_nodes(int pid, int cpus, int mbs, int assume_enough_cpus) {
|
||
|
if (log_level >= LOG_DEBUG) {
|
||
|
numad_log(LOG_DEBUG, "PICK NODES FOR: PID: %d, CPUs %d, MBs %d\n", pid, cpus, mbs);
|
||
|
}
|
||
|
- int num_existing_mems = 0;
|
||
|
- static id_list_p existing_mems_list_p;
|
||
|
- CLEAR_LIST(existing_mems_list_p);
|
||
|
- uint64_t time_stamp = get_time_stamp();
|
||
|
+ char buf[BUF_SIZE];
|
||
|
+ uint64_t process_CPUs = 0;
|
||
|
static node_data_p tmp_node;
|
||
|
static uint64_t *process_MBs;
|
||
|
- static uint64_t *saved_magnitude_for_node;
|
||
|
static int process_MBs_num_nodes;
|
||
|
// See if dynamic structures need to grow.
|
||
|
if (process_MBs_num_nodes < num_nodes + 1) {
|
||
|
@@ -1492,121 +1904,25 @@ id_list_p pick_numa_nodes(int pid, int c
|
||
|
// The "+1 node" is for accumulating interleaved memory
|
||
|
process_MBs = realloc(process_MBs, process_MBs_num_nodes * sizeof(uint64_t));
|
||
|
tmp_node = realloc(tmp_node, num_nodes * sizeof(node_data_t) );
|
||
|
- saved_magnitude_for_node = realloc(saved_magnitude_for_node, num_nodes * sizeof(uint64_t));
|
||
|
- if ((process_MBs == NULL) || (tmp_node == NULL) || (saved_magnitude_for_node == NULL)) {
|
||
|
+ if ((process_MBs == NULL) || (tmp_node == NULL)) {
|
||
|
numad_log(LOG_CRIT, "process_MBs realloc failed\n");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
}
|
||
|
+
|
||
|
// For existing processes, get miscellaneous process specific details
|
||
|
int pid_ix;
|
||
|
process_data_p p = NULL;
|
||
|
if ((pid > 0) && ((pid_ix = process_hash_lookup(pid)) >= 0)) {
|
||
|
p = &process_hash_table[pid_ix];
|
||
|
- // Quick rejection if this process has interleaved memory, but recheck it once an hour...
|
||
|
-#define MIN_DELAY_FOR_INTERLEAVE (3600 * ONE_HUNDRED)
|
||
|
- if (((p->flags & PROCESS_FLAG_INTERLEAVED) > 0)
|
||
|
- && (p->bind_time_stamp + MIN_DELAY_FOR_INTERLEAVE > time_stamp)) {
|
||
|
- if (log_level >= LOG_DEBUG) {
|
||
|
- numad_log(LOG_DEBUG, "Skipping evaluation because of interleaved memory.\n");
|
||
|
- }
|
||
|
- return NULL;
|
||
|
- }
|
||
|
- // Get cpuset name for this process, and existing mems binding, if any.
|
||
|
- char fname[FNAME_SIZE];
|
||
|
- snprintf(fname, FNAME_SIZE, "/proc/%d/cpuset", pid);
|
||
|
- FILE *fs = fopen(fname, "r");
|
||
|
- if (!fs) {
|
||
|
- numad_log(LOG_WARNING, "Tried to research PID %d cpuset, but it apparently went away.\n", p->pid);
|
||
|
- return NULL; // Assume the process terminated?
|
||
|
- }
|
||
|
- if (!fgets(buf, BUF_SIZE, fs)) {
|
||
|
- numad_log(LOG_WARNING, "Tried to research PID %d cpuset, but it apparently went away.\n", p->pid);
|
||
|
- fclose(fs);
|
||
|
- return NULL; // Assume the process terminated?
|
||
|
- }
|
||
|
- fclose(fs);
|
||
|
- ELIM_NEW_LINE(buf);
|
||
|
- if ((!p->cpuset_name) || (strcmp(p->cpuset_name, buf))) {
|
||
|
- if (p->cpuset_name != NULL) {
|
||
|
- free(p->cpuset_name);
|
||
|
- }
|
||
|
- p->cpuset_name = strdup(buf);
|
||
|
- }
|
||
|
- if (log_level >= LOG_DEBUG) {
|
||
|
- numad_log(LOG_DEBUG, "CPUSET_NAME: %s\n", p->cpuset_name);
|
||
|
- }
|
||
|
- snprintf(fname, FNAME_SIZE, "%s%s/cpuset.mems", cpuset_dir, p->cpuset_name);
|
||
|
- fs = fopen(fname, "r");
|
||
|
- if ((fs) && (fgets(buf, BUF_SIZE, fs))) {
|
||
|
- fclose(fs);
|
||
|
- num_existing_mems = add_ids_to_list_from_str(existing_mems_list_p, buf);
|
||
|
- if (log_level >= LOG_DEBUG) {
|
||
|
- str_from_id_list(buf, BUF_SIZE, existing_mems_list_p);
|
||
|
- numad_log(LOG_DEBUG, "EXISTING CPUSET NODE LIST: %s\n", buf);
|
||
|
- }
|
||
|
- }
|
||
|
- // If this process was just recently bound, enforce a minimum delay
|
||
|
- // period between repeated attempts to potentially move the memory.
|
||
|
- // FIXME: ?? might this retard appropriate process expansion too much?
|
||
|
-#define MIN_DELAY_FOR_REEVALUATION (30 * ONE_HUNDRED)
|
||
|
- if (p->bind_time_stamp + MIN_DELAY_FOR_REEVALUATION > time_stamp) {
|
||
|
- // Skip re-evaluation because we just did it recently.
|
||
|
- if (log_level >= LOG_DEBUG) {
|
||
|
- numad_log(LOG_DEBUG, "Skipping evaluation because done too recently.\n");
|
||
|
- }
|
||
|
- return NULL;
|
||
|
- }
|
||
|
- // Look for short cut because of duplicate bindings. If we have bound
|
||
|
- // this process to the same nodes multiple times already, and the load
|
||
|
- // on those nodes still seems acceptable, skip the rest of this and
|
||
|
- // just return NULL to indicate no change needed. FIXME: should figure
|
||
|
- // out what can change that would make a rebinding desirable (e.g. (1)
|
||
|
- // some process gets sub-optimal allocation on busy machine which
|
||
|
- // subsequently becomes less busy leaving disadvantaged process. (2)
|
||
|
- // node load imbalance, (3) any process split across nodes which should
|
||
|
- // fit within a single node.) For now, just expire the dup_bid_count
|
||
|
- // occasionally, which is a reasonably good mitigation.
|
||
|
- // So, check to see if we should decay the dup_bind_count...
|
||
|
-#define DUP_BIND_TIME_OUT (300 * ONE_HUNDRED)
|
||
|
- if ((p->dup_bind_count > 0) && (p->bind_time_stamp + DUP_BIND_TIME_OUT < time_stamp)) {
|
||
|
- p->dup_bind_count -= 1;
|
||
|
- }
|
||
|
- // Now, look for short cut because of duplicate bindings
|
||
|
- if (p->dup_bind_count > 0) {
|
||
|
- int node_id = 0;
|
||
|
- int nodes_have_cpu = 1;
|
||
|
- int nodes_have_ram = 1;
|
||
|
- int n = num_existing_mems;
|
||
|
- int min_resource_pct = 100 - target_utilization;
|
||
|
- if (min_resource_pct < 5) {
|
||
|
- min_resource_pct = 5;
|
||
|
- }
|
||
|
- while (n) {
|
||
|
- if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
|
||
|
- nodes_have_cpu &= ((100 * node[node_id].CPUs_free / node[node_id].CPUs_total) >= (min_resource_pct));
|
||
|
- nodes_have_ram &= ((100 * node[node_id].MBs_free / node[node_id].MBs_total) >= (min_resource_pct));
|
||
|
- n -= 1;
|
||
|
- }
|
||
|
- node_id += 1;
|
||
|
- }
|
||
|
- if ((nodes_have_cpu) && (nodes_have_ram)) {
|
||
|
- if (log_level >= LOG_DEBUG) {
|
||
|
- numad_log(LOG_DEBUG, "Skipping evaluation because of repeat binding\n");
|
||
|
- }
|
||
|
- return NULL;
|
||
|
- }
|
||
|
- if (log_level >= LOG_DEBUG) {
|
||
|
- numad_log(LOG_DEBUG, "Evaluated for skipping by repeat binding, but CPUS: %d, RAM: %d\n", nodes_have_cpu, nodes_have_ram);
|
||
|
- }
|
||
|
- }
|
||
|
- // Fourth, add up per-node memory in use by this process. This scanning
|
||
|
- // is expensive and should be minimized. Also, old kernels dismantle
|
||
|
- // transparent huge pages while producing the numa_maps memory
|
||
|
- // information!
|
||
|
+ // Correct current CPUs amount for utilization factor inflation
|
||
|
+ process_CPUs = (cpus * target_utilization) / 100;
|
||
|
+ // Add up per-node memory in use by this process.
|
||
|
+ // This scanning is expensive and should be minimized.
|
||
|
memset(process_MBs, 0, process_MBs_num_nodes * sizeof(uint64_t));
|
||
|
+ char fname[FNAME_SIZE];
|
||
|
snprintf(fname, FNAME_SIZE, "/proc/%d/numa_maps", pid);
|
||
|
- fs = fopen(fname, "r");
|
||
|
+ FILE *fs = fopen(fname, "r");
|
||
|
if (!fs) {
|
||
|
numad_log(LOG_WARNING, "Tried to research PID %d numamaps, but it apparently went away.\n", p->pid);
|
||
|
return NULL; // Assume the process terminated
|
||
|
@@ -1645,84 +1961,103 @@ id_list_p pick_numa_nodes(int pid, int c
|
||
|
fclose(fs);
|
||
|
for (int ix = 0; (ix <= num_nodes); ix++) {
|
||
|
process_MBs[ix] /= MEGABYTE;
|
||
|
- if (log_level >= LOG_DEBUG) {
|
||
|
- numad_log(LOG_DEBUG, "PROCESS_MBs[%d]: %ld\n", ix, process_MBs[ix]);
|
||
|
+ if (p->bind_time_stamp) {
|
||
|
+ if ((process_MBs[ix]) && (!ID_IS_IN_LIST(ix, p->node_list_p))) {
|
||
|
+ // FIXME: If process previously bound, but memory appears
|
||
|
+ // to exist where it should not, this might identify
|
||
|
+ // processes for which the kernel does not move all the
|
||
|
+ // memory for whatever reason.... Must check for
|
||
|
+ // significant amount before doing anything about it,
|
||
|
+ // however, since memory for libraries, etc, can get moved
|
||
|
+ // around.
|
||
|
+ }
|
||
|
+ } else {
|
||
|
+ // If process has not yet been bound, set node list to existing nodes with memory
|
||
|
+ if (process_MBs[ix]) {
|
||
|
+ ADD_ID_TO_LIST(ix, p->node_list_p);
|
||
|
+ } else {
|
||
|
+ CLR_ID_IN_LIST(ix, p->node_list_p);
|
||
|
+ }
|
||
|
+ }
|
||
|
+ if ((log_level >= LOG_DEBUG) && (process_MBs[ix] > 0)) {
|
||
|
+ if (ix == num_nodes) {
|
||
|
+ numad_log(LOG_DEBUG, "Interleaved MBs: %ld\n", ix, process_MBs[ix]);
|
||
|
+ } else {
|
||
|
+ numad_log(LOG_DEBUG, "PROCESS_MBs[%d]: %ld\n", ix, process_MBs[ix]);
|
||
|
+ }
|
||
|
}
|
||
|
}
|
||
|
if ((process_has_interleaved_memory) && (keep_interleaved_memory)) {
|
||
|
// Mark this process as having interleaved memory so we do not
|
||
|
- // merge the interleaved memory. Time stamp it as done.
|
||
|
+ // merge the interleaved memory. Time stamp it as done and return.
|
||
|
p->flags |= PROCESS_FLAG_INTERLEAVED;
|
||
|
p->bind_time_stamp = get_time_stamp();
|
||
|
if (log_level >= LOG_DEBUG) {
|
||
|
- numad_log(LOG_DEBUG, "Skipping evaluation because of interleaved memory.\n");
|
||
|
+ numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because of interleaved memory.\n", p->pid);
|
||
|
}
|
||
|
return NULL;
|
||
|
}
|
||
|
} // end of existing PID conditional
|
||
|
+
|
||
|
// Make a copy of node available resources array. Add in info specific to
|
||
|
// this process to equalize available resource quantities wrt locations of
|
||
|
- // resources already in use by this process. Inflate the value of already
|
||
|
- // assigned memory by approximately 3/2, because moving memory is
|
||
|
- // expensive. Average the amount of CPUs_free across the existing nodes
|
||
|
- // used, because the threads are free to move around in that domain. After
|
||
|
- // calculating combined magnitude of available resources, bias the values
|
||
|
- // towards existing locations for this process.
|
||
|
- int target_using_all_nodes = 0;
|
||
|
- uint64_t node_CPUs_free_for_this_process = 0;
|
||
|
+ // resources already in use by this process. After calculating weighted
|
||
|
+ // magnitude of available resources, bias the values towards existing
|
||
|
+ // locations for this process.
|
||
|
memcpy(tmp_node, node, num_nodes * sizeof(node_data_t) );
|
||
|
- if (num_existing_mems > 0) {
|
||
|
- node_CPUs_free_for_this_process = cpus; // ?? Correct for utilization target inflation?
|
||
|
- int node_id = 0;
|
||
|
- int n = num_existing_mems;
|
||
|
- while (n) {
|
||
|
- if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
|
||
|
- node_CPUs_free_for_this_process += tmp_node[node_id].CPUs_free;
|
||
|
- n -= 1;
|
||
|
- }
|
||
|
- node_id += 1;
|
||
|
- }
|
||
|
- // Divide to get average CPUs_free for the nodes in use by process
|
||
|
- node_CPUs_free_for_this_process /= num_existing_mems;
|
||
|
- }
|
||
|
for (int ix = 0; (ix < num_nodes); ix++) {
|
||
|
- if (pid > 0) {
|
||
|
- tmp_node[ix].MBs_free += ((process_MBs[ix] * 12) / 8);
|
||
|
- }
|
||
|
- if ((num_existing_mems > 0) && (ID_IS_IN_LIST(ix, existing_mems_list_p))) {
|
||
|
- tmp_node[ix].CPUs_free = node_CPUs_free_for_this_process;
|
||
|
+ // Add back (biased) memory already used by this process on this node
|
||
|
+ tmp_node[ix].MBs_free += ((process_MBs[ix] * 8) / 8); // FIXME: apply bias here?
|
||
|
+ if (tmp_node[ix].MBs_free > tmp_node[ix].MBs_total) {
|
||
|
+ tmp_node[ix].MBs_free = tmp_node[ix].MBs_total;
|
||
|
+ }
|
||
|
+ // Add back CPU in proportion to amount of memory already used on this
|
||
|
+ // node Making assumption here that CPU execution threads are actually
|
||
|
+ // running on the same nodes where memory is assigned... FIXME: should
|
||
|
+ // we perhaps do this only if process already explicitly bound?
|
||
|
+ uint64_t prorated_CPU = (process_CPUs * process_MBs[ix]) / mbs;
|
||
|
+ if ((log_level >= LOG_DEBUG) && (prorated_CPU > 0)) {
|
||
|
+ numad_log(LOG_DEBUG, "PROCESS_CPUs[%d]: %ld\n", ix, prorated_CPU);
|
||
|
}
|
||
|
+ tmp_node[ix].CPUs_free += prorated_CPU;
|
||
|
if (tmp_node[ix].CPUs_free > tmp_node[ix].CPUs_total) {
|
||
|
tmp_node[ix].CPUs_free = tmp_node[ix].CPUs_total;
|
||
|
}
|
||
|
- if (log_level >= LOG_DEBUG) {
|
||
|
- numad_log(LOG_DEBUG, "PROCESS_CPUs[%d]: %ld\n", ix, tmp_node[ix].CPUs_free);
|
||
|
+ if (tmp_node[ix].CPUs_free < 1) {
|
||
|
+ // enforce 1/100th CPU minimum
|
||
|
+ tmp_node[ix].CPUs_free = 1;
|
||
|
}
|
||
|
- // Calculate magnitude as product of available CPUs and available MBs
|
||
|
- tmp_node[ix].magnitude = tmp_node[ix].CPUs_free * tmp_node[ix].MBs_free;
|
||
|
+ // numad_log(LOG_DEBUG, "Raw Node[%d]: mem: %ld cpu: %ld\n", ix, tmp_node[ix].MBs_free, tmp_node[ix].CPUs_free);
|
||
|
+ tmp_node[ix].magnitude = combined_value_of_weighted_resources(ix, mbs, cpus, tmp_node[ix].MBs_free, tmp_node[ix].CPUs_free);
|
||
|
// Bias combined magnitude towards already assigned nodes
|
||
|
- if (ID_IS_IN_LIST(ix, existing_mems_list_p)) {
|
||
|
- tmp_node[ix].magnitude *= 9;
|
||
|
- tmp_node[ix].magnitude /= 8;
|
||
|
+ if ((pid > 0) && (ID_IS_IN_LIST(ix, p->node_list_p))) {
|
||
|
+ tmp_node[ix].magnitude *= 17;
|
||
|
+ tmp_node[ix].magnitude /= 16;
|
||
|
}
|
||
|
- // Save the current magnitudes
|
||
|
- saved_magnitude_for_node[ix] = tmp_node[ix].magnitude;
|
||
|
}
|
||
|
- // OK, figure out where to get resources for this request.
|
||
|
- static id_list_p target_node_list_p;
|
||
|
- CLEAR_LIST(target_node_list_p);
|
||
|
+
|
||
|
+ // Figure out where to get resources for this request.
|
||
|
int prev_node_used = -1;
|
||
|
- // Continue to allocate more resources until request are met.
|
||
|
- // OK if not not quite all the CPU request is met.
|
||
|
- // FIXME: ?? Is half of the utilization margin a good amount of CPU flexing?
|
||
|
- int cpu_flex = ((100 - target_utilization) * tmp_node[0].CPUs_total) / 200;
|
||
|
- if (pid <= 0) {
|
||
|
- // If trying to find resources for pre-placement advice request, do not
|
||
|
- // underestimate the amount of CPUs needed. Instead, err on the side
|
||
|
- // of providing too many resources. So, no flexing here...
|
||
|
- cpu_flex = 0;
|
||
|
+ static id_list_p target_node_list_p;
|
||
|
+ CLEAR_NODE_LIST(target_node_list_p);
|
||
|
+ // Establish a CPU flex fudge factor, on the presumption it is OK if not
|
||
|
+ // quite all the CPU request is met. However, if trying to find resources
|
||
|
+ // for pre-placement advice request, do not underestimate the amount of
|
||
|
+ // CPUs needed. Instead, err on the side of providing too many resources.
|
||
|
+ int cpu_flex = 0;
|
||
|
+ if ((pid > 0) && (target_utilization < 100)) {
|
||
|
+ // FIXME: Is half of the utilization margin a good amount of CPU flexing?
|
||
|
+ cpu_flex = ((100 - target_utilization) * tmp_node[0].CPUs_total) / 200;
|
||
|
+ }
|
||
|
+ // Figure out minimum number of nodes required
|
||
|
+ int mem_req_nodes = ceil((double)mbs / (double)node[0].MBs_total);
|
||
|
+ int cpu_req_nodes = ceil((double)(cpus - cpu_flex) / (double)node[0].CPUs_total);
|
||
|
+ int min_req_nodes = mem_req_nodes;
|
||
|
+ if (min_req_nodes < cpu_req_nodes) {
|
||
|
+ min_req_nodes = cpu_req_nodes;
|
||
|
}
|
||
|
- while ((mbs > 0) || (cpus > cpu_flex)) {
|
||
|
+    // Continue to allocate more resources until requests are met.
|
||
|
+ while ((min_req_nodes > 0) || (mbs > 0) || ((cpus > cpu_flex) && (!assume_enough_cpus))) {
|
||
|
if (log_level >= LOG_DEBUG) {
|
||
|
numad_log(LOG_DEBUG, "MBs: %d, CPUs: %d\n", mbs, cpus);
|
||
|
}
|
||
|
@@ -1760,22 +2095,18 @@ id_list_p pick_numa_nodes(int pid, int c
|
||
|
// last one we used. This is not going to make progress... So
|
||
|
// just punt and use everything.
|
||
|
OR_LISTS(target_node_list_p, target_node_list_p, all_nodes_list_p);
|
||
|
- target_using_all_nodes = 1;
|
||
|
break;
|
||
|
}
|
||
|
prev_node_used = tmp_node[0].node_id;
|
||
|
ADD_ID_TO_LIST(tmp_node[0].node_id, target_node_list_p);
|
||
|
- if (log_level >= LOG_DEBUG) {
|
||
|
- str_from_id_list(buf, BUF_SIZE, existing_mems_list_p);
|
||
|
- str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
|
||
|
- numad_log(LOG_DEBUG, "Existing nodes: %s Target nodes: %s\n", buf, buf2);
|
||
|
- }
|
||
|
+ min_req_nodes -= 1;
|
||
|
if (EQUAL_LISTS(target_node_list_p, all_nodes_list_p)) {
|
||
|
// Apparently we must use all resource nodes...
|
||
|
- target_using_all_nodes = 1;
|
||
|
break;
|
||
|
}
|
||
|
-#define MBS_MARGIN 10
|
||
|
+ // "Consume" the resources on this node
|
||
|
+#define CPUS_MARGIN 0
|
||
|
+#define MBS_MARGIN 100
|
||
|
if (tmp_node[0].MBs_free >= (mbs + MBS_MARGIN)) {
|
||
|
tmp_node[0].MBs_free -= mbs;
|
||
|
mbs = 0;
|
||
|
@@ -1783,7 +2114,6 @@ id_list_p pick_numa_nodes(int pid, int c
|
||
|
mbs -= (tmp_node[0].MBs_free - MBS_MARGIN);
|
||
|
tmp_node[0].MBs_free = MBS_MARGIN;
|
||
|
}
|
||
|
-#define CPUS_MARGIN 0
|
||
|
if (tmp_node[0].CPUs_free >= (cpus + CPUS_MARGIN)) {
|
||
|
tmp_node[0].CPUs_free -= cpus;
|
||
|
cpus = 0;
|
||
|
@@ -1791,126 +2121,52 @@ id_list_p pick_numa_nodes(int pid, int c
|
||
|
cpus -= (tmp_node[0].CPUs_free - CPUS_MARGIN);
|
||
|
tmp_node[0].CPUs_free = CPUS_MARGIN;
|
||
|
}
|
||
|
- tmp_node[0].magnitude = tmp_node[0].CPUs_free * tmp_node[0].MBs_free;
|
||
|
+ tmp_node[0].magnitude = combined_value_of_weighted_resources(0, mbs, cpus, tmp_node[0].MBs_free, tmp_node[0].CPUs_free);
|
||
|
}
|
||
|
- // If this existing process is already located where we want it, and almost
|
||
|
- // all memory is already moved to those nodes, then return NULL indicating
|
||
|
- // no need to change binding this time.
|
||
|
- if ((pid > 0) && (EQUAL_LISTS(target_node_list_p, existing_mems_list_p))) {
|
||
|
- // May not need to change binding. However, if there is any significant
|
||
|
- // memory still on non-target nodes, advise the bind anyway because
|
||
|
- // there are some scenarios when the kernel will not move it all the
|
||
|
- // first time.
|
||
|
- if (!target_using_all_nodes) {
|
||
|
- p->dup_bind_count += 1;
|
||
|
- for (int ix = 0; (ix < num_nodes); ix++) {
|
||
|
- if ((process_MBs[ix] > 10) && (!ID_IS_IN_LIST(ix, target_node_list_p))) {
|
||
|
- goto try_memory_move_again;
|
||
|
- }
|
||
|
- }
|
||
|
- // We will accept these memory locations. Stamp it as done.
|
||
|
- p->bind_time_stamp = get_time_stamp();
|
||
|
- }
|
||
|
- // Skip rebinding either because practically all memory is in the
|
||
|
- // target nodes, or because we are stuck using all the nodes.
|
||
|
+
|
||
|
+ // If this existing process is already located where we want it, then just
|
||
|
+ // return NULL indicating no need to change binding this time.
|
||
|
+ if ((pid > 0) && (p->bind_time_stamp) && (EQUAL_LISTS(target_node_list_p, p->node_list_p))) {
|
||
|
if (log_level >= LOG_DEBUG) {
|
||
|
- numad_log(LOG_DEBUG, "Skipping evaluation because memory is reasonably situated.\n");
|
||
|
+ numad_log(LOG_DEBUG, "Process %d already bound to target nodes.\n", p->pid);
|
||
|
}
|
||
|
+ p->bind_time_stamp = get_time_stamp();
|
||
|
return NULL;
|
||
|
- } else {
|
||
|
- // Either a non-existing process, or a new binding for an existing process.
|
||
|
- if (p != NULL) {
|
||
|
- // Must be a new binding for an existing process, so reset dup_bind_count.
|
||
|
- p->dup_bind_count = 0;
|
||
|
- }
|
||
|
- }
|
||
|
- // See if this proposed move will make a significant difference.
|
||
|
- // If not, return null instead of advising the move.
|
||
|
- uint64_t target_magnitude = 0;
|
||
|
- uint64_t existing_magnitude = 0;
|
||
|
- int num_target_nodes = NUM_IDS_IN_LIST(target_node_list_p);
|
||
|
- int num_existing_nodes = NUM_IDS_IN_LIST(existing_mems_list_p);
|
||
|
- /* FIXME: this expansion seems to cause excessive growth
|
||
|
- * So calculate the improvement before hastily expanding nodes.
|
||
|
- if (num_target_nodes > num_existing_nodes) { goto try_memory_move_again; }
|
||
|
- */
|
||
|
- int node_id = 0;
|
||
|
- int n = num_existing_nodes + num_target_nodes;
|
||
|
- while (n) {
|
||
|
- if (ID_IS_IN_LIST(node_id, target_node_list_p)) {
|
||
|
- target_magnitude += saved_magnitude_for_node[node_id];
|
||
|
- n -= 1;
|
||
|
- }
|
||
|
- if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
|
||
|
- existing_magnitude += saved_magnitude_for_node[node_id];
|
||
|
- n -= 1;
|
||
|
- }
|
||
|
- node_id += 1;
|
||
|
- }
|
||
|
- if (existing_magnitude > 0) {
|
||
|
- uint64_t magnitude_change = ((target_magnitude - existing_magnitude) * 100) / existing_magnitude;
|
||
|
- if (magnitude_change < 0) {
|
||
|
- magnitude_change = -(magnitude_change);
|
||
|
- }
|
||
|
- if (magnitude_change <= IMPROVEMENT_THRESHOLD_PERCENT) {
|
||
|
- // Not significant enough percentage change to do rebind
|
||
|
- if (log_level >= LOG_DEBUG) {
|
||
|
- str_from_id_list(buf, BUF_SIZE, existing_mems_list_p);
|
||
|
- str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
|
||
|
- numad_log(LOG_DEBUG, "Moving pid %d from nodes (%s) to nodes (%s) skipped as insignificant improvement: %ld percent.\n",
|
||
|
- pid, buf, buf2, magnitude_change);
|
||
|
- }
|
||
|
- // We decided this is almost good enough. Stamp it as done.
|
||
|
- p->bind_time_stamp = get_time_stamp();
|
||
|
- return NULL;
|
||
|
- }
|
||
|
}
|
||
|
- if ((pid <= 0) && (num_target_nodes <= 0)) {
|
||
|
- // Always provide at least one node for pre-placement advice
|
||
|
+ // Must always provide at least one node for pre-placement advice
|
||
|
+ // FIXME: verify this can happen only if no resources requested...
|
||
|
+ if ((pid <= 0) && (NUM_IDS_IN_LIST(target_node_list_p) <= 0)) {
|
||
|
ADD_ID_TO_LIST(node[0].node_id, target_node_list_p);
|
||
|
}
|
||
|
-try_memory_move_again:
|
||
|
- str_from_id_list(buf, BUF_SIZE, existing_mems_list_p);
|
||
|
+ // Log advice, and return target node list
|
||
|
+ if ((pid > 0) && (p->bind_time_stamp)) {
|
||
|
+ str_from_id_list(buf, BUF_SIZE, p->node_list_p);
|
||
|
+ } else {
|
||
|
+ str_from_id_list(buf, BUF_SIZE, all_nodes_list_p);
|
||
|
+ }
|
||
|
+ char buf2[BUF_SIZE];
|
||
|
str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
|
||
|
char *cmd_name = "(unknown)";
|
||
|
if ((p) && (p->comm)) {
|
||
|
cmd_name = p->comm;
|
||
|
}
|
||
|
numad_log(LOG_NOTICE, "Advising pid %d %s move from nodes (%s) to nodes (%s)\n", pid, cmd_name, buf, buf2);
|
||
|
+
|
||
|
+ if (pid > 0) {
|
||
|
+ // FIXME: Consider moving this out to caller??
|
||
|
+ COPY_LIST(target_node_list_p, p->node_list_p);
|
||
|
+ }
|
||
|
return target_node_list_p;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
-void show_processes(process_data_p *ptr, int nprocs) {
|
||
|
- time_t ts = time(NULL);
|
||
|
- fprintf(log_fs, "%s", ctime(&ts));
|
||
|
- fprintf(log_fs, "Candidates: %d\n", nprocs);
|
||
|
- for (int ix = 0; (ix < nprocs); ix++) {
|
||
|
- process_data_p p = ptr[ix];
|
||
|
- char buf[BUF_SIZE];
|
||
|
- snprintf(buf, BUF_SIZE, "%s%s/cpuset.mems", cpuset_dir, p->cpuset_name);
|
||
|
- FILE *fs = fopen(buf, "r");
|
||
|
- buf[0] = '\0';
|
||
|
- if (fs) {
|
||
|
- if (fgets(buf, BUF_SIZE, fs)) {
|
||
|
- ELIM_NEW_LINE(buf);
|
||
|
- }
|
||
|
- fclose(fs);
|
||
|
- }
|
||
|
- fprintf(log_fs, "%ld: PID %d: %s, Threads %2ld, MBs_used %6ld, CPUs_used %4ld, Magnitude %6ld, Nodes: %s\n",
|
||
|
- p->data_time_stamp, p->pid, p->comm, p->num_threads, p->MBs_used, p->CPUs_used, p->MBs_used * p->CPUs_used, buf);
|
||
|
- }
|
||
|
- fprintf(log_fs, "\n");
|
||
|
- fflush(log_fs);
|
||
|
-}
|
||
|
-
|
||
|
-
|
||
|
|
||
|
int manage_loads() {
|
||
|
+ uint64_t time_stamp = get_time_stamp();
|
||
|
// Use temporary index to access and sort hash table entries
|
||
|
- static process_data_p *pindex;
|
||
|
static int pindex_size;
|
||
|
+ static process_data_p *pindex;
|
||
|
if (pindex_size < process_hash_table_size) {
|
||
|
pindex_size = process_hash_table_size;
|
||
|
pindex = realloc(pindex, pindex_size * sizeof(process_data_p));
|
||
|
@@ -1923,34 +2179,69 @@ int manage_loads() {
|
||
|
return min_interval / 2;
|
||
|
}
|
||
|
memset(pindex, 0, pindex_size * sizeof(process_data_p));
|
||
|
- // Copy live candidate pointers to the index for sorting, etc
|
||
|
+ // Copy live candidate pointers to the index for sorting
|
||
|
+ // if they meet the threshold for memory usage and CPU usage.
|
||
|
int nprocs = 0;
|
||
|
+ long sum_CPUs_used = 0;
|
||
|
for (int ix = 0; (ix < process_hash_table_size); ix++) {
|
||
|
process_data_p p = &process_hash_table[ix];
|
||
|
- if (p->pid) {
|
||
|
+ if ((p->pid) && (p->CPUs_used * p->MBs_used > CPU_THRESHOLD * MEMORY_THRESHOLD)) {
|
||
|
pindex[nprocs++] = p;
|
||
|
+ sum_CPUs_used += p->CPUs_used;
|
||
|
+ // Initialize node list, if not already done for this process.
|
||
|
+ if (p->node_list_p == NULL) {
|
||
|
+ initialize_mem_node_list(p);
|
||
|
+ }
|
||
|
}
|
||
|
}
|
||
|
- // Sort index by amount of CPU used * amount of memory used. Not expecting
|
||
|
- // a long list here. Use a simple sort -- however, sort into bins,
|
||
|
- // treating values within 10% as aquivalent. Within bins, order by
|
||
|
- // bind_time_stamp so oldest bound will be higher priority to evaluate.
|
||
|
+ // Order candidate considerations using timestamps and magnitude: amount of
|
||
|
+ // CPU used * amount of memory used. Not expecting a long list here. Use
|
||
|
+ // a simplistic sort -- however move all not yet bound to front of list and
|
||
|
+ // order by decreasing magnitude. Previously bound processes follow in
|
||
|
+    // bins of increasing magnitude treating values within 20% as equivalent.
|
||
|
+ // Within bins, order by bind_time_stamp so oldest bound will be higher
|
||
|
+ // priority to evaluate. Start by moving all unbound to beginning.
|
||
|
+ int num_unbound = 0;
|
||
|
for (int ij = 0; (ij < nprocs); ij++) {
|
||
|
+ if (pindex[ij]->bind_time_stamp == 0) {
|
||
|
+ process_data_p tmp = pindex[num_unbound];
|
||
|
+ pindex[num_unbound++] = pindex[ij];
|
||
|
+ pindex[ij] = tmp;
|
||
|
+ }
|
||
|
+ }
|
||
|
+ // Sort all unbound so biggest magnitude comes first
|
||
|
+ for (int ij = 0; (ij < num_unbound); ij++) {
|
||
|
+ int best = ij;
|
||
|
+ for (int ik = ij + 1; (ik < num_unbound); ik++) {
|
||
|
+ uint64_t ik_mag = (pindex[ ik]->CPUs_used * pindex[ ik]->MBs_size);
|
||
|
+ uint64_t best_mag = (pindex[best]->CPUs_used * pindex[best]->MBs_size);
|
||
|
+ if (ik_mag <= best_mag) continue;
|
||
|
+ best = ik;
|
||
|
+ }
|
||
|
+ if (best != ij) {
|
||
|
+ process_data_p tmp = pindex[ij];
|
||
|
+ pindex[ij] = pindex[best];
|
||
|
+ pindex[best] = tmp;
|
||
|
+ }
|
||
|
+ }
|
||
|
+    // Sort the remaining candidates into bins of increasing magnitude, and by
|
||
|
+ // timestamp within bins.
|
||
|
+ for (int ij = num_unbound; (ij < nprocs); ij++) {
|
||
|
int best = ij;
|
||
|
for (int ik = ij + 1; (ik < nprocs); ik++) {
|
||
|
- uint64_t ik_mag = (pindex[ ik]->CPUs_used * pindex[ ik]->MBs_used);
|
||
|
- uint64_t best_mag = (pindex[best]->CPUs_used * pindex[best]->MBs_used);
|
||
|
+ uint64_t ik_mag = (pindex[ ik]->CPUs_used * pindex[ ik]->MBs_size);
|
||
|
+ uint64_t best_mag = (pindex[best]->CPUs_used * pindex[best]->MBs_size);
|
||
|
uint64_t min_mag = ik_mag;
|
||
|
uint64_t diff_mag = best_mag - ik_mag;
|
||
|
if (diff_mag < 0) {
|
||
|
diff_mag = -(diff_mag);
|
||
|
min_mag = best_mag;
|
||
|
}
|
||
|
- if ((diff_mag > 0) && (min_mag / diff_mag < 10)) {
|
||
|
- // difference > 10 percent. Use strict ordering
|
||
|
- if (ik_mag <= best_mag) continue;
|
||
|
+ if ((diff_mag > 0) && (min_mag / diff_mag < 5)) {
|
||
|
+ // difference > 20 percent. Use magnitude ordering
|
||
|
+ if (ik_mag >= best_mag) continue;
|
||
|
} else {
|
||
|
- // difference within 10 percent. Sort these by bind_time_stamp.
|
||
|
+ // difference within 20 percent. Sort these by bind_time_stamp.
|
||
|
if (pindex[ik]->bind_time_stamp > pindex[best]->bind_time_stamp) continue;
|
||
|
}
|
||
|
best = ik;
|
||
|
@@ -1961,23 +2252,69 @@ int manage_loads() {
|
||
|
pindex[best] = tmp;
|
||
|
}
|
||
|
}
|
||
|
+ // Show the candidate processes in the log file
|
||
|
if ((log_level >= LOG_INFO) && (nprocs > 0)) {
|
||
|
- show_processes(pindex, nprocs);
|
||
|
+ numad_log(LOG_INFO, "Candidates: %d\n", nprocs);
|
||
|
+ for (int ix = 0; (ix < nprocs); ix++) {
|
||
|
+ process_data_p p = pindex[ix];
|
||
|
+ char buf[BUF_SIZE];
|
||
|
+ str_from_id_list(buf, BUF_SIZE, p->node_list_p);
|
||
|
+ fprintf(log_fs, "%ld: PID %d: %s, Threads %2ld, MBs_size %6ld, MBs_used %6ld, CPUs_used %4ld, Magnitude %6ld, Nodes: %s\n",
|
||
|
+ p->data_time_stamp, p->pid, p->comm, p->num_threads, p->MBs_size, p->MBs_used, p->CPUs_used, p->MBs_used * p->CPUs_used, buf);
|
||
|
+ }
|
||
|
+ fflush(log_fs);
|
||
|
}
|
||
|
- // Estimate desired size and make resource requests for each significant process
|
||
|
+ // Estimate desired size (+ margin capacity) and
|
||
|
+ // make resource requests for each candidate process
|
||
|
for (int ix = 0; (ix < nprocs); ix++) {
|
||
|
process_data_p p = pindex[ix];
|
||
|
- if (p->CPUs_used * p->MBs_used < CPU_THRESHOLD * MEMORY_THRESHOLD) {
|
||
|
- break; // No more significant processes worth worrying about...
|
||
|
+ // If this process was recently bound, enforce a three-minute minimum
|
||
|
+ // delay between repeated attempts to potentially move the process.
|
||
|
+ // FIXME: make this delay contingent on node resource equity? Or,
|
||
|
+ // maybe change in running averages? Perhaps detect change in averages,
|
||
|
+ // or look at stddev? What is a good range for the delay? Discrete or
|
||
|
+ // continuous?
|
||
|
+#define MIN_DELAY_FOR_REEVALUATION (180 * ONE_HUNDRED)
|
||
|
+ if (p->bind_time_stamp + MIN_DELAY_FOR_REEVALUATION > time_stamp) {
|
||
|
+ // Skip re-evaluation because we just did it recently.
|
||
|
+ if (log_level >= LOG_DEBUG) {
|
||
|
+ numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because done too recently.\n", p->pid);
|
||
|
+ }
|
||
|
+ continue;
|
||
|
+ }
|
||
|
+ // If this process has interleaved memory, recheck it only every 30 minutes...
|
||
|
+#define MIN_DELAY_FOR_INTERLEAVE (1800 * ONE_HUNDRED)
|
||
|
+ if (((p->flags & PROCESS_FLAG_INTERLEAVED) > 0)
|
||
|
+ && (p->bind_time_stamp + MIN_DELAY_FOR_INTERLEAVE > time_stamp)) {
|
||
|
+ if (log_level >= LOG_DEBUG) {
|
||
|
+ numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because of interleaved memory.\n", p->pid);
|
||
|
+ }
|
||
|
+ continue;
|
||
|
}
|
||
|
- int mb_request = (p->MBs_used * 100) / target_utilization;
|
||
|
- int cpu_request = (p->CPUs_used * 100) / target_utilization;
|
||
|
- // Do not give a process more CPUs than it has threads!
|
||
|
- // FIXME: For guest VMs, should limit max to VCPU threads. Will
|
||
|
- // need to do something more intelligent with guest IO threads
|
||
|
- // when eventually considering devices and IRQs.
|
||
|
+ // Expand resources needed estimate using target_utilization factor.
|
||
|
+ // Start with the CPUs actually used (capped by number of threads) for
|
||
|
+ // CPUs required, but use the process virtual memory size for MBs
|
||
|
+ // requirement, (We previously used the RSS for MBs needed, but that
|
||
|
+ // caused problems with processes that had quickly expanding memory
|
||
|
+ // usage which also needed to cross NUMA boundaries. The downside of
|
||
|
+ // this choice is we might not pack processes as tightly as possible
|
||
|
+	// anymore.  Hopefully this will be a relatively rare occurrence in
|
||
|
+ // practice. KVM guests should not be significantly over-provisioned
|
||
|
+ // with memory they will never use!)
|
||
|
+ int mem_target_utilization = target_utilization;
|
||
|
+ int cpu_target_utilization = target_utilization;
|
||
|
+ // Cap memory utilization at 100 percent (but allow CPUs to oversubscribe)
|
||
|
+ if (mem_target_utilization > 100) {
|
||
|
+ mem_target_utilization = 100;
|
||
|
+ }
|
||
|
+ int mb_request = (p->MBs_size * 100) / mem_target_utilization;
|
||
|
+ int cpu_request = (p->CPUs_used * 100) / cpu_target_utilization;
|
||
|
+ // But do not give a process more CPUs than it has threads!
|
||
|
int thread_limit = p->num_threads;
|
||
|
- // If process looks like a KVM guest, try to limit to number of vCPU threads
|
||
|
+ // If process looks like a KVM guest, try to limit thread count to the
|
||
|
+ // number of vCPU threads. FIXME: Will need to do something more
|
||
|
+ // intelligent than this with guest IO threads when eventually
|
||
|
+ // considering devices and IRQs.
|
||
|
if ((p->comm) && (p->comm[0] == '(') && (p->comm[1] == 'q') && (strcmp(p->comm, "(qemu-kvm)") == 0)) {
|
||
|
int kvm_vcpu_threads = get_num_kvm_vcpu_threads(p->pid);
|
||
|
if (thread_limit > kvm_vcpu_threads) {
|
||
|
@@ -1988,18 +2325,18 @@ int manage_loads() {
|
||
|
if (cpu_request > thread_limit) {
|
||
|
cpu_request = thread_limit;
|
||
|
}
|
||
|
+ // OK, now pick NUMA nodes for this process and bind it!
|
||
|
pthread_mutex_lock(&node_info_mutex);
|
||
|
- id_list_p node_list_p = pick_numa_nodes(p->pid, cpu_request, mb_request);
|
||
|
- // FIXME: ?? copy node_list_p to shorten mutex region?
|
||
|
- if ((node_list_p != NULL) && (bind_process_and_migrate_memory(p->pid, p->cpuset_name, node_list_p, NULL))) {
|
||
|
- // Shorten interval if actively moving processes
|
||
|
+ int assume_enough_cpus = (sum_CPUs_used <= sum_CPUs_total);
|
||
|
+ id_list_p node_list_p = pick_numa_nodes(p->pid, cpu_request, mb_request, assume_enough_cpus);
|
||
|
+ if ((node_list_p != NULL) && (bind_process_and_migrate_memory(p))) {
|
||
|
pthread_mutex_unlock(&node_info_mutex);
|
||
|
- p->bind_time_stamp = get_time_stamp();
|
||
|
+ // Return minimum interval when actively moving processes
|
||
|
return min_interval;
|
||
|
}
|
||
|
pthread_mutex_unlock(&node_info_mutex);
|
||
|
}
|
||
|
- // Return maximum interval if no process movement
|
||
|
+ // Return maximum interval when no process movement
|
||
|
return max_interval;
|
||
|
}
|
||
|
|
||
|
@@ -2013,6 +2350,18 @@ void *set_dynamic_options(void *arg) {
|
||
|
msg_t msg;
|
||
|
recv_msg(&msg);
|
||
|
switch (msg.body.cmd) {
|
||
|
+ case 'C':
|
||
|
+ use_inactive_file_cache = (msg.body.arg1 != 0);
|
||
|
+ if (use_inactive_file_cache) {
|
||
|
+ numad_log(LOG_NOTICE, "Counting inactive file cache as available\n");
|
||
|
+ } else {
|
||
|
+ numad_log(LOG_NOTICE, "Counting inactive file cache as unavailable\n");
|
||
|
+ }
|
||
|
+ break;
|
||
|
+ case 'H':
|
||
|
+ thp_scan_sleep_ms = msg.body.arg1;
|
||
|
+ set_thp_scan_sleep_ms(thp_scan_sleep_ms);
|
||
|
+ break;
|
||
|
case 'i':
|
||
|
min_interval = msg.body.arg1;
|
||
|
max_interval = msg.body.arg2;
|
||
|
@@ -2055,6 +2404,11 @@ void *set_dynamic_options(void *arg) {
|
||
|
numad_log(LOG_NOTICE, "Scanning only explicit PID list processes\n");
|
||
|
}
|
||
|
break;
|
||
|
+ case 't':
|
||
|
+ numad_log(LOG_NOTICE, "Changing logical CPU thread percent to %d\n", msg.body.arg1);
|
||
|
+ htt_percent = msg.body.arg1;
|
||
|
+ node_info_time_stamp = 0; // to force rescan of nodes/cpus soon
|
||
|
+ break;
|
||
|
case 'u':
|
||
|
numad_log(LOG_NOTICE, "Changing target utilization to %d\n", msg.body.arg1);
|
||
|
target_utilization = msg.body.arg1;
|
||
|
@@ -2064,7 +2418,7 @@ void *set_dynamic_options(void *arg) {
|
||
|
msg.body.arg1, msg.body.arg2);
|
||
|
pthread_mutex_lock(&node_info_mutex);
|
||
|
update_nodes();
|
||
|
- id_list_p node_list_p = pick_numa_nodes(-1, msg.body.arg1, msg.body.arg2);
|
||
|
+ id_list_p node_list_p = pick_numa_nodes(-1, msg.body.arg1, msg.body.arg2, 0);
|
||
|
str_from_id_list(buf, BUF_SIZE, node_list_p);
|
||
|
pthread_mutex_unlock(&node_info_mutex);
|
||
|
send_msg(msg.body.src_pid, 'w', 0, 0, buf);
|
||
|
@@ -2134,20 +2488,28 @@ void parse_two_arg_values(char *p, int *
|
||
|
|
||
|
int main(int argc, char *argv[]) {
|
||
|
int opt;
|
||
|
+ int C_flag = 0;
|
||
|
int d_flag = 0;
|
||
|
+ int H_flag = 0;
|
||
|
int i_flag = 0;
|
||
|
int K_flag = 0;
|
||
|
int l_flag = 0;
|
||
|
int p_flag = 0;
|
||
|
int r_flag = 0;
|
||
|
int S_flag = 0;
|
||
|
+ int t_flag = 0;
|
||
|
int u_flag = 0;
|
||
|
int v_flag = 0;
|
||
|
int w_flag = 0;
|
||
|
int x_flag = 0;
|
||
|
+ int tmp_int = 0;
|
||
|
long list_pid = 0;
|
||
|
- while ((opt = getopt(argc, argv, "dD:hi:K:l:p:r:S:u:vVw:x:")) != -1) {
|
||
|
+ while ((opt = getopt(argc, argv, "C:dD:hH:i:K:l:p:r:R:S:t:u:vVw:x:")) != -1) {
|
||
|
switch (opt) {
|
||
|
+ case 'C':
|
||
|
+ C_flag = 1;
|
||
|
+ use_inactive_file_cache = (atoi(optarg) != 0);
|
||
|
+ break;
|
||
|
case 'd':
|
||
|
d_flag = 1;
|
||
|
log_level = LOG_DEBUG;
|
||
|
@@ -2158,6 +2520,17 @@ int main(int argc, char *argv[]) {
|
||
|
case 'h':
|
||
|
print_usage_and_exit(argv[0]);
|
||
|
break;
|
||
|
+ case 'H':
|
||
|
+ tmp_int = atoi(optarg);
|
||
|
+ if ((tmp_int == 0) || ((tmp_int > 9) && (tmp_int < 1000001))) {
|
||
|
+ // 0 means do not change the system default value
|
||
|
+ H_flag = 1;
|
||
|
+ thp_scan_sleep_ms = tmp_int;
|
||
|
+ } else {
|
||
|
+ fprintf(stderr, "THP scan_sleep_ms must be > 9 and < 1000001\n");
|
||
|
+ exit(EXIT_FAILURE);
|
||
|
+ }
|
||
|
+ break;
|
||
|
case 'i':
|
||
|
i_flag = 1;
|
||
|
parse_two_arg_values(optarg, &min_interval, &max_interval, 1, 0);
|
||
|
@@ -2183,13 +2556,26 @@ int main(int argc, char *argv[]) {
|
||
|
include_pid_list = remove_pid_from_pid_list(include_pid_list, list_pid);
|
||
|
exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, list_pid);
|
||
|
break;
|
||
|
+ case 'R':
|
||
|
+ reserved_cpu_str = strdup(optarg);
|
||
|
+ break;
|
||
|
case 'S':
|
||
|
S_flag = 1;
|
||
|
scan_all_processes = (atoi(optarg) != 0);
|
||
|
break;
|
||
|
+ case 't':
|
||
|
+ tmp_int = atoi(optarg);
|
||
|
+ if ((tmp_int >= 0) && (tmp_int <= 100)) {
|
||
|
+ t_flag = 1;
|
||
|
+ htt_percent = tmp_int;
|
||
|
+ }
|
||
|
+ break;
|
||
|
case 'u':
|
||
|
- u_flag = 1;
|
||
|
- target_utilization = atoi(optarg);
|
||
|
+ tmp_int = atoi(optarg);
|
||
|
+ if ((tmp_int >= 10) && (tmp_int <= 130)) {
|
||
|
+ u_flag = 1;
|
||
|
+ target_utilization = tmp_int;
|
||
|
+ }
|
||
|
break;
|
||
|
case 'v':
|
||
|
v_flag = 1;
|
||
|
@@ -2234,6 +2620,12 @@ int main(int argc, char *argv[]) {
|
||
|
// Daemon is already running. So send dynamic options to persistant
|
||
|
// thread to handle requests, get the response (if any), and finish.
|
||
|
msg_t msg;
|
||
|
+ if (C_flag) {
|
||
|
+ send_msg(daemon_pid, 'C', use_inactive_file_cache, 0, "");
|
||
|
+ }
|
||
|
+ if (H_flag) {
|
||
|
+ send_msg(daemon_pid, 'H', thp_scan_sleep_ms, 0, "");
|
||
|
+ }
|
||
|
if (i_flag) {
|
||
|
send_msg(daemon_pid, 'i', min_interval, max_interval, "");
|
||
|
}
|
||
|
@@ -2252,6 +2644,9 @@ int main(int argc, char *argv[]) {
|
||
|
if (S_flag) {
|
||
|
send_msg(daemon_pid, 'S', scan_all_processes, 0, "");
|
||
|
}
|
||
|
+ if (t_flag) {
|
||
|
+ send_msg(daemon_pid, 't', htt_percent, 0, "");
|
||
|
+ }
|
||
|
if (u_flag) {
|
||
|
send_msg(daemon_pid, 'u', target_utilization, 0, "");
|
||
|
}
|
||
|
@@ -2263,14 +2658,30 @@ int main(int argc, char *argv[]) {
|
||
|
if (x_flag) {
|
||
|
send_msg(daemon_pid, 'x', list_pid, 0, "");
|
||
|
}
|
||
|
- } else if (w_flag) {
|
||
|
- // Get pre-placement NUMA advice without starting daemon
|
||
|
+ close_log_file();
|
||
|
+ exit(EXIT_SUCCESS);
|
||
|
+ }
|
||
|
+ // No numad daemon running yet.
|
||
|
+ // First, make note of any reserved CPUs....
|
||
|
+ if (reserved_cpu_str != NULL) {
|
||
|
+ CLEAR_CPU_LIST(reserved_cpu_mask_list_p);
|
||
|
+ int n = add_ids_to_list_from_str(reserved_cpu_mask_list_p, reserved_cpu_str);
|
||
|
char buf[BUF_SIZE];
|
||
|
+ str_from_id_list(buf, BUF_SIZE, reserved_cpu_mask_list_p);
|
||
|
+ numad_log(LOG_NOTICE, "Reserving %d CPUs (%s) for non-numad use\n", n, buf);
|
||
|
+ // turn reserved list into a negated mask for later ANDing use...
|
||
|
+ negate_list(reserved_cpu_mask_list_p);
|
||
|
+ }
|
||
|
+ // If it is a "-w" pre-placement request, handle that without starting
|
||
|
+ // the daemon. Otherwise start the numad daemon.
|
||
|
+ if (w_flag) {
|
||
|
+ // Get pre-placement NUMA advice without starting daemon
|
||
|
update_nodes();
|
||
|
sleep(2);
|
||
|
update_nodes();
|
||
|
numad_log(LOG_NOTICE, "Getting NUMA pre-placement advice for %d CPUs and %d MBs\n", requested_cpus, requested_mbs);
|
||
|
- id_list_p node_list_p = pick_numa_nodes(-1, requested_cpus, requested_mbs);
|
||
|
+ id_list_p node_list_p = pick_numa_nodes(-1, requested_cpus, requested_mbs, 0);
|
||
|
+ char buf[BUF_SIZE];
|
||
|
str_from_id_list(buf, BUF_SIZE, node_list_p);
|
||
|
fprintf(stdout, "%s\n", buf);
|
||
|
close_log_file();
|
||
|
@@ -2278,6 +2689,7 @@ int main(int argc, char *argv[]) {
|
||
|
} else if (max_interval > 0) {
|
||
|
// Start the numad daemon...
|
||
|
check_prereqs(argv[0]);
|
||
|
+#if (!NO_DAEMON)
|
||
|
// Daemonize self...
|
||
|
daemon_pid = fork();
|
||
|
if (daemon_pid < 0) { numad_log(LOG_CRIT, "fork() failed\n"); exit(EXIT_FAILURE); }
|
||
|
@@ -2298,9 +2710,21 @@ int main(int argc, char *argv[]) {
|
||
|
if (log_fs != stderr) {
|
||
|
fclose(stderr);
|
||
|
}
|
||
|
+#endif
|
||
|
+ // Set up signal handlers
|
||
|
+ struct sigaction sa;
|
||
|
+ memset(&sa, 0, sizeof(sa));
|
||
|
+ sa.sa_handler = sig_handler;
|
||
|
+ if (sigaction(SIGHUP, &sa, NULL)
|
||
|
+ || sigaction(SIGTERM, &sa, NULL)
|
||
|
+ || sigaction(SIGQUIT, &sa, NULL)) {
|
||
|
+ numad_log(LOG_CRIT, "sigaction does not work?\n");
|
||
|
+ exit(EXIT_FAILURE);
|
||
|
+ }
|
||
|
// Allocate initial process hash table
|
||
|
process_hash_table_expand();
|
||
|
- // Spawn thread to handle messages from subsequent invocation requests
|
||
|
+ // Spawn a thread to handle messages from subsequent invocation requests
|
||
|
+ // and also a lazy background thread to clean up obsolete cpusets.
|
||
|
pthread_mutex_init(&pid_list_mutex, NULL);
|
||
|
pthread_mutex_init(&node_info_mutex, NULL);
|
||
|
pthread_attr_t attr;
|
||
|
@@ -2310,7 +2734,11 @@ int main(int argc, char *argv[]) {
|
||
|
}
|
||
|
pthread_t tid;
|
||
|
if (pthread_create(&tid, &attr, &set_dynamic_options, &tid) != 0) {
|
||
|
- numad_log(LOG_CRIT, "pthread_create failure\n");
|
||
|
+ numad_log(LOG_CRIT, "pthread_create failure: setting thread\n");
|
||
|
+ exit(EXIT_FAILURE);
|
||
|
+ }
|
||
|
+ if (pthread_create(&tid, &attr, &clean_obsolete_cpusets, &tid) != 0) {
|
||
|
+ numad_log(LOG_CRIT, "pthread_create failure: cleaning thread\n");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
// Loop here forwever...
|
||
|
@@ -2324,14 +2752,20 @@ int main(int argc, char *argv[]) {
|
||
|
interval = manage_loads();
|
||
|
}
|
||
|
sleep(interval);
|
||
|
+ if (got_sigterm | got_sigquit) {
|
||
|
+ shut_down_numad();
|
||
|
+ }
|
||
|
+ if (got_sighup) {
|
||
|
+ got_sighup = 0;
|
||
|
+ close_log_file();
|
||
|
+ open_log_file();
|
||
|
+ }
|
||
|
}
|
||
|
if (pthread_attr_destroy(&attr) != 0) {
|
||
|
numad_log(LOG_WARNING, "pthread_attr_destroy failure\n");
|
||
|
}
|
||
|
pthread_mutex_destroy(&pid_list_mutex);
|
||
|
pthread_mutex_destroy(&node_info_mutex);
|
||
|
- } else {
|
||
|
- shut_down_numad();
|
||
|
}
|
||
|
exit(EXIT_SUCCESS);
|
||
|
}
|
||
|
--- numad-0.5git/numad.8 2012-12-03 15:40:40.000000000 +0100
|
||
|
+++ new-rhel7/numad.8 2014-02-27 10:03:07.000000000 +0100
|
||
|
@@ -8,9 +8,15 @@ management for efficient use of CPUs and
|
||
|
numad [\fI\-dhvV\fP]
|
||
|
.br
|
||
|
.LP
|
||
|
+numad [\fI\-C 0|1\fP]
|
||
|
+.br
|
||
|
+.LP
|
||
|
numad [\fI\-D non-standard-cgroup-mount-point\fP]
|
||
|
.br
|
||
|
.LP
|
||
|
+numad [\fI\-H THP_hugepage_scan_sleep_ms\fP]
|
||
|
+.br
|
||
|
+.LP
|
||
|
numad [\fI\-i [min_interval:]max_interval\fP]
|
||
|
.br
|
||
|
.LP
|
||
|
@@ -26,9 +32,15 @@ numad [\fI\-p PID\fP]
|
||
|
numad [\fI\-r PID\fP]
|
||
|
.br
|
||
|
.LP
|
||
|
+numad [\fI\-R reserved-CPU-list\fP]
|
||
|
+.br
|
||
|
+.LP
|
||
|
numad [\fI\-S 0|1\fP]
|
||
|
.br
|
||
|
.LP
|
||
|
+numad [\fI\-t logical_CPU_percent\fP]
|
||
|
+.br
|
||
|
+.LP
|
||
|
numad [\fI\-u target_utilization\fP]
|
||
|
.br
|
||
|
.LP
|
||
|
@@ -37,7 +49,6 @@ numad [\fI\-w NCPUS[:MB]\fP]
|
||
|
.LP
|
||
|
numad [\fI\-x PID\fP]
|
||
|
.br
|
||
|
-
|
||
|
.SH "DESCRIPTION"
|
||
|
.LP
|
||
|
Numad is a system daemon that monitors NUMA topology and resource usage. It
|
||
|
@@ -54,6 +65,13 @@ accesses will likely remain unpredictabl
|
||
|
performance.
|
||
|
.SH "OPTIONS"
|
||
|
.LP
|
||
|
+.TP
|
||
|
+\fB\-C\fR <\fI0|1\fP>
|
||
|
+This option controls whether or not numad treats inactive file cache as
|
||
|
+available memory. By default, numad assumes it can count inactive file cache as
|
||
|
+"free" memory when considering resources to match with processes. Specify
|
||
|
+\fI\-C 0\fP if numad should instead consider inactive file cache as a consumed
|
||
|
+resource.
|
||
|
.TP
|
||
|
\fB\-d\fR
|
||
|
Debug output in log, sets the log level to LOG_DEBUG. Same effect as \fI\-l 7\fP.
|
||
|
@@ -65,6 +83,16 @@ numad. This is not normally necessary.
|
||
|
\fB\-h\fR
|
||
|
Display usage help information and then exit.
|
||
|
.TP
|
||
|
+\fB\-H\fR <\fITHP_scan_sleep_ms\fP>
|
||
|
+Set the desired transparent hugepage scan interval in ms. The
|
||
|
+/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs tunable is
|
||
|
+usually set to 10000ms by the operating system. The default is changed by
|
||
|
+numad to be 1000ms since it is helpful for the hugepage daemon to be more
|
||
|
+aggressive when memory moves between nodes. If you don't like numad's choice
|
||
|
+of 1000ms, you can make the hugepage daemon more or less aggressive by
|
||
|
+specifying an alternate value with this option. Setting this value to 100ms
|
||
|
+might improve some workloads which use many transparent hugepages.
|
||
|
+.TP
|
||
|
\fB\-i\fR <\fI[min_interval:]max_interval\fP>
|
||
|
Sets the time interval that numad waits between system scans, in seconds to
|
||
|
<\fImax_interval\fP>. Default <\fImax_interval\fP> is 15 seconds, default
|
||
|
@@ -85,7 +113,9 @@ large in-memory database), you might get
|
||
|
.TP
|
||
|
\fB\-l\fR <\fIlog_level\fP>
|
||
|
Sets the log level to <\fIlog_level\fP>. Reasonable choices are 5, 6, or 7.
|
||
|
-The default value is 5.
|
||
|
+The default value is 5. Note that CPU values are scaled by a factor of 100
|
||
|
+internally and in the numad log files. Unfortunately, you don't actually have
|
||
|
+that many CPUs.
|
||
|
.TP
|
||
|
\fB\-p\fR <\fIPID\fP>
|
||
|
Add PID to explicit inclusion list of processes to consider for managing, if
|
||
|
@@ -102,6 +132,12 @@ processes. After daemon start, only one
|
||
|
process lists per subsequent numad invocation. Use with \-S and \-p and \-x to
|
||
|
precisely control the scope of processes numad can manage.
|
||
|
.TP
|
||
|
+\fB\-R\fR <\fICPU_LIST\fP>
|
||
|
+Specify a list of CPUs that numad should assume are reserved for non-numad use.
|
||
|
+No processes will be bound to the specified CPUs by numad. This option is
|
||
|
+effective only when starting numad. You cannot change reserved CPUs
|
||
|
+dynamically while numad is already running.
|
||
|
+.TP
|
||
|
\fB\-S\fR <\fI0|1\fP>
|
||
|
This option controls whether numad scans all system processes or only the
|
||
|
processes on the explicit inclusion PID list. The default is to scan all
|
||
|
@@ -114,10 +150,19 @@ exclusion list). Starting numad as
|
||
|
will limit scanning, and thus also automatic NUMA management, to only those
|
||
|
three explicitly specified processes.
|
||
|
.TP
|
||
|
+\fB\-t\fR <\fIlogical_CPU_percent\fP>
|
||
|
+Determine the resource value of logical CPUs. Hardware threads typically share
|
||
|
+most core resources, and so add only a fraction of CPU power for many
|
||
|
+workloads. By default numad considers logical CPUs to be only 20 percent of a
|
||
|
+dedicated core.
|
||
|
+.TP
|
||
|
\fB\-u\fR <\fItarget_utilization\fP>
|
||
|
Set the desired maximum consumption percentage of a node. Default is 85%.
|
||
|
Decrease the target value to maintain more available resource margin on each
|
||
|
node. Increase the target value to more exhaustively consume node resources.
|
||
|
+It is possible to specify values up to 130 percent, to oversubscribe CPUs in
|
||
|
+the nodes, but memory utilization is capped at 100%. Use oversubscription
|
||
|
+values carefully.
|
||
|
.TP
|
||
|
\fB\-v\fR
|
||
|
Verbose output in log, sets the log level to LOG_INFO. Same effect as \fI\-l 6\fP.
|
||
|
@@ -159,18 +204,21 @@ numad can manage.
|
||
|
None.
|
||
|
.SH "EXAMPLES"
|
||
|
.LP
|
||
|
-Numad is normally run as a system daemon and should be managed by the
|
||
|
+Numad can be run as a system daemon and can be managed by the
|
||
|
standard init mechanisms of the host.
|
||
|
.LP
|
||
|
If interactive (manual) control is desired, you can start the daemon manually by typing:
|
||
|
.LP
|
||
|
/usr/bin/numad
|
||
|
.LP
|
||
|
-Subsequent numad invocations while the daemon is running can be used to dynamically change run-time options.
|
||
|
+Subsequent numad invocations while the daemon is running can be used to dynamically change most run-time options.
|
||
|
+.LP
|
||
|
+You can terminate a running numad daemon by typing:
|
||
|
+.LP
|
||
|
+/usr/bin/numad -i0
|
||
|
.SH "AUTHORS"
|
||
|
.LP
|
||
|
Bill Gray <bgray@redhat.com>
|
||
|
.SH "SEE ALSO"
|
||
|
.LP
|
||
|
numactl(8)
|
||
|
-
|