diff --git a/Makefile.in b/Makefile.in
index e3cd296c..9ae28bac 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -46,7 +46,7 @@ CFLAGS=@CFLAGS@
 CFLAGS_NOPIE=@CFLAGS_NOPIE@
 CPPFLAGS=-I. -I$(srcdir) @CPPFLAGS@ $(PATHS) @DEFS@
 PICFLAG=@PICFLAG@
-LIBS=@LIBS@
+LIBS=@LIBS@ -lpthread
 K5LIBS=@K5LIBS@
 GSSLIBS=@GSSLIBS@
 SSHDLIBS=@SSHDLIBS@
@@ -92,7 +92,7 @@ LIBOPENSSH_OBJS=\
 LIBSSH_OBJS=${LIBOPENSSH_OBJS} \
 	authfd.o authfile.o \
 	canohost.o channels.o cipher.o cipher-aes.o cipher-aesctr.o \
-	cipher-ctr.o cleanup.o \
+	cipher-ctr.o cleanup.o cipher-ctr-mt.o \
 	compat.o fatal.o hostfile.o \
 	log.o match.o moduli.o nchan.o packet.o \
 	readpass.o ttymodes.o xmalloc.o addr.o addrmatch.o \
diff --git a/cipher-ctr-mt.c b/cipher-ctr-mt.c
new file mode 100644
index 00000000..71172f1b
--- /dev/null
+++ b/cipher-ctr-mt.c
@@ -0,0 +1,679 @@
+/*
+ * OpenSSH Multi-threaded AES-CTR Cipher
+ *
+ * Author: Benjamin Bennett <ben@psc.edu>
+ * Author: Mike Tasota <tasota@gmail.com>
+ * Author: Chris Rapier <rapier@psc.edu>
+ * Copyright (c) 2008-2013 Pittsburgh Supercomputing Center. All rights reserved.
+ *
+ * Based on original OpenSSH AES-CTR cipher. Small portions remain unchanged,
+ * Copyright (c) 2003 Markus Friedl <markus@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include "includes.h"
+
+#if defined(WITH_OPENSSL)
+#include <sys/types.h>
+
+#include <stdarg.h>
+#include <string.h>
+
+#include <openssl/evp.h>
+
+#include "xmalloc.h"
+#include "log.h"
+#include <unistd.h>
+
+/* compatibility with old or broken OpenSSL versions */
+#include "openbsd-compat/openssl-compat.h"
+
+#ifndef USE_BUILTIN_RIJNDAEL
+#include <openssl/aes.h>
+#endif
+
+#include <pthread.h>
+
+/*-------------------- TUNABLES --------------------*/
+/* maximum number of threads and queues */
+#define MAX_THREADS      32 
+#define MAX_NUMKQ        (MAX_THREADS * 2)
+
+/* Number of pregen threads to use */
+int cipher_threads = 2;
+
+/* Number of keystream queues */
+int numkq = 4;
+
+/* Length of a keystream queue */
+#define KQLEN		4096
+
+/* Processor cacheline length */
+#define CACHELINE_LEN	64
+
+/* Collect thread stats and print at cancellation when in debug mode */
+#define CIPHER_THREAD_STATS
+
+/* Can the system do unaligned loads natively? */
+#if defined(__aarch64__) || \
+    defined(__i386__)    || \
+    defined(__powerpc__) || \
+    defined(__x86_64__)
+# define CIPHER_UNALIGNED_OK
+#endif
+#if defined(__SIZEOF_INT128__)
+# define CIPHER_INT128_OK
+#endif
+/*-------------------- END TUNABLES --------------------*/
+
+
+const EVP_CIPHER *evp_aes_ctr_mt(void);
+
+#ifdef CIPHER_THREAD_STATS
+/*
+ * Struct to collect thread stats
+ */
+struct thread_stats {
+	u_int	fills;	/* bumped by STATS_FILL each time a queue has been filled */
+	u_int	skips;	/* bumped by STATS_SKIP when a queue is passed over */
+	u_int	waits;	/* bumped by STATS_WAIT on each pass through a cond_wait loop */
+	u_int	drains;	/* bumped by STATS_DRAIN when the consumer finishes a queue */
+};
+
+/*
+ * Debug print the fills/skips/waits counters of the struct thread_stats at x.
+ * Use with pthread_cleanup_push for displaying at thread cancellation
+ */
+static void
+thread_loop_stats(void *x)
+{
+	struct thread_stats *s = x;
+	/* NOTE(review): %lu assumes pthread_t prints as an unsigned long - confirm per platform */
+	debug("AES-CTR MT tid %lu - %u fills, %u skips, %u waits", pthread_self(),
+			s->fills, s->skips, s->waits);
+}
+
+# define STATS_STRUCT(s)	struct thread_stats s
+# define STATS_INIT(s)		{ memset(&s, 0, sizeof(s)); }
+# define STATS_FILL(s)		{ s.fills++; }
+# define STATS_SKIP(s)		{ s.skips++; }
+# define STATS_WAIT(s)		{ s.waits++; }
+# define STATS_DRAIN(s)		{ s.drains++; }
+#else
+# define STATS_STRUCT(s)
+# define STATS_INIT(s)
+# define STATS_FILL(s)
+# define STATS_SKIP(s)
+# define STATS_WAIT(s)
+# define STATS_DRAIN(s)
+#endif
+
+/* Keystream Queue state */
+enum {
+	KQINIT,
+	KQEMPTY,
+	KQFILLING,
+	KQFULL,
+	KQDRAINING
+};
+
+/* Keystream Queue struct */
+struct kq {
+	u_char		keys[KQLEN][AES_BLOCK_SIZE];	/* pregenerated keystream blocks */
+	u_char		ctr[AES_BLOCK_SIZE];	/* counter the next fill of this queue starts from */
+	u_char		pad0[CACHELINE_LEN];	/* padding - presumably against false sharing; confirm */
+	int		qstate;	/* one of the KQ* states above */
+	pthread_mutex_t	lock;	/* taken by threads around qstate reads and writes */
+	pthread_cond_t	cond;	/* broadcast whenever a thread changes qstate */
+	u_char		pad1[CACHELINE_LEN];
+};
+
+/* Context struct */
+struct ssh_aes_ctr_ctx_mt
+{
+	int             struct_id;	/* label taken from X; only used in debug output */
+	struct kq	q[MAX_NUMKQ];	/* keystream queues; the first numkq are used */
+	AES_KEY		aes_ctx;	/* key schedule; each pregen thread copies it */
+	STATS_STRUCT(stats);
+	u_char		aes_counter[AES_BLOCK_SIZE];	/* seeded from the IV, bumped per block consumed */
+	pthread_t	tid[MAX_THREADS];	/* pregen thread ids; the first cipher_threads are used */
+	int             id[MAX_THREADS];	/* thread index; only used in debug output */
+	pthread_rwlock_t tid_lock;	/* write-held around pthread_create, read-held to fetch tid[0] */
+#ifdef __APPLE__
+	pthread_rwlock_t stop_lock;	/* guards exit_flag */
+	int		exit_flag;	/* set to make pregen threads call pthread_exit */
+#endif /* __APPLE__ */
+	int		state;	/* HAVE_KEY / HAVE_IV bits collected so far */
+	int		qidx;	/* queue the consumer is currently reading */
+	int		ridx;	/* next block to read within that queue */
+};
+
+/* <friedl>
+ * increment counter 'ctr',
+ * the counter is of size 'len' bytes and stored in network-byte-order.
+ * (LSB at ctr[len-1], MSB at ctr[0])
+ */
+static void
+ssh_ctr_inc(u_char *ctr, size_t len)
+{
+	size_t pos;
+
+	for (pos = len; pos > 0; pos--)
+		if (++ctr[pos - 1] != 0)	/* byte did not wrap: carry stops here */
+			break;
+}
+
+/*
+ * Add num to counter 'ctr'
+ */
+static void
+ssh_ctr_add(u_char *ctr, uint32_t num, u_int len)
+{
+	uint16_t carry = 0;
+	int pos = len - 1;
+	/* big-endian byte-wise add; stop once addend and carry are both spent */
+	while (pos >= 0 && (num != 0 || carry != 0)) {
+		carry = ctr[pos] + (num & 0xff) + carry;
+		ctr[pos--] = carry & 0xff;
+		num >>= 8;
+		carry >>= 8;
+	}
+}
+
+/*
+ * Cleanup handler: unlock the mutex at x, held again if cancelled in pthread_cond_wait
+ */
+static void
+thread_loop_cleanup(void *x)
+{
+	pthread_mutex_unlock((pthread_mutex_t *)x);
+}
+
+#ifdef __APPLE__
+/* Check if we should exit, we are doing both cancel and exit condition
+ * since on OSX threads seem to occasionally fail to notice when they have
+ * been cancelled. We want to have a backup to make sure that we won't hang
+ * when the main process join()-s the cancelled thread.
+ */
+static void
+thread_loop_check_exit(struct ssh_aes_ctr_ctx_mt *c)
+{
+	int exit_flag;
+
+	pthread_rwlock_rdlock(&c->stop_lock);
+	exit_flag = c->exit_flag;	/* sampled under stop_lock, acted on after release */
+	pthread_rwlock_unlock(&c->stop_lock);
+
+	if (exit_flag)
+		pthread_exit(NULL);
+}
+#else
+# define thread_loop_check_exit(s)	/* expands to nothing when __APPLE__ is not defined */
+#endif /* __APPLE__ */
+
+/*
+ * Cancel the first cipher_threads pregen threads of c, then join those still alive
+ */
+static void
+stop_and_join_pregen_threads(struct ssh_aes_ctr_ctx_mt *c)
+{
+	int i;
+
+#ifdef __APPLE__
+	/* notify threads that they should exit */
+	pthread_rwlock_wrlock(&c->stop_lock);
+	c->exit_flag = TRUE;
+	pthread_rwlock_unlock(&c->stop_lock);
+#endif /* __APPLE__ */
+
+	/* Cancel pregen threads */
+	for (i = 0; i < cipher_threads; i++) {
+		debug ("Canceled %lu (%d,%d)", c->tid[i], c->struct_id, c->id[i]);
+		pthread_cancel(c->tid[i]);
+	}
+	/* shouldn't need this - see commit logs for hpn-7_7_P1 -cjr 11/7/19*/
+	/* for (i = 0; i < numkq; i++) { */
+	/* 	pthread_mutex_lock(&c->q[i].lock); */
+	/* 	pthread_cond_broadcast(&c->q[i].cond); */
+	/* 	pthread_mutex_unlock(&c->q[i].lock); */
+	/* } */
+	for (i = 0; i < cipher_threads; i++) {
+		if (pthread_kill(c->tid[i], 0) != 0)	/* signal 0: probes the thread id, delivers nothing */
+			debug3("AES-CTR MT pthread_join failure: Invalid thread id %lu in %s", c->tid[i], __FUNCTION__);
+		else {
+			debug ("Joining %lu (%d, %d)", c->tid[i], c->struct_id, c->id[i]);
+			pthread_join(c->tid[i], NULL);
+		}
+	}
+}
+
+/*
+ * Pregen thread body; x is the shared struct ssh_aes_ctr_ctx_mt.
+ *    Find empty keystream queues and fill them using their counter.
+ *    When done, advance the counter past the blocks of the other queues.
+ */
+static void *
+thread_loop(void *x)
+{
+	AES_KEY key;
+	STATS_STRUCT(stats);
+	struct ssh_aes_ctr_ctx_mt *c = x;
+	struct kq *q;
+	int i;
+	int qidx;
+	pthread_t first_tid;
+
+	/* Threads stats on cancellation */
+	STATS_INIT(stats);
+#ifdef CIPHER_THREAD_STATS
+	pthread_cleanup_push(thread_loop_stats, &stats);
+#endif
+
+	/* Thread local copy of AES key */
+	memcpy(&key, &c->aes_ctx, sizeof(key));
+
+	pthread_rwlock_rdlock(&c->tid_lock);
+	first_tid = c->tid[0];
+	pthread_rwlock_unlock(&c->tid_lock);
+
+	/*
+	 * Handle the special case of startup, one thread must fill
+	 * the first KQ then mark it as draining. Lock held throughout.
+	 */
+	if (pthread_equal(pthread_self(), first_tid)) {
+		q = &c->q[0];
+		pthread_mutex_lock(&q->lock);
+		if (q->qstate == KQINIT) {
+			for (i = 0; i < KQLEN; i++) {
+				AES_encrypt(q->ctr, q->keys[i], &key);
+				ssh_ctr_inc(q->ctr, AES_BLOCK_SIZE);
+			}
+			ssh_ctr_add(q->ctr, KQLEN * (numkq - 1), AES_BLOCK_SIZE);
+			q->qstate = KQDRAINING;
+			STATS_FILL(stats);
+			pthread_cond_broadcast(&q->cond);
+		}
+		pthread_mutex_unlock(&q->lock);
+	} else
+		STATS_SKIP(stats);
+
+	/*
+	 * Normal case is to find empty queues and fill them, skipping over
+	 * queues already filled by other threads and stopping to wait for
+	 * a draining queue to become empty.
+	 *
+	 * Multiple threads may be waiting on a draining queue and awoken
+	 * when empty.  The first thread to wake will mark it as filling,
+	 * others will move on to fill, skip, or wait on the next queue.
+	 */
+	for (qidx = 1;; qidx = (qidx + 1) % numkq) {
+		/* Check if I was cancelled, also checked in cond_wait */
+		pthread_testcancel();
+
+		/* Check if we should exit as well */
+		thread_loop_check_exit(c);
+
+		/* Lock queue and block if its draining */
+		q = &c->q[qidx];
+		pthread_mutex_lock(&q->lock);
+		pthread_cleanup_push(thread_loop_cleanup, &q->lock);
+		while (q->qstate == KQDRAINING || q->qstate == KQINIT) {
+			STATS_WAIT(stats);
+			thread_loop_check_exit(c);
+			pthread_cond_wait(&q->cond, &q->lock);
+		}
+		pthread_cleanup_pop(0);
+
+		/* If filling or full, somebody else got it, skip */
+		if (q->qstate != KQEMPTY) {
+			pthread_mutex_unlock(&q->lock);
+			STATS_SKIP(stats);
+			continue;
+		}
+
+		/*
+		 * Empty, let's fill it.
+		 * Queue lock is relinquished while we do this so others
+		 * can see that it's being filled.
+		 */
+		q->qstate = KQFILLING;
+		pthread_cond_broadcast(&q->cond);
+		pthread_mutex_unlock(&q->lock);
+		for (i = 0; i < KQLEN; i++) {
+			AES_encrypt(q->ctr, q->keys[i], &key);
+			ssh_ctr_inc(q->ctr, AES_BLOCK_SIZE);
+		}
+
+		/* Re-lock, mark full and signal consumer */
+		pthread_mutex_lock(&q->lock);
+		ssh_ctr_add(q->ctr, KQLEN * (numkq - 1), AES_BLOCK_SIZE);
+		q->qstate = KQFULL;
+		STATS_FILL(stats);
+		pthread_cond_broadcast(&q->cond);
+		pthread_mutex_unlock(&q->lock);
+	}
+	/* The loop above has no break; the pop below pairs with the push at the top */
+#ifdef CIPHER_THREAD_STATS
+	/* Stats */
+	pthread_cleanup_pop(1);
+#endif
+
+	return NULL;
+}
+
+static int
+ssh_aes_ctr(EVP_CIPHER_CTX *ctx, u_char *dest, const u_char *src,
+    LIBCRYPTO_EVP_INL_TYPE len)
+{
+	typedef union {
+#ifdef CIPHER_INT128_OK
+		__uint128_t *u128;
+#endif
+		uint64_t *u64;
+		uint32_t *u32;
+		uint8_t *u8;
+		const uint8_t *cu8;
+		uintptr_t u;
+	} ptrs_t;
+	ptrs_t destp, srcp, bufp;
+	uintptr_t align;
+	struct ssh_aes_ctr_ctx_mt *c;
+	struct kq *q, *oldq;
+	int ridx;
+	u_char *buf;
+	/* EVP do_cipher hook: dest = src XOR pregenerated keystream. Returns 1, or 0 if ctx has no MT state */
+	if (len == 0)
+		return 1;
+	if ((c = EVP_CIPHER_CTX_get_app_data(ctx)) == NULL)
+		return 0;
+
+	q = &c->q[c->qidx];
+	ridx = c->ridx;
+
+	/* src already padded to block multiple */
+	srcp.cu8 = src;
+	destp.u8 = dest;
+	while (len > 0) {
+		buf = q->keys[ridx];
+		bufp.u8 = buf;
+
+		/* figure out the alignment on the fly */
+#ifdef CIPHER_UNALIGNED_OK
+		align = 0;
+#else
+		align = destp.u | srcp.u | bufp.u;
+#endif
+
+#ifdef CIPHER_INT128_OK
+		if ((align & 0xf) == 0) {
+			destp.u128[0] = srcp.u128[0] ^ bufp.u128[0];
+		} else
+#endif
+		if ((align & 0x7) == 0) {
+			destp.u64[0] = srcp.u64[0] ^ bufp.u64[0];
+			destp.u64[1] = srcp.u64[1] ^ bufp.u64[1];
+		} else if ((align & 0x3) == 0) {
+			destp.u32[0] = srcp.u32[0] ^ bufp.u32[0];
+			destp.u32[1] = srcp.u32[1] ^ bufp.u32[1];
+			destp.u32[2] = srcp.u32[2] ^ bufp.u32[2];
+			destp.u32[3] = srcp.u32[3] ^ bufp.u32[3];
+		} else {
+			size_t i;	/* byte-wise fallback: must follow the advancing pointers, not dest/src */
+			for (i = 0; i < AES_BLOCK_SIZE; ++i)
+				destp.u8[i] = srcp.cu8[i] ^ bufp.u8[i];
+		}
+
+		destp.u += AES_BLOCK_SIZE;
+		srcp.u += AES_BLOCK_SIZE;
+		len -= AES_BLOCK_SIZE;
+		ssh_ctr_inc(c->aes_counter, AES_BLOCK_SIZE);
+
+		/* Increment read index, switch queues on rollover */
+		if ((ridx = (ridx + 1) % KQLEN) == 0) {
+			oldq = q;
+
+			/* Mark next queue draining, may need to wait */
+			c->qidx = (c->qidx + 1) % numkq;
+			q = &c->q[c->qidx];
+			pthread_mutex_lock(&q->lock);
+			while (q->qstate != KQFULL) {
+				STATS_WAIT(c->stats);
+				pthread_cond_wait(&q->cond, &q->lock);
+			}
+			q->qstate = KQDRAINING;
+			pthread_cond_broadcast(&q->cond);
+			pthread_mutex_unlock(&q->lock);
+
+			/* Mark consumed queue empty and signal producers */
+			pthread_mutex_lock(&oldq->lock);
+			oldq->qstate = KQEMPTY;
+			STATS_DRAIN(c->stats);
+			pthread_cond_broadcast(&oldq->cond);
+			pthread_mutex_unlock(&oldq->lock);
+		}
+	}
+	c->ridx = ridx;
+	return 1;
+}
+
+#define HAVE_NONE       0
+#define HAVE_KEY        1
+#define HAVE_IV         2
+
+static int X = 0;	/* hands out struct_id labels for debug output; file-local */
+
+static int
+ssh_aes_ctr_init(EVP_CIPHER_CTX *ctx, const u_char *key, const u_char *iv,
+    int enc)
+{
+	struct ssh_aes_ctr_ctx_mt *c;
+	int i;
+
+	/*
+	 * EVP init callback. key and iv may each be NULL; whatever is given is
+	 * recorded in c->state, and the pregen threads are (re)started only
+	 * once both a key and an IV are held. 'enc' is unused. Returns 1;
+	 * failure to create a pregen thread is fatal.
+	 */
+
+	/*
+	 * Size the thread pool from the number of online cores, divided by 2
+	 * to get threads for each direction (MODE_IN||MODE_OUT). Platforms not
+	 * listed keep the current value of cipher_threads (initially 2).
+	 *
+	 * FreeBSD is tested with __FreeBSD__, the macro its compilers define;
+	 * a misspelt __FREEBSD__ guard would never be true. sysconf() works
+	 * there as well, so no sysctl() call or extra header is needed. If
+	 * sysconf() fails (-1) the result is 0 and the clamp below applies.
+	 */
+#if defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__)
+	cipher_threads = sysconf(_SC_NPROCESSORS_ONLN) / 2;
+#endif
+
+	/* if they have less than 4 cores spin up 4 threads anyway */
+	if (cipher_threads < 2)
+		cipher_threads = 2;
+
+	/* assure that we aren't trying to create more threads */
+	/* than we have in the struct. cipher_threads is half the */
+	/* total of allowable threads hence the odd looking math here */
+	if (cipher_threads * 2 > MAX_THREADS)
+		cipher_threads = MAX_THREADS / 2;
+
+	/* set the number of keystream queues */
+	numkq = cipher_threads * 2;
+
+	if ((c = EVP_CIPHER_CTX_get_app_data(ctx)) == NULL) {
+		c = xcalloc(1, sizeof(*c));	/* zeroed: struct_id is tested below before it is ever set */
+		pthread_rwlock_init(&c->tid_lock, NULL);
+#ifdef __APPLE__
+		pthread_rwlock_init(&c->stop_lock, NULL);
+		c->exit_flag = FALSE;
+#endif /* __APPLE__ */
+
+		c->state = HAVE_NONE;
+		for (i = 0; i < numkq; i++) {
+			pthread_mutex_init(&c->q[i].lock, NULL);
+			pthread_cond_init(&c->q[i].cond, NULL);
+		}
+
+		STATS_INIT(c->stats);
+		EVP_CIPHER_CTX_set_app_data(ctx, c);
+	}
+
+	if (c->state == (HAVE_KEY | HAVE_IV)) {
+		/* tell the pregen threads to exit */
+		stop_and_join_pregen_threads(c);
+
+#ifdef __APPLE__
+		/* reset the exit flag */
+		c->exit_flag = FALSE;
+#endif /* __APPLE__ */
+
+		/* Start over getting key & iv */
+		c->state = HAVE_NONE;
+	}
+
+	if (key != NULL) {
+		AES_set_encrypt_key(key, EVP_CIPHER_CTX_key_length(ctx) * 8,
+		    &c->aes_ctx);
+		c->state |= HAVE_KEY;
+	}
+
+	if (iv != NULL) {
+		memcpy(c->aes_counter, iv, AES_BLOCK_SIZE);
+		c->state |= HAVE_IV;
+	}
+
+	if (c->state == (HAVE_KEY | HAVE_IV)) {
+		/* Clear queues */
+		memcpy(c->q[0].ctr, c->aes_counter, AES_BLOCK_SIZE);
+		c->q[0].qstate = KQINIT;
+		for (i = 1; i < numkq; i++) {
+			memcpy(c->q[i].ctr, c->aes_counter, AES_BLOCK_SIZE);
+			ssh_ctr_add(c->q[i].ctr, i * KQLEN, AES_BLOCK_SIZE);
+			c->q[i].qstate = KQEMPTY;
+		}
+		c->qidx = 0;
+		c->ridx = 0;
+
+		/* Start threads */
+		for (i = 0; i < cipher_threads; i++) {
+			pthread_rwlock_wrlock(&c->tid_lock);
+			if (pthread_create(&c->tid[i], NULL, thread_loop, c) != 0)
+				fatal("AES-CTR MT Could not create thread in %s", __FUNCTION__); /* cannot continue with a missing thread */
+			else {
+				if (!c->struct_id)
+					c->struct_id = X++;
+				c->id[i] = i;
+				debug ("AES-CTR MT spawned a thread with id %lu in %s (%d, %d)", c->tid[i], __FUNCTION__, c->struct_id, c->id[i]);
+			}
+			pthread_rwlock_unlock(&c->tid_lock);
+		}
+		pthread_mutex_lock(&c->q[0].lock);
+		while (c->q[0].qstate == KQINIT)
+			pthread_cond_wait(&c->q[0].cond, &c->q[0].lock);
+		pthread_mutex_unlock(&c->q[0].lock);
+	}
+	return 1;
+}
+
+/* this function is no longer used but might prove handy in the future
+ * this comment also applies to ssh_aes_ctr_thread_reconstruction
+ */
+void
+ssh_aes_ctr_thread_destroy(EVP_CIPHER_CTX *ctx)
+{
+	struct ssh_aes_ctr_ctx_mt *c = EVP_CIPHER_CTX_get_app_data(ctx);
+
+	if (c != NULL)	/* no MT state attached to ctx: nothing to stop */
+		stop_and_join_pregen_threads(c);
+}
+
+void
+ssh_aes_ctr_thread_reconstruction(EVP_CIPHER_CTX *ctx)
+{
+	struct ssh_aes_ctr_ctx_mt *c = EVP_CIPHER_CTX_get_app_data(ctx);
+	int i;
+	/* restart the pregen threads of ctx; queue state is left untouched */
+	if (c == NULL)	/* no MT state attached to ctx: nothing to restart */
+		return;
+	for (i = 0; i < cipher_threads; i++) {
+		pthread_rwlock_wrlock(&c->tid_lock);
+		if (pthread_create(&c->tid[i], NULL, thread_loop, c) !=0 )
+			debug("AES-CTR MT could not create thread in %s", __FUNCTION__);
+		else {
+			c->struct_id = X++;
+			c->id[i] = i;
+			debug ("AES-CTR MT spawned a thread with id %lu in %s (%d, %d)", c->tid[i], __FUNCTION__, c->struct_id, c->id[i]);
+		}
+		pthread_rwlock_unlock(&c->tid_lock);
+	}
+}
+
+static int
+ssh_aes_ctr_cleanup(EVP_CIPHER_CTX *ctx)
+{
+	struct ssh_aes_ctr_ctx_mt *c;
+	/* EVP cleanup hook: stop the pregen threads, wipe and free the state. Returns 1 */
+	if ((c = EVP_CIPHER_CTX_get_app_data(ctx)) != NULL) {
+#ifdef CIPHER_THREAD_STATS
+		debug("AES-CTR MT main thread: %u drains, %u waits", c->stats.drains, c->stats.waits);
+#endif
+		if (c->state == (HAVE_KEY | HAVE_IV))	/* init starts threads only in this state */
+			stop_and_join_pregen_threads(c);
+		/* c holds the AES key schedule; a plain memset before free may be optimised away */
+		explicit_bzero(c, sizeof(*c));
+		free(c);
+		EVP_CIPHER_CTX_set_app_data(ctx, NULL);
+	}
+	return 1;
+}
+
+/* <friedl> Return the EVP_CIPHER describing the multi-threaded AES-CTR cipher */
+const EVP_CIPHER *
+evp_aes_ctr_mt(void)
+{
+# if OPENSSL_VERSION_NUMBER >= 0x10100000UL
+	static EVP_CIPHER *aes_ctr;
+	if (aes_ctr != NULL)	/* built by an earlier call: reuse it rather than leak another */
+		return (aes_ctr);
+	aes_ctr = EVP_CIPHER_meth_new(NID_undef, 16/*block*/, 16/*key*/);
+	EVP_CIPHER_meth_set_iv_length(aes_ctr, AES_BLOCK_SIZE);
+	EVP_CIPHER_meth_set_init(aes_ctr, ssh_aes_ctr_init);
+	EVP_CIPHER_meth_set_cleanup(aes_ctr, ssh_aes_ctr_cleanup);
+	EVP_CIPHER_meth_set_do_cipher(aes_ctr, ssh_aes_ctr);
+#  ifndef SSH_OLD_EVP
+	EVP_CIPHER_meth_set_flags(aes_ctr, EVP_CIPH_CBC_MODE |
+	    EVP_CIPH_VARIABLE_LENGTH | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CUSTOM_IV);
+#  endif /*SSH_OLD_EVP*/
+	return (aes_ctr);
+# else /*earlier versions of openssl*/
+	static EVP_CIPHER aes_ctr;
+	memset(&aes_ctr, 0, sizeof(EVP_CIPHER));
+	aes_ctr.nid = NID_undef;
+	aes_ctr.block_size = AES_BLOCK_SIZE;
+	aes_ctr.iv_len = AES_BLOCK_SIZE;
+	aes_ctr.key_len = 16;
+	aes_ctr.init = ssh_aes_ctr_init;
+	aes_ctr.cleanup = ssh_aes_ctr_cleanup;
+	aes_ctr.do_cipher = ssh_aes_ctr;
+#  ifndef SSH_OLD_EVP
+        aes_ctr.flags = EVP_CIPH_CBC_MODE | EVP_CIPH_VARIABLE_LENGTH |
+		EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CUSTOM_IV;
+#  endif /*SSH_OLD_EVP*/
+        return &aes_ctr;
+# endif /*OPENSSL_VERSION_NUMBER*/
+}
+
+#endif /* defined(WITH_OPENSSL) */
diff --git a/cipher.c b/cipher.c
index 639511cf..555a3356 100644
--- a/cipher.c
+++ b/cipher.c
@@ -55,6 +55,9 @@
 #define EVP_CIPHER_CTX void
 #endif
 
+/* for multi-threaded aes-ctr cipher */
+extern const EVP_CIPHER *evp_aes_ctr_mt(void);
+
 struct sshcipher_ctx {
 	int	plaintext;
 	int	encrypt;
@@ -83,7 +86,7 @@ struct sshcipher {
 #endif
 };
 
-static const struct sshcipher ciphers[] = {
+static struct sshcipher ciphers[] = {
 #ifdef WITH_OPENSSL
 #ifndef OPENSSL_NO_DES
 	{ "3des-cbc",		8, 24, 0, 0, CFLAG_CBC, EVP_des_ede3_cbc },
@@ -152,6 +155,29 @@ compression_alg_list(int compression)
 #endif
 }
 
+/* Return the name of the cipher behind cc. Callers use it so that the forced
+ * single- to multi-threaded rekey only happens when a ctr cipher is active.
+ */
+const char *
+cipher_ctx_name(const struct sshcipher_ctx *cc)
+{
+	return cc->cipher->name;
+}
+
+/* To get around sandbox and forking issues with a threaded cipher, the
+ * pre-auth aes-ctr ciphers use the default OpenSSH EVP. Post-auth, this
+ * repoints the three aes-ctr table entries at the EVP from cipher-ctr-mt.
+ */
+#ifdef WITH_OPENSSL
+void
+cipher_reset_multithreaded(void)
+{
+	cipher_by_name("aes128-ctr")->evptype = evp_aes_ctr_mt;
+	cipher_by_name("aes192-ctr")->evptype = evp_aes_ctr_mt;
+	cipher_by_name("aes256-ctr")->evptype = evp_aes_ctr_mt;
+}
+#endif
+
 u_int
 cipher_blocksize(const struct sshcipher *c)
 {
@@ -201,10 +227,10 @@ cipher_ctx_is_plaintext(struct sshcipher_ctx *cc)
 	return cc->plaintext;
 }
 
-const struct sshcipher *
+struct sshcipher *
 cipher_by_name(const char *name)
 {
-	const struct sshcipher *c;
+	struct sshcipher *c;
 	for (c = ciphers; c->name != NULL; c++)
 		if (strcmp(c->name, name) == 0)
 			return c;
diff --git a/cipher.h b/cipher.h
index 1a591cd7..7bcc7fa3 100644
--- a/cipher.h
+++ b/cipher.h
@@ -50,7 +50,9 @@
 struct sshcipher;
 struct sshcipher_ctx;
 
-const struct sshcipher *cipher_by_name(const char *);
+void ssh_aes_ctr_thread_destroy(EVP_CIPHER_CTX *ctx); // defined in cipher-ctr-mt.c
+void ssh_aes_ctr_thread_reconstruction(EVP_CIPHER_CTX *ctx);
+struct sshcipher *cipher_by_name(const char *);
 const char *cipher_warning_message(const struct sshcipher_ctx *);
 int	 ciphers_valid(const char *);
 char	*cipher_alg_list(char, int);
@@ -68,6 +70,8 @@ u_int	 cipher_seclen(const struct sshcipher *);
 u_int	 cipher_authlen(const struct sshcipher *);
 u_int	 cipher_ivlen(const struct sshcipher *);
 u_int	 cipher_is_cbc(const struct sshcipher *);
+void	 cipher_reset_multithreaded(void);
+const char *cipher_ctx_name(const struct sshcipher_ctx *);
 
 u_int	 cipher_ctx_is_plaintext(struct sshcipher_ctx *);
 
diff --git a/packet.c b/packet.c
index 4bd8b4ec..bd1e2c55 100644
--- a/packet.c
+++ b/packet.c
@@ -293,7 +293,7 @@ struct ssh *
 ssh_packet_set_connection(struct ssh *ssh, int fd_in, int fd_out)
 {
 	struct session_state *state;
-	const struct sshcipher *none = cipher_by_name("none");
+	struct sshcipher *none = cipher_by_name("none");
 	int r;
 
 	if (none == NULL) {
@@ -954,6 +954,14 @@ ssh_set_newkeys(struct ssh *ssh, int mode)
 	return 0;
 }
 
+/* Forced rekeying: sets a one-shot flag that ssh_packet_need_rekeying() tests and clears */
+int rekey_requested = 0;
+void
+packet_request_rekeying(void)
+{
+        rekey_requested = 1;
+}
+
 #define MAX_PACKETS	(1U<<31)
 static int
 ssh_packet_need_rekeying(struct ssh *ssh, u_int outbound_packet_len)
@@ -980,6 +988,13 @@ ssh_packet_need_rekeying(struct ssh *ssh, u_int outbound_packet_len)
 	if (state->p_send.packets == 0 && state->p_read.packets == 0)
 		return 0;
 
+        /* used to force rekeying when called for by the none
+         * cipher switch and aes-mt-ctr methods -cjr */
+        if (rekey_requested == 1) {
+                rekey_requested = 0;
+                return 1;
+        }
+	
 	/* Time-based rekeying */
 	if (state->rekey_interval != 0 &&
 	    (int64_t)state->rekey_time + state->rekey_interval <= monotime())
@@ -2707,3 +2722,10 @@ sshpkt_add_padding(struct ssh *ssh, u_char pad)
 	ssh->state->extra_pad = pad;
 	return 0;
 }
+
+/* Return the send-direction cipher context; the aes-ctr swap uses it to read the cipher name */
+void *
+ssh_packet_get_send_context(struct ssh *ssh)
+{
+        return ssh->state->send_context;
+}
diff --git a/packet.h b/packet.h
index c2544bd9..87f43160 100644
--- a/packet.h
+++ b/packet.h
@@ -169,6 +169,9 @@ time_t	 ssh_packet_get_rekey_timeout(struct ssh *);
 
 void	*ssh_packet_get_input(struct ssh *);
 void	*ssh_packet_get_output(struct ssh *);
+void	*ssh_packet_get_receive_context(struct ssh *);
+void	*ssh_packet_get_send_context(struct ssh *);
+void    packet_request_rekeying(void);
 
 /* new API */
 int	sshpkt_start(struct ssh *ssh, u_char type);
diff --git a/readconf.c b/readconf.c
index 724974b7..808a8521 100644
--- a/readconf.c
+++ b/readconf.c
@@ -166,6 +166,7 @@ typedef enum {
 	oHashKnownHosts,
 	oTunnel, oTunnelDevice,
 	oLocalCommand, oPermitLocalCommand, oRemoteCommand,
+	oDisableMTAES,
 	oVisualHostKey,
 	oKexAlgorithms, oIPQoS, oRequestTTY, oIgnoreUnknown, oProxyUseFdpass,
 	oCanonicalDomains, oCanonicalizeHostname, oCanonicalizeMaxDots,
@@ -293,6 +294,7 @@ static struct {
 	{ "localcommand", oLocalCommand },
 	{ "permitlocalcommand", oPermitLocalCommand },
 	{ "remotecommand", oRemoteCommand },
+	{ "disablemtaes", oDisableMTAES },
 	{ "visualhostkey", oVisualHostKey },
 	{ "kexalgorithms", oKexAlgorithms },
 	{ "ipqos", oIPQoS },
@@ -1101,6 +1103,10 @@ parse_time:
 		multistate_ptr = multistate_strict_hostkey;
 		goto parse_multistate;
 
+	case oDisableMTAES:
+		intptr = &options->disable_multithreaded;
+		goto parse_flag;
+
 	case oCompression:
 		intptr = &options->compression;
 		multistate_ptr = multistate_compression;
@@ -2272,6 +2278,7 @@ initialize_options(Options * options)
 	options->revoked_host_keys = NULL;
 	options->fingerprint_hash = -1;
 	options->update_hostkeys = -1;
+	options->disable_multithreaded = -1;
 	options->hostbased_accepted_algos = NULL;
 	options->pubkey_accepted_algos = NULL;
 	options->known_hosts_command = NULL;
@@ -2467,6 +2474,10 @@ fill_default_options(Options * options)
 	if (options->sk_provider == NULL)
 		options->sk_provider = xstrdup("$SSH_SK_PROVIDER");
 #endif
+	if (options->update_hostkeys == -1)
+		options->update_hostkeys = 0;
+	if (options->disable_multithreaded == -1)
+		options->disable_multithreaded = 0;
 
 	/* Expand KEX name lists */
 	all_cipher = cipher_alg_list(',', 0);
diff --git a/readconf.h b/readconf.h
index 2fba866e..7f8f0227 100644
--- a/readconf.h
+++ b/readconf.h
@@ -120,6 +120,7 @@ typedef struct {
 
 	int	enable_ssh_keysign;
 	int64_t rekey_limit;
+	int     disable_multithreaded; /*disable multithreaded aes-ctr*/
 	int	rekey_interval;
 	int	no_host_authentication_for_localhost;
 	int	identities_only;
diff --git a/servconf.c b/servconf.c
index 9695583a..09fe65c9 100644
--- a/servconf.c
+++ b/servconf.c
@@ -92,6 +92,7 @@ initialize_server_options(ServerOptions *options)
 
 	/* Portable-specific options */
 	options->use_pam = -1;
+	options->disable_multithreaded = -1;
 
 	/* Standard Options */
 	options->num_ports = 0;
@@ -424,6 +425,8 @@ fill_default_server_options(ServerOptions *options)
 	}
 	if (options->permit_tun == -1)
 		options->permit_tun = SSH_TUNMODE_NO;
+	if (options->disable_multithreaded == -1)
+		options->disable_multithreaded = 0;
 	if (options->ip_qos_interactive == -1)
 		options->ip_qos_interactive = IPTOS_DSCP_AF21;
 	if (options->ip_qos_bulk == -1)
@@ -487,6 +490,7 @@ typedef enum {
 	sBadOption,		/* == unknown option */
 	/* Portable-specific options */
 	sUsePAM,
+	sDisableMTAES,
 	/* Standard Options */
 	sPort, sHostKeyFile, sLoginGraceTime,
 	sPermitRootLogin, sLogFacility, sLogLevel, sLogVerbose,
@@ -662,6 +666,7 @@ static struct {
 	{ "authorizedprincipalsfile", sAuthorizedPrincipalsFile, SSHCFG_ALL },
 	{ "kexalgorithms", sKexAlgorithms, SSHCFG_GLOBAL },
 	{ "include", sInclude, SSHCFG_ALL },
+	{ "disableMTAES", sDisableMTAES, SSHCFG_ALL },
 	{ "ipqos", sIPQoS, SSHCFG_ALL },
 	{ "authorizedkeyscommand", sAuthorizedKeysCommand, SSHCFG_ALL },
 	{ "authorizedkeyscommanduser", sAuthorizedKeysCommandUser, SSHCFG_ALL },
@@ -2221,6 +2226,10 @@ process_server_config_line_depth(ServerOptions *options, char *line,
 		}
 		break;
 
+	case sDisableMTAES:
+		intptr = &options->disable_multithreaded;
+		goto parse_flag;
+		
 	case sIPQoS:
 		arg = strdelim(&cp);
 		if ((value = parse_ipqos(arg)) == -1)
diff --git a/servconf.h b/servconf.h
index 4f4fd9ba..16875eb6 100644
--- a/servconf.h
+++ b/servconf.h
@@ -199,6 +199,8 @@ typedef struct {
 
 	char   *adm_forced_command;
 
+	int     disable_multithreaded;  /*disable multithreaded aes-ctr*/
+
 	int	use_pam;		/* Enable auth via PAM */
 
 	int	permit_tun;
diff --git a/ssh.c b/ssh.c
index 53330da5..54465b49 100644
--- a/ssh.c
+++ b/ssh.c
@@ -1740,6 +1740,8 @@ control_persist_detach(void)
 	setproctitle("%s [mux]", options.control_path);
 }
 
+extern const EVP_CIPHER *evp_aes_ctr_mt(void);
+
 /* Do fork() after authentication. Used by "ssh -f" */
 static void
 fork_postauth(void)
diff --git a/sshconnect2.c b/sshconnect2.c
index 059c9480..04ca0e2e 100644
--- a/sshconnect2.c
+++ b/sshconnect2.c
@@ -489,6 +489,26 @@ ssh_userauth2(struct ssh *ssh, const char *local_user,
 
 	if (!authctxt.success)
 		fatal("Authentication failed.");
+
+#ifdef WITH_OPENSSL
+	if (options.disable_multithreaded == 0) {
+		/* if we are using aes-ctr there can be issues in either a fork or sandbox
+		 * so the initial aes-ctr is defined to point to the original single process
+		 * evp. After authentication we'll be past the fork and the sandboxed privsep
+		 * so we repoint the define to the multithreaded evp. To start the threads we
+		 * then force a rekey
+		 */
+		const void *cc = ssh_packet_get_send_context(ssh);
+		
+		/* only do this for the ctr cipher. otherwise gcm mode breaks. Don't know why though */
+		if (strstr(cipher_ctx_name(cc), "ctr")) {
+			debug("Single to Multithread CTR cipher swap - client request");
+			cipher_reset_multithreaded();
+			packet_request_rekeying();
+		}
+	}
+#endif
+
 	debug("Authentication succeeded (%s).", authctxt.method->name);
 }
 
diff --git a/sshd.c b/sshd.c
index 6277e6d6..bf3d6e4a 100644
--- a/sshd.c
+++ b/sshd.c
@@ -2273,6 +2273,25 @@ main(int ac, char **av)
 	/* Try to send all our hostkeys to the client */
 	notify_hostkeys(ssh);
 
+#ifdef WITH_OPENSSL
+	if (options.disable_multithreaded == 0) {
+		/* if we are using aes-ctr there can be issues in either a fork or sandbox
+		 * so the initial aes-ctr is defined to point to the original single process
+		 * evp. After authentication we'll be past the fork and the sandboxed privsep
+		 * so we repoint the define to the multithreaded evp. To start the threads we
+		 * then force a rekey
+		 */
+		const void *cc = ssh_packet_get_send_context(the_active_state);
+		
+		/* only rekey if necessary. If we don't do this gcm mode cipher breaks */
+		if (strstr(cipher_ctx_name(cc), "ctr")) {
+			debug("Single to Multithreaded CTR cipher swap - server request");
+			cipher_reset_multithreaded();
+		packet_request_rekeying();
+		}
+	}
+#endif
+
 	/* Start session. */
 	do_authenticated(ssh, authctxt);
 
