从每条 SSL 流的 payload 中取 1024 字节,然后使用 VAE(变分自编码器)检测异常的 SSL 流。
代码如下:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import tensorflow as tf
import tflearn
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import pandas as pd
from sklearn.metrics import average_precision_score, recall_score, precision_score, f1_score
import os
from PIL import Image


def report_evaluation_metrics(y_true, y_pred):
    """Print average precision, precision, recall and F1 for binary labels.

    Label 1 is treated as the positive (anomalous) class.
    """
    average_precision = average_precision_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, labels=[0, 1], pos_label=1)
    recall = recall_score(y_true, y_pred, labels=[0, 1], pos_label=1)
    f1 = f1_score(y_true, y_pred, labels=[0, 1], pos_label=1)
    print('Average precision-recall score: {0:0.2f}'.format(average_precision))
    print('Precision: {0:0.2f}'.format(precision))
    print('Recall: {0:0.2f}'.format(recall))
    print('F1: {0:0.2f}'.format(f1))


LABELS = ["Normal", "Fraud"]


def plot_confusion_matrix(y_true, y_pred):
    """Show the confusion matrix of y_true vs. y_pred as an annotated heatmap."""
    conf_matrix = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(12, 12))
    sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d")
    plt.title("Confusion matrix")
    plt.ylabel('True class')
    plt.xlabel('Predicted class')
    plt.show()


def plot_training_history(history):
    """Plot the 'loss' and 'val_loss' series of `history`; no-op when it is None."""
    if history is None:
        return
    plt.plot(history['loss'])
    plt.plot(history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper right')
    plt.show()


def visualize_anomaly(y_true, reconstruction_error, threshold):
    """Scatter the reconstruction error per sample, grouped by true class.

    A horizontal red line marks `threshold`. Class 1 is labelled "Fraud",
    every other class "Normal".
    """
    error_df = pd.DataFrame({'reconstruction_error': reconstruction_error,
                             'true_class': y_true})
    print(error_df.describe())
    groups = error_df.groupby('true_class')
    fig, ax = plt.subplots()
    for name, group in groups:
        ax.plot(group.index, group.reconstruction_error, marker='o', ms=3.5, linestyle='',
                label="Fraud" if name == 1 else "Normal")
    ax.hlines(threshold, ax.get_xlim()[0], ax.get_xlim()[1], colors="r", zorder=100,
              label='Threshold')
    ax.legend()
    plt.title("Reconstruction error for different classes")
    plt.ylabel("Reconstruction error")
    plt.xlabel("Data point index")
    plt.show()


def visualize_reconstruction_error(reconstruction_error, threshold):
    """Scatter the reconstruction error of every sample with the threshold line."""
    plt.plot(reconstruction_error, marker='o', ms=3.5, linestyle='', label='Point')
    plt.hlines(threshold, xmin=0, xmax=len(reconstruction_error) - 1, colors="r", zorder=100,
               label='Threshold')
    plt.legend()
    plt.title("Reconstruction error")
    plt.ylabel("Reconstruction error")
    plt.xlabel("Data point index")
    plt.show()


def get_images():
    """Load every file in the "png2" directory as a grayscale image in [0, 1].

    Returns:
        (images, files): `images` has shape (count, 32, 32, 1); `files` is an
        array of the corresponding file names, in the same order.

    Raises:
        ValueError: from the final reshape if any image is not 32x32 pixels
        (the resize step is intentionally left disabled).
    """
    image_list = []
    files = []
    img_dir = "png2"
    for file in os.listdir(img_dir):
        path = os.path.join(img_dir, file)
        if not os.path.isfile(path):
            print("{} is not a file!!!".format(path))
            continue
        # The context manager closes the underlying file handle; the original
        # left one handle open per image.
        with Image.open(path) as raw_image:
            temp_image = raw_image.convert('L')
            # temp_image = temp_image.resize((32, 32), Image.ANTIALIAS)
            temp_image = np.asarray(temp_image) / 255.0
        image_list.append(temp_image)
        files.append(file)
    image_list = np.asarray(image_list)
    input_image = image_list.reshape([len(files), 32, 32, 1])
    return input_image, np.array(files)


def preprocess_data(csv_data):
    """Split a credit-card style DataFrame into a feature matrix and labels.

    Drops the 'Class' and 'Time' columns, standardises 'Amount', and returns
    (features, y_true) as NumPy arrays, where y_true is the 'Class' column.
    Not used by main(); kept from the tabular experiment this script grew from.
    """
    credit_card_data = csv_data.drop(labels=['Class', 'Time'], axis=1)
    credit_card_data['Amount'] = StandardScaler().fit_transform(
        credit_card_data['Amount'].values.reshape(-1, 1)).ravel()
    # print(credit_card_data.head())
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent that works on both old and new versions.
    credit_card_np_data = credit_card_data.values
    y_true = csv_data['Class'].values
    return credit_card_np_data, y_true


# encoder
def encode(input_x, encoder_hidden_dim, latent_dim):
    """Build the VAE encoder: one ReLU hidden layer, then two linear heads.

    Returns (mu, logvar), each of width `latent_dim`.

    Keras equivalent:
        inputs = Input(shape=input_shape, name='encoder_input')
        x = Dense(intermediate_dim, activation='relu')(inputs)
        z_mean = Dense(latent_dim, name='z_mean')(x)
        z_log_var = Dense(latent_dim, name='z_log_var')(x)
    """
    encoder = tflearn.fully_connected(input_x, encoder_hidden_dim, activation='relu')
    mu_encoder = tflearn.fully_connected(encoder, latent_dim, activation='linear')
    logvar_encoder = tflearn.fully_connected(encoder, latent_dim, activation='linear')
    return mu_encoder, logvar_encoder


# decoder
def decode(z, decoder_hidden_dim, input_dim):
    """Build the VAE decoder: one ReLU hidden layer, then a linear output layer.

    Returns the reconstruction x_hat of width `input_dim`.

    Keras equivalent (note that it uses a sigmoid output, this one is linear):
        latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
        x = Dense(intermediate_dim, activation='relu')(latent_inputs)
        outputs = Dense(original_dim, activation='sigmoid')(x)
    """
    decoder = tflearn.fully_connected(z, decoder_hidden_dim, activation='relu')
    x_hat = tflearn.fully_connected(decoder, input_dim, activation='linear')
    return x_hat


# sampler
def sample(mu, logvar):
    """Draw z = mu + exp(logvar / 2) * epsilon with epsilon ~ N(0, I).

    This is the reparameterization trick: instead of sampling from Q(z|X)
    directly, sample the noise and shift/scale it.

    Keras equivalent:
        def sampling(args):
            z_mean, z_log_var = args
            batch = K.shape(z_mean)[0]
            dim = K.int_shape(z_mean)[1]
            # by default, random_normal has mean=0 and std=1.0
            epsilon = K.random_normal(shape=(batch, dim))
            return z_mean + K.exp(0.5 * z_log_var) * epsilon
        z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])
    """
    epsilon = tf.random_normal(tf.shape(logvar), dtype=tf.float32, name='epsilon')
    z = mu + tf.exp(logvar / 2) * epsilon
    return z


# loss function(regularization)
def calculate_regularization_loss(mu, logvar):
    """Return the per-sample KL divergence term, summed over axis 1."""
    kl_divergence = -0.5 * tf.reduce_sum(1 + logvar - tf.square(mu) - tf.exp(logvar),
                                         reduction_indices=1)
    return kl_divergence


# loss function(reconstruction)
def calculate_reconstruction_loss(x_hat, input_x):
    """Return the mean-square reconstruction loss between x_hat and input_x."""
    # NOTE(review): tflearn's mean_square appears to average over every element,
    # while the KL term is a per-sample sum over latent dims, so the KL term may
    # dominate the total loss. The Keras VAE example scales the reconstruction
    # term by original_dim - confirm and consider doing the same.
    mse = tflearn.objectives.mean_square(x_hat, input_x)
    return mse


def main():
    """Train (or load) the VAE on the SSL payload images and print the anomalies.

    Data is cached in data.npz and the model in model.tflearn. Samples whose
    reconstruction error falls in the top `anomaly_ratio` fraction are printed
    together with their error.
    """
    anomaly_ratio = 0.0001
    estimated_negative_sample_ratio = 1 - anomaly_ratio
    print(estimated_negative_sample_ratio)

    data_file = "data.npz"
    if os.path.exists(data_file):
        print("load data file data.npz!!!")
        data = np.load(data_file)
        X, files = data['X'], data['files']
    else:
        X, files = get_images()
        np.savez(data_file, X=X, files=files)

    # Flatten each 32x32 image into a 1024-dim vector.
    X = X.reshape([len(X), 32*32])
    # Unsupervised: the inputs are also the targets, so only X is split.
    trainX, testX = train_test_split(X, test_size=0.05, random_state=42)

    print("sample data: X:{} ".format(X[:3]))
    print(X.shape)

    # detect anomaly for the test data
    Ypred = []

    # Params
    original_dim = len(X[0])
    print("dim: {}".format(original_dim))

    # Alternative that was tried first: a plain (non-variational) auto encoder.
    #   encoder = tflearn.input_data(shape=[None, original_dim])
    #   encoder = tflearn.fully_connected(encoder, 8)
    #   encoder = tflearn.fully_connected(encoder, 4)
    #   decoder = tflearn.fully_connected(encoder, 8)
    #   decoder = tflearn.fully_connected(decoder, original_dim, activation='sigmoid')
    #   net = tflearn.regression(decoder, optimizer='adam', learning_rate=0.001,
    #                            loss='mean_square', metric=None)
    #   training_model = tflearn.DNN(net, tensorboard_verbose=0)
    #   training_model.fit(X, X, n_epoch=100, validation_set=(testX, testX),
    #                      run_id="auto_encoder", batch_size=256)

    hidden_dim = 256
    latent_dim = 2

    input_x = tflearn.input_data(shape=(None, original_dim), name='input_x')
    mu, logvar = encode(input_x, hidden_dim, latent_dim)
    z = sample(mu, logvar)
    x_hat = decode(z, hidden_dim, original_dim)

    regularization_loss = calculate_regularization_loss(mu, logvar)
    reconstruction_loss = calculate_reconstruction_loss(x_hat, input_x)
    target = tf.reduce_mean(tf.add(regularization_loss, reconstruction_loss))

    net = tflearn.regression(x_hat, optimizer='rmsprop', learning_rate=0.001,
                             loss=target, metric=None, name='target_out')

    # Train the VAE, or reuse a previously saved model.
    training_model = tflearn.DNN(net, tensorboard_verbose=0)
    model_file = "model.tflearn"
    if os.path.exists(model_file + ".meta"):
        print("Load a model from local!!!")
        training_model.load(model_file)
    else:
        training_model.fit({'input_x': trainX}, {'target_out': trainX}, n_epoch=30,
                           validation_set=(testX, testX), batch_size=256, run_id="vae")
        training_model.save(model_file)

    # A generator that re-uses the decoding layers could be built like this
    # (only needed to generate new data, not for anomaly detection):
    #   input_noise = tflearn.input_data(shape=[None, latent_dim], name='input_noise')
    #   decoder = tflearn.fully_connected(input_noise, hidden_dim, activation='relu',
    #                                     scope='decoder_h', reuse=True)
    #   decoder = tflearn.fully_connected(decoder, original_dim, activation='sigmoid',
    #                                     scope='decoder_out', reuse=True)
    #   generator_model = tflearn.DNN(decoder, session=training_model.session)

    print("training sample predict:")
    print(training_model.predict(X[:3]))

    reconstruction_error = []
    anomaly_information, adjusted_threshold = get_anomaly(training_model, X,
                                                          estimated_negative_sample_ratio)
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        if is_anomaly:
            print(files[idx], dist)
        predicted_label = 1 if is_anomaly else 0
        Ypred.append(predicted_label)
        reconstruction_error.append(dist)

    # No ground-truth labels exist for this data set, so the supervised reports
    # stay disabled:
    # report_evaluation_metrics(Y, Ypred)
    # plot_training_history(history)
    # visualize_anomaly(X, reconstruction_error, adjusted_threshold)
    # plot_confusion_matrix(Y, Ypred)


def get_anomaly(model, data, estimated_negative_sample_ratio):
    """Score every sample by reconstruction error and flag the largest ones.

    The score is the Euclidean (L2) distance between a sample and its
    reconstruction, e.g.
        np.linalg.norm(np.array([[1,1,1],[2,2,2]]) - np.zeros((2, 3)), axis=-1)
        -> array([1.73205081, 3.46410162])
    The threshold is the score found at the `estimated_negative_sample_ratio`
    quantile of the sorted scores.

    Returns:
        (pairs, threshold): `pairs` is an iterable of (is_anomaly, score) in
        the same order as `data`; `threshold` is the cut-off score.
    """
    target_data = model.predict(data)
    scores = np.linalg.norm(data - target_data, axis=-1)
    sorted_scores = np.sort(scores)
    # Clamp the index: a ratio of 1.0 would otherwise point one past the end.
    cut_point = min(int(estimated_negative_sample_ratio * len(sorted_scores)),
                    len(sorted_scores) - 1)
    threshold = sorted_scores[cut_point]
    print('estimated threshold is ' + str(threshold))
    return zip(scores >= threshold, scores), threshold


if __name__ == '__main__':
    main()
然后出了一大堆误报,蛋疼!!!
estimated threshold is 15.532261382449361('tls-SSL-HTTPS-Network-Infrastructure-10.2.211.75-61.174.11.239-6df25bceb243184a00000000.png', '15.589723319043824')('tls-SSL-HTTPS-Network-Infrastructure-10.128.200.15-8.253.246.123-49d05bce2072185500000000.png', '15.556322765856306')('tls-SSL-HTTPS-Network-Infrastructure-10.2.6.172-112.120.33.141-2ed75bcec42b187a00000000.png', '15.544285847781069')('tls-SSL-HTTPS-Network-Infrastructure-10.0.96.216-124.127.247.234-d2505bcebc00187400000000.png', '15.536370031106207')('tls-SSL-HTTPS-Network-Infrastructure-10.128.4.53-123.59.148.55-2f405bce0fcf180100000000.png', '15.545930457909789')('tls-SSL-HTTPS-Network-Infrastructure-10.2.5.105-124.202.189.145-7cea5bceb99f231a00000000.png', '15.542118064275328')('tls-SSL-HTTPS-Network-Infrastructure-10.2.5.105-124.202.189.104-c4615bce7b30181400000000.png', '15.643245500742289')('tls-SSL-HTTPS-Network-Infrastructure-10.2.84.163-58.205.212.208-fc635bce84dc237100000000.png', '15.53807329897178')('tls-SSL-HTTPS-Network-Infrastructure-10.2.69.67-88.208.61.141-88ba5bce082c187400000000.png', '15.578754079909734')
难道发现恶意的 SSL 流就这么难?接着换成 CNN autoencoder 试试,直接将 1024 字节的 SSL 流看成 32*32 的图像进行处理:
on_server = False
if on_server:
    # Select the non-interactive backend before pyplot is imported so figures
    # can be saved on a machine without a display.
    import matplotlib
    matplotlib.use('Agg')
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, UpSampling2D
from keras.models import Model
from keras.models import load_model
import matplotlib.pyplot as plt
from keras import backend as K
import os
from PIL import Image
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import IsolationForest


def get_images():
    """Load every file in the "png2" directory as a grayscale image in [0, 1].

    Returns:
        (images, files): `images` has shape (count, 32, 32, 1); `files` is an
        array of the corresponding file names, in the same order.

    Raises:
        ValueError: from the final reshape if any image is not 32x32 pixels
        (the resize step is intentionally left disabled).
    """
    image_list = []
    files = []
    img_dir = "png2"
    for file in os.listdir(img_dir):
        path = os.path.join(img_dir, file)
        if not os.path.isfile(path):
            print("{} is not a file!!!".format(path))
            continue
        # The context manager closes the underlying file handle; the original
        # left one handle open per image.
        with Image.open(path) as raw_image:
            temp_image = raw_image.convert('L')
            # temp_image = temp_image.resize((32, 32), Image.ANTIALIAS)
            temp_image = np.asarray(temp_image) / 255.0
        image_list.append(temp_image)
        files.append(file)
    image_list = np.asarray(image_list)
    input_image = image_list.reshape([len(files), 32, 32, 1])
    return input_image, np.array(files)


def get_cnn_model():
    """Build and compile the convolutional auto encoder for 32x32x1 inputs.

    Encoder: two (Conv2D 3x3 -> ReLU -> 2x2 max-pool) stages with 16 and 2 filters.
    Decoder: two (Conv2D 3x3 -> ReLU -> 2x upsampling) stages with 2 and 16
    filters, followed by a 1-filter Conv2D with sigmoid output.
    """
    model = Sequential()

    # 1st convolution layer: 16 filters of size 3x3
    model.add(Conv2D(16, (3, 3), padding='same', input_shape=(32, 32, 1)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))

    # 2nd convolution layer: 2 filters of size 3x3
    model.add(Conv2D(2, (3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))

    # -------------------------

    # 3rd convolution layer: 2 filters of size 3x3
    model.add(Conv2D(2, (3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(UpSampling2D((2, 2)))

    # 4th convolution layer
    model.add(Conv2D(16, (3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(UpSampling2D((2, 2)))

    # -------------------------

    model.add(Conv2D(1, (3, 3), padding='same'))
    model.add(Activation('sigmoid'))

    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()

    model.compile(optimizer='adadelta', loss='binary_crossentropy')
    return model


data_file = "data.npz"
if os.path.exists(data_file):
    print("load data file data.npz!!!")
    data = np.load(data_file)
    X, files = data['X'], data['files']
else:
    X, files = get_images()
    np.savez(data_file, X=X, files=files)

# Unsupervised: the inputs are also the targets, so only X is split.
x_train, x_test = train_test_split(X, test_size=0.05, random_state=42)

model_file = 'model.h5'
if os.path.exists(model_file):
    print("found model, load it from disk!!!")
    model = load_model('model.h5')
else:
    model = get_cnn_model()

# resume training (a model loaded from disk is trained further as well)
model.fit(x_train, x_train, epochs=30, batch_size=1024, validation_data=(x_test, x_test))
model.save(model_file)

restored_imgs = model.predict(x_test)
print("just see some test:")
# Show up to five originals next to their reconstructions; bounded by the size
# of the test split so a tiny data set does not raise IndexError.
for i in range(min(5, len(x_test))):
    print(x_test[i])
    plt.imshow(x_test[i].reshape(32, 32))
    plt.gray()
    if on_server:
        plt.savefig("test-{}.png".format(i))
    else:
        plt.show()
    print(x_test[i])
    print(restored_imgs[i])
    plt.imshow(restored_imgs[i].reshape(32, 32))
    plt.gray()
    if on_server:
        plt.savefig("test-{}-restored.png".format(i))
    else:
        plt.show()
    print("----------------------------")

for i, layer in enumerate(model.layers):
    print(i, ". ", layer.output.get_shape())
# Expected output shapes for the 32x32x1 input used here (padding='same'):
#   0 conv (?, 32, 32, 16)    1 relu (?, 32, 32, 16)    2 pool (?, 16, 16, 16)
#   3 conv (?, 16, 16, 2)     4 relu (?, 16, 16, 2)     5 pool (?, 8, 8, 2)
#   6 conv (?, 8, 8, 2)       7 relu (?, 8, 8, 2)       8 up   (?, 16, 16, 2)
#   9 conv (?, 16, 16, 16)   10 relu (?, 16, 16, 16)   11 up   (?, 32, 32, 16)
#  12 conv (?, 32, 32, 1)    13 sigmoid (?, 32, 32, 1)

# layers[7] is activation_3 (Activation); its output is used as the compressed
# representation (two 8x8 feature maps per sample).
get_3rd_layer_output = K.function([model.layers[0].input], [model.layers[7].output])

# Encoding all of X in a single call (get_3rd_layer_output([X])[0]) was the
# first attempt; the data is now pushed through in chunks of N samples.
chunks = []
N = 3000
for i in range(0, len(X), N):
    chunk_data = X[i:i+N]
    print("chunk data length:", len(chunk_data))
    compressed = get_3rd_layer_output([chunk_data])[0]
    # Flatten the feature maps of each sample; -1 avoids hard-coding the
    # 8 * 8 * 2 bottleneck size.
    chunk_compressed = compressed.reshape(len(chunk_data), -1)
    chunks.append(chunk_compressed)
compressed = np.concatenate(chunks)
assert len(compressed) == len(files)

print("some sample data compressed:")
print(compressed[:3])

rng = np.random.RandomState(42)
# clf = IsolationForest(max_samples=10*2, random_state=rng)
# clf = IsolationForest(max_features=5)
clf = IsolationForest(max_samples="auto", random_state=rng, contamination=0.0001)
clf.fit(compressed)
pred_y = clf.predict(compressed)

cnt = 0
for i, y in enumerate(pred_y):
    # IsolationForest.predict marks outliers with -1.
    if y == -1:
        print("bad data:", files[i])
        cnt += 1
        plt.imshow(X[i].reshape(32, 32))
        plt.gray()
        if on_server:
            plt.savefig("anom-{}.png".format(files[i]))
        else:
            plt.show()
print("cnt:{}".format(cnt))
然后检测的结果:
anom-tls-SSL-HTTPS-Network-Infrastructure-10.0.141.22-140.143.254.151-7a945bce6580183800000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.152.184-139.198.13.247-9b575bce61aa183900000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.152.229-54.243.242.217-5d035bce7ae2180100000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.153.170-58.205.220.35-90945bce62db237100000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.153.84-120.132.53.247-56955bce9e60181700000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.156.96-120.27.81.165-d1015bcea15c183400000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.158.185-111.30.138.183-18645bcea2de182f00000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.164.168-175.102.18.142-d42a5bce5eda180400000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.169.126-117.78.58.102-06b15bce6c0b182200000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.204.20-59.37.96.226-394a5bceafcd234800000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.210.113-207.148.117.221-5cac5bce7b51234600000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.210.126-151.101.76.223-eeb55bce6578233900000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.210.50-47.107.215.152-192d5bce7f3d237600000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.211.177-128.199.185.96-c0425bce77aa232900000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.230.241-180.153.222.195-301b5bce96aa185900000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.2.33-47.92.124.196-2cba5bd1b021185900000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.35.34-59.110.185.99-43975bcea358234100000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.40.147-203.100.92.177-ef7a5bce82f2181300000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.42.152-23.198.101.111-ddce5bce9021185200000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.42.216-67.216.207.162-19fc5bce7
12c184000000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.47.101-54.222.139.132-87465bceab54232b00000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.47.157-120.55.104.178-c6f25bce6358232100000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.48.226-59.37.96.226-0a5c5bce7a7a182c00000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.48.57-47.100.42.159-19995bce807b232e00000000.png.pnganom-tls-SSL-HTTPS-Network-Infrastructure-10.0.53.122-115.27.243.5-5bcb5bce8151183b00000000.png.png
没有查到几个是恶意的。。。真是有种想吐血的感觉!!!
接下来尝试用 GAN 进行异常检测,不过要换一个思路:不再走完全无监督的路线,而是先过滤出异常的 SSL 流,然后使用 GAN 来检测与之类似的异常。