單一維度畫圖-長條圖
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# coding:utf-8
import matplotlib.pyplot as plt
import random
import math
from probability import inverse_normal_cdf
from collections import Counter

# A single parenthesised string prints the same text on Python 2 and 3.
print("---單維分群畫圖長條圖---")


def bucketize(point, bucket_size):
    """Round point down to the nearest lower multiple of bucket_size."""
    bucket_index = math.floor(point / bucket_size)
    return bucket_size * bucket_index


def make_histogram(points, bucket_size):
    """Bucket every point and count how many fall into each bucket.

    Returns a Counter mapping bucket start -> number of points.
    """
    tally = Counter()
    for value in points:
        tally[bucketize(value, bucket_size)] += 1
    return tally


def plot_histogram(points, bucket_size, title=""):
    """Draw the bucketed points as a bar chart and display it."""
    counts = make_histogram(points, bucket_size)
    bucket_starts = counts.keys()
    bar_heights = counts.values()
    plt.bar(bucket_starts, bar_heights, width=bucket_size)
    plt.title(title)
    plt.show()


def compare_two_distributions():
    """Plot histograms of a uniform sample, a normal sample and a fixed list.

    The random generator is seeded with 0 so the two random samples are
    reproducible from run to run.
    """
    random.seed(0)

    # 200 integers drawn from -100..100
    uniform = [random.randrange(-100, 101) for _ in range(200)]

    # 200 draws obtained by scaling inverse_normal_cdf of a uniform draw by 57
    normal = [57 * inverse_normal_cdf(random.random()) for _ in range(200)]

    # an additional hand-entered data set
    data1 = [30, 18, 50, 80, 10, 60, 19, 64, 25, 78, 12, 47, 60, 13, 90, 100]

    # plot each data set in turn, always with a bucket width of 10
    charts = (
        (uniform, "Uniform Histogram"),
        (normal, "Normal Histogram"),
        (data1, "My Data"),
    )
    for series, caption in charts:
        plot_histogram(series, 10, caption)


compare_two_distributions()
兩種不同的維度畫圖-散點圖
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 |
from linear_algebra import shape,get_column


def random_normal():
    """Return one random draw from a standard normal distribution."""
    uniform_draw = random.random()
    return inverse_normal_cdf(uniform_draw)


xs = [random_normal() for _ in range(1000)]
ys1 = [x_val + random_normal() / 2 for x_val in xs]
# negate and shift x so the second series plots differently from the first
ys2 = [-x_val + 10 + random_normal() / 2 for x_val in xs]


def scatter():
    """Show ys1 (black) and ys2 (gray) plotted against xs in one chart."""
    series_specs = (
        (ys1, 'black', 'ys1'),
        (ys2, 'gray', 'ys2'),
    )
    for y_values, shade, name in series_specs:
        plt.scatter(xs, y_values, marker='.', color=shade, label=name)
    plt.xlabel('xs')
    plt.ylabel('ys')
    # loc=9 is matplotlib's numeric code for 'upper center'
    plt.legend(loc=9)
    plt.show()


scatter()
由兩種不同的維度畫出的ys1與ys2
兩種以上的維度畫圖
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
def make_scatterplot_matrix():
    """Generate 100 random six-column rows and show them as a scatterplot matrix.

    Off-diagonal cell (i, j) scatters column j on the x-axis against column i
    on the y-axis; diagonal cells only carry the text "series <i>".
    """
    num_points = 100

    def random_row():
        # tweak the formulas below to produce different-looking charts
        first = random_normal()
        second = -5 * first + random_normal()
        third = first + second + 5 * random_normal()
        fourth = 6 if third > -2 else 0
        fifth = 5 * first + random_normal()
        return [first, second, third, fourth, fifth, 0]

    # seed so the generated rows are reproducible
    random.seed(0)
    rows = [random_row() for _ in range(num_points)]

    _, num_columns = shape(rows)
    fig, ax = plt.subplots(num_columns, num_columns)
    bottom_row = num_columns - 1

    for i in range(num_columns):
        for j in range(num_columns):
            panel = ax[i][j]

            if i == j:
                # diagonal: show the series name instead of a plot
                panel.annotate("series " + str(i), (0.5, 0.5),
                               xycoords='axes fraction',
                               ha="center", va="center")
            else:
                # column j on the x-axis versus column i on the y-axis
                panel.scatter(get_column(rows, j), get_column(rows, i))

            # only the bottom row keeps its x-axis, only the left column its y-axis
            if i < bottom_row:
                panel.xaxis.set_visible(False)
            if j > 0:
                panel.yaxis.set_visible(False)

    # The bottom-right and top-left cells hold only text, so their axis limits
    # are wrong; copy the limits from a neighbouring cell that has real data.
    ax[-1][-1].set_xlim(ax[0][-1].get_xlim())
    ax[0][0].set_ylim(ax[0][1].get_ylim())

    plt.show()


make_scatterplot_matrix()
兩種以上維度畫圖
資料檢查
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
import csv
import dateutil
# `import dateutil` alone is not guaranteed to load the parser submodule,
# so import it explicitly before using dateutil.parser.parse below.
import dateutil.parser

data = []       # rows in which every field parsed successfully
bad_rows = []   # rows in which at least one field failed to parse


def try_or_none(f):
    """Wrap f so that it returns None instead of raising.

    Assumes f takes exactly one argument. Only Exception subclasses are
    converted to None; KeyboardInterrupt and SystemExit still propagate.
    """
    def f_or_none(x):
        try:
            return f(x)
        except Exception:
            return None
    return f_or_none


def parse_row(input_row, parsers):
    """Apply the matching parser to each value of input_row.

    A parser of None leaves the value unchanged; a parser that raises
    yields None for that position. Values and parsers are paired with zip,
    so extra items on the longer side are dropped.
    """
    return [try_or_none(parser)(value) if parser is not None else value
            for value, parser in zip(input_row, parsers)]


def parse_rows_with(reader, parsers):
    """Wrap a reader so that the parsers are applied to each of its rows."""
    for row in reader:
        yield parse_row(row, parsers)


# "rb": the Python 2 csv module expects files opened in binary mode
with open("comma_delimited_stock_prices.csv", "rb") as f:
    reader = csv.reader(f)
    for line in parse_rows_with(reader, [dateutil.parser.parse, None, float]):
        # rows with a field that failed to parse are kept out of data and
        # remembered separately so they can be reported afterwards
        if any(x is None for x in line):
            bad_rows.append(line)
        else:
            data.append(line)

print(data)

# Print the rows that failed to parse. Iterating over `data` here could never
# print anything, because such rows were already excluded from it above.
for row in bad_rows:
    print(row)
將不符合的資料印出來
資料量測單位統一,若資料不統一,可能發生以下幾種狀況
以下是ABC 3人的身高體重
.
Height (inches) Height (centimeters) Weight
A 63 inches 160 cm 150 pounds
B 67 inches 170.2 cm 160 pounds
C 70 inches 177.8 cm 171 pounds
.
若身高單位為英吋,以 distance 方法計算時,B 會較靠近 A
a_to_b = distance([63, 150], [67, 160]) # A與B的距離 10.77
a_to_c = distance([63, 150], [70, 171]) # A與C的距離 22.14
b_to_c = distance([67, 160], [70, 171]) # B與C的距離 11.40
.
若身高單位為公分,以 distance 方法計算時,B 會較靠近 C
a_to_b = distance([160, 150], [170.2, 160]) # A與B的距離 14.28
a_to_c = distance([160, 150], [177.8, 171]) # A與C的距離 27.53
b_to_c = distance([170.2, 160], [177.8, 171]) # B與C的距離 13.37
.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
from linear_algebra import make_matrix
from statistics import standard_deviation, mean


def scale(data_matrix):
    """Return (means, stdevs): the mean and standard deviation of each column."""
    num_rows, num_cols = shape(data_matrix)
    means = []
    stdevs = []
    for j in range(num_cols):
        column = get_column(data_matrix, j)
        means.append(mean(column))
        stdevs.append(standard_deviation(column))
    return means, stdevs


def rescale(data_matrix):
    """Rescale the data so each column has mean 0 and standard deviation 1.

    Columns whose standard deviation is not positive are left unchanged.
    """
    means, stdevs = scale(data_matrix)

    def rescaled(i, j):
        spread = stdevs[j]
        if not spread > 0:
            # no deviation in this column: keep the raw value
            return data_matrix[i][j]
        return (data_matrix[i][j] - means[j]) / spread

    num_rows, num_cols = shape(data_matrix)
    # presumably make_matrix calls rescaled(i, j) for every cell -- confirm in linear_algebra
    return make_matrix(num_rows, num_cols, rescaled)


data = [[1, 20, 2],
        [1, 30, 3],
        [1, 40, 4]]

# "%s %s" reproduces the single space a Python 2 print statement inserts
# between comma-separated items, so the output text is unchanged.
print("%s %s" % ("original: ", data))
print("%s %s" % ("scale: ", scale(data)))
print("%s %s" % ("rescaled: ", rescale(data)))
備註:2017/05/04 計算方法分析與設計 課堂筆記