- Data splitting
Create email.txt:
```
aa@gmail.com
bb@yahoo.com
cc@gm.nfu.edu.tw
a1@gmail.com
a2@gmail.com
a3@gmail.com
a4@gmail.com
```
We will split each line on '@' to get the address's domain.
```python
# use string splitting to pull out the domain name
from collections import Counter

def get_domain(email_address):
    """split on '@' and return the last piece"""
    return email_address.lower().split("@")[-1]

with open('email.txt', 'r') as f:
    domain_counts = Counter(get_domain(line.strip())
                            for line in f
                            if "@" in line)

print domain_counts
```
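With the seven addresses above, five are gmail.com and the other two domains appear once each, so the script should print a Counter along these lines (the order of the tied domains can vary):

```
Counter({'gmail.com': 5, 'yahoo.com': 1, 'gm.nfu.edu.tw': 1})
```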
First define an output function:
```python
import csv

def process(date, symbol, price):
    print date, symbol, price

# split on tabs
print "---tab delimited stock prices---"
with open('tab_delimited_stock_prices.txt', 'rb') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in reader:
        date = row[0]
        symbol = row[1]
        closing_price = float(row[2])
        process(date, symbol, closing_price)
```
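The notes never show the input file itself. Assuming it holds date, symbol, and closing price separated by real tab characters with no header row (the rows below are made-up sample data), tab_delimited_stock_prices.txt would look like:

```
6/20/2014	AAPL	90.91
6/20/2014	MSFT	41.68
6/20/2014	FB	64.5
```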
```python
# split on colons
print "---colon delimited stock prices---"
with open('colon_delimited_stock_prices.txt', 'rb') as f:
    reader = csv.DictReader(f, delimiter=':')
    for row in reader:
        # the file has a header row, so each field can be read by name
        date = row["date"]
        symbol = row["symbol"]
        closing_price = float(row["closing_price"])
        process(date, symbol, closing_price)
```
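DictReader takes the field names from the first row, so this file needs a header line; a hypothetical colon_delimited_stock_prices.txt (sample data, not from the notes) might be:

```
date:symbol:closing_price
6/20/2014:AAPL:90.91
6/20/2014:MSFT:41.68
```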
# Write the file with ',' as the delimiter; tab ('\t') also works
```python
print "---writing out comma_delimited_stock_prices.txt---"
today_prices = { 'Chinese' : 90.5, 'English' : 41.68, 'Math' : 64.5 }
with open('comma_delimited_stock_prices.txt', 'wb') as f:  # output file name
    writer = csv.writer(f, delimiter=',')  # '\t' would also work
    for stock, price in today_prices.items():
        writer.writerow([stock, price])
```
Result of writing the file:
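The file should end up with one subject per row, though the row order can vary because Python dicts are unordered:

```
Chinese,90.5
English,41.68
Math,64.5
```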
- Web scraping: requests, BeautifulSoup, re
# Fetch a web page
```python
from bs4 import BeautifulSoup
import requests

print "BeautifulSoup"
html = requests.get("http://www.nfu.edu.tw").text
soup = BeautifulSoup(html, 'html5lib')  # name a parser explicitly to avoid the bs4 warning
print soup
```
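Printing the whole soup is rarely the goal; BeautifulSoup is mostly used to navigate the tree. A minimal sketch of the common calls follows — the tag names are generic, and the 'content' class is a hypothetical example, not something known to be on www.nfu.edu.tw:

```python
first_p = soup.find('p')          # first <p> tag (same as soup.p)
if first_p is not None:
    print first_p.text            # the text inside that tag

links = soup('a')                 # shorthand for soup.find_all('a')
urls = [a.get('href') for a in links if a.get('href')]
print len(urls), "links"

# find tags by CSS class, e.g. every <div class="content"> (hypothetical class)
content_divs = soup('div', 'content')
```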
# Extracting tags and using regular expressions
```python
import re

def book_info(td):
    """given a BeautifulSoup <td> Tag representing a book,
    extract the book's details and return a dict"""
    title = td.find("div", "thumbheader").a.text
    by_author = td.find('div', 'AuthorName').text
    authors = [x.strip() for x in re.sub("^By ", "", by_author).split(",")]
    isbn_link = td.find("div", "thumbheader").a.get("href")
    isbn = re.match("/product/(.*)\.do", isbn_link).groups()[0]
    date = td.find("span", "directorydate").text.strip()
    return {
        "title": title,
        "authors": authors,
        "isbn": isbn,
        "date": date
    }

# decide whether a <td> holds a video rather than a book
from time import sleep

def is_video(td):
    """it's a video if it has exactly one pricelabel, and if
    the stripped text inside that pricelabel starts with 'Video'"""
    pricelabels = td('span', 'pricelabel')
    return (len(pricelabels) == 1 and
            pricelabels[0].text.strip().startswith("Video"))

# crawl all 31 pages, resting 3 seconds after each one
def scrape(num_pages=31):
    base_url = "http://shop.oreilly.com/category/browse-subjects/" + \
               "data.do?sortby=publicationDate&page="
    books = []
    for page_num in range(1, num_pages + 1):
        print "souping page", page_num
        url = base_url + str(page_num)
        soup = BeautifulSoup(requests.get(url).text, 'html5lib')
        for td in soup('td', 'thumbtext'):
            if not is_video(td):
                books.append(book_info(td))
        # now be a good citizen and respect the robots.txt!
        sleep(3)
    print len(books)  # how many books we found
    return books

scrape()
```
PS: html5lib has to be installed first.
In PyCharm's Terminal, run conda install html5lib.
Once it is installed, run the script.
In total there are 732 books.
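Given the print statements in scrape(), the console output is one "souping" line per page followed by the final count, roughly:

```
souping page 1
souping page 2
...
souping page 31
732
```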
```python
# compute the year out of each book's date
def get_year(book):
    """book["date"] looks like 'November 2014' so we need to
    split on the space and then take the second piece"""
    return int(book["date"].split()[1])

# plot the number of books per year with matplotlib
from matplotlib import pyplot as plt

def plot_years(plt, books):
    # keep books published up to 2017 (when these notes were taken)
    year_counts = Counter(get_year(book) for book in books
                          if get_year(book) <= 2017)
    years = sorted(year_counts)
    book_counts = [year_counts[year] for year in years]
    plt.bar([x - 0.5 for x in years], book_counts)
    plt.xlabel("year")
    plt.ylabel("# of data books")
    plt.title("Data is Big!")
    plt.show()

plot_years(plt, scrape())
```
# Parsing JSON
```python
import json

serialized = """{ "title" : "Data Science Book",
                  "author" : "Joel Grus",
                  "publicationYear" : 2014,
                  "topics" : [ "data", "science", "data science"] }"""

# parse the JSON to create a Python object
deserialized = json.loads(serialized)
if "data science" in deserialized["topics"]:
    print deserialized
```
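Going the other way, json.dumps serializes a Python object back into a JSON string; a small sketch (the record dict is just an illustration):

```python
# serialize a Python dict back into a JSON string
record = { "title" : "Data Science Book", "publicationYear" : 2014 }
print json.dumps(record)            # compact one-line JSON
print json.dumps(record, indent=2)  # pretty-printed, 2-space indent
```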
# Fetch my repositories through the GitHub API
```python
endpoint = "https://api.github.com/users/mauriceHsiao/repos"
repos = json.loads(requests.get(endpoint).text)
for rows in repos:
    data1 = rows["name"]  # repository name
    print data1
```
My repositories:
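The API response carries much more than names; each repo also includes a created_at timestamp, for example. As a sketch (assuming python-dateutil is installed; created_at is a real field in the GitHub API response), you could tally repos by creation month and weekday:

```python
from collections import Counter
from dateutil.parser import parse

# parse each repo's creation time, then count by month and weekday
dates = [parse(repo["created_at"]) for repo in repos]
month_counts = Counter(date.month for date in dates)
weekday_counts = Counter(date.weekday() for date in dates)
print month_counts
print weekday_counts
```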
Note: class notes from 2017/04/27 for the course Analysis and Design of Computational Methods.