Complete code
```python
import requests
from bs4 import BeautifulSoup
import re
from operator import itemgetter
import time
import random
import pandas as pd


def remove_col(arr, ith):
    # Drop column `ith` from a list of rows.
    itg = itemgetter(*filter((ith).__ne__, range(len(arr[0]))))
    return list(map(list, map(itg, arr)))


url = 'http://vip.stock.finance.sina.com.cn/quotes_service/api/json_v2.php/Market_Center.getHQNodes'
heads = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"}
resText = requests.get(url, headers=heads)
soup = BeautifulSoup(resText.content, features='lxml')
s = soup.text

print('\nShenwan level-1 categories:')
# The level-1 category list sits between the 'swhy' and 'sw1_hy' markers.
shw1 = s[s.find('swhy'):s.find('sw1_hy')]
shw1_cut = shw1[shw1.find('[['):shw1.find(']]')]
shw1_cut = re.sub(r'\[', '', shw1_cut)
shw1_cut = re.sub(r'"', '', shw1_cut)
shw1_list = shw1_cut.split(']')
shw1_list_split = []
for i in range(0, len(shw1_list)):
    item_split = shw1_list[i].split(',')
    if i == 0:
        # Category names arrive as \uXXXX escape sequences; decode them.
        item_split[0] = item_split[0].encode('utf-8').decode('unicode_escape')
    else:
        item_split[1] = item_split[1].encode('utf-8').decode('unicode_escape')
        item_split = item_split[1:4]
    shw1_list_split.append(item_split)
result_shw1 = remove_col(shw1_list_split, 1)
print()
print('Number of Shenwan level-1 categories:', len(result_shw1))
print(result_shw1)
print()

# Shenwan level-1 categories and the stocks under each category
print('Shenwan level-1 categories and their stocks')
shw1_category_and_stocks = []
shw1_categorystock = []
for i in range(0, len(result_shw1)):
    s2 = ''
    page_i = 1
    while True:
        # Example: https://vip.stock.finance.sina.com.cn/quotes_service/api/json_v2.php/Market_Center.getHQNodeData?page=1&num=500&sort=symbol&asc=1&node=sw1_270000&symbol=&_s_r_a=init
        url2 = ('http://vip.stock.finance.sina.com.cn/quotes_service/api/json_v2.php/Market_Center.getHQNodeData?page='
                + str(page_i) + '&num=200&sort=symbol&asc=1&node='
                + result_shw1[i][1][0:11] + '&symbol=&_s_r_a=init')
        print(i, result_shw1[i][0], result_shw1[i][1][0:11])
        resText2 = requests.get(url2, headers=heads)
        soup2 = BeautifulSoup(resText2.content, features='lxml')
        if len(soup2.text) > 10:   # a near-empty body means there are no more pages
            s2 = s2 + soup2.text
            page_i = page_i + 1
        else:
            break
    print('------------------------------------------------------')
    # Strip the JSON-ish brackets and split the page text into per-stock records.
    resStr2 = re.sub(r'\[', '', s2)
    resStr2 = re.sub(r'\]', '', resStr2)
    resStr2 = re.sub(r'{', '', resStr2)
    resStr2_list = resStr2.split('}')
    resStr2_list.pop()  # drop the empty trailing element produced by split
    shw_one_stocks = []
    for j in range(0, len(resStr2_list)):
        singlestock_info = resStr2_list[j].split(',')
        rst = [ss.split(':') for ss in singlestock_info]
        if len(singlestock_info) == 20:
            shw_one_stocks.append([rst[0][1], rst[1][1],
                                   rst[2][1].encode('utf-8').decode('unicode_escape')])
            shw1_categorystock.append([result_shw1[i][0],
                                       result_shw1[i][1],
                                       rst[0][1][1:-1],   # market code + stock code
                                       rst[1][1][1:-1],   # stock code
                                       rst[2][1][1:-1].encode('utf-8').decode('unicode_escape'),
                                       rst[-15][1],                  # changepercent
                                       round(float(rst[-3][1]), 2),  # total market cap
                                       round(float(rst[-2][1]), 2),  # negotiable market cap
                                       rst[-1][1]])                  # turnover rate
        else:
            # Some records carry one extra leading field; shift every index by one.
            shw_one_stocks.append([rst[1][1], rst[2][1],
                                   rst[3][1].encode('utf-8').decode('unicode_escape')])
            shw1_categorystock.append([result_shw1[i][0],
                                       result_shw1[i][1],
                                       rst[1][1][1:-1],
                                       rst[2][1][1:-1],
                                       rst[3][1][1:-1].encode('utf-8').decode('unicode_escape'),
                                       rst[-15][1],
                                       round(float(rst[-3][1]), 2),
                                       round(float(rst[-2][1]), 2),
                                       rst[-1][1]])
    shw1_category_and_stocks.append([[result_shw1[i][0], result_shw1[i][1]], shw_one_stocks])
    time.sleep(random.randint(1, 6))  # random pause so dense requests don't get the scraper blocked

# Show the first 5 entries of each result structure.
for i in range(0, 5):  # up to len(shw1_category_and_stocks)
    print(shw1_category_and_stocks[i][0])
    print(shw1_category_and_stocks[i][1])
    print()
print()
for i in range(0, 5):  # up to len(shw1_categorystock)
    print(shw1_categorystock[i])
print()
print('Number of Shenwan level-1 categories:', len(result_shw1))
print('Number of category-stock records:', len(shw1_categorystock))

# Write the Shenwan level-1 classification data to CSV files.
shw1_category = [x[0][0] for x in shw1_category_and_stocks]
shw1_code = [x[0][1] for x in shw1_category_and_stocks]
dict1 = {'shw1_code': shw1_code, 'shw1_category': shw1_category}
df1 = pd.DataFrame(dict1)
df1.to_csv('shenwan1_category.csv', index=False)  # level-1 category file

shw1_category_code = [x[1] for x in shw1_categorystock]
shw1_category_name = [x[0] for x in shw1_categorystock]
shw1_category_mktcode = [x[2] for x in shw1_categorystock]
shw1_stock_code = [x[3] for x in shw1_categorystock]
shw1_stock_name = [x[4] for x in shw1_categorystock]
shw1_stock_changepercent = [x[5] for x in shw1_categorystock]
stock_mktcap = [x[6] for x in shw1_categorystock]
stock_nmc = [x[7] for x in shw1_categorystock]
stock_hsl = [x[8] for x in shw1_categorystock]
dict2 = {'shw1_code': shw1_category_code, 'category_name': shw1_category_name,
         'category_mktcode': shw1_category_mktcode,
         'stock_code': shw1_stock_code, 'stock_name': shw1_stock_name,
         'stock_changepercent': shw1_stock_changepercent,
         'stock_mktcap': stock_mktcap, 'stock_nmc': stock_nmc, 'stock_hsl': stock_hsl}
df2 = pd.DataFrame(dict2)
df2.to_csv('shenwan1_category_stocks.csv', index=False)  # level-1 categories with their stocks
```
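Two small tricks in the script are worth sanity-checking in isolation: the remove_col helper and the unicode_escape round-trip used to decode Sina's escaped Chinese names. A minimal standalone sketch of my own (not from the original post), assuming the itemgetter(*filter(...)) form shown above:

```python
from operator import itemgetter

def remove_col(arr, ith):
    # Same helper as in the script: build an itemgetter over every
    # column index except `ith`, then apply it row by row.
    itg = itemgetter(*filter((ith).__ne__, range(len(arr[0]))))
    return list(map(list, map(itg, arr)))

print(remove_col([[1, 2, 3], [4, 5, 6]], 1))  # -> [[1, 3], [4, 6]]

# Sina delivers category and stock names as \uXXXX escape sequences;
# the encode/decode round-trip turns them back into readable text.
raw = r'\u94f6\u884c'  # an escaped name as it appears in the payload
print(raw.encode('utf-8').decode('unicode_escape'))  # -> 银行
```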
Partial output from a run of the code above: the scraping progress through the stocks of the 31 Shenwan level-1 categories, followed by each level-1 category and its stocks.

The Shenwan level-1 classification file, shenwan1_category.csv: (screenshot omitted)

The file of Shenwan level-1 categories and their stocks, shenwan1_category_stocks.csv, from which you can pull whichever attribute values you need (see the usage sketch at the end of this post): (screenshot omitted)

Its column headers (these names are my own definitions) are: shw1_code (Shenwan level-1 category code), category_name (category name), category_mktcode (market code plus stock code), stock_code (stock code), stock_name (stock name), stock_changepercent (price change in percent), stock_mktcap (total market capitalization), stock_nmc (negotiable market capitalization), stock_hsl (turnover rate).

The complete code for scraping the Shenwan level-2 classification from Sina Finance is in the article 《A股行业申万一级和二级分类(含抓取新浪财经的python代码)》.

End of this post. (A follow-up, 《板块分析2/2 - 如何根据板块成交额的日数据变化判断板块轮动》, will be published next.)
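Postscript: the usage sketch referenced above. This is a hypothetical example of mine, not part of the original scraping code; it assumes shenwan1_category_stocks.csv was produced by the script with the column names listed above, and the per-category aggregation is only an illustration:

```python
import pandas as pd

# Read the generated file; keep stock_code as a string so leading zeros survive.
df = pd.read_csv('shenwan1_category_stocks.csv', dtype={'stock_code': str})

# The turnover column may come back as text; coerce it to numeric just in case.
df['stock_hsl'] = pd.to_numeric(df['stock_hsl'], errors='coerce')

# Example: total and negotiable market cap summed per level-1 category.
per_cat = (df.groupby('category_name')[['stock_mktcap', 'stock_nmc']]
             .sum()
             .sort_values('stock_mktcap', ascending=False))
print(per_cat.head(10))
```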