如何处理大型数据集——计算量太大,难以顺利完成
我正在尝试找出距离给定经纬度最近的火车站。
# import requests
import json
# import matplotlib.pyplot as plt
# import seaborn as sns
# from pandasdmx import Request
# from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import pandas as pd
import numpy as np
import geopandas as gpd
import geopy.distance
import time
import csv
# import sys,os
# from sqlalchemy import create_engine
# import pymssql
# import sqlalchemy as db
# from functools import reduce
# import folium # map rendering library
# from sklearn.neighbors import NearestNeighbors
# Find, for every (postcode, suburb) group in the property data, the train
# station with the smallest geodesic distance to any property in that group,
# then write the result to disk and print it.

# Load the property records and the metro train station shapefile.
property_data_set = pd.read_csv('prop_data_set.csv')
shape_file_trains = "./shapefile/ptv_metro_train_station.shp"
trains_shape = gpd.read_file(shape_file_trains)
mwarp = property_data_set

# Cartesian product so we get all property/station combinations: both sides
# receive the same constant key "foo" and are joined on it.
pairs = mwarp.assign(foo=1).merge(trains_shape.assign(foo=1), on="foo")

# Geodesic distance in km for every pair, measured between the point
# (LATITUDE, LONGITUDE) and the point (lat, lon).
# Iterating over the four coordinate columns directly gives the same values
# as DataFrame.apply(axis=1) but avoids building a full Series object for
# every row of the cross join, which dominates the run time on large inputs.
pairs["km"] = [
    geopy.distance.geodesic((lat_a, lon_a), (lat_b, lon_b)).km
    for lat_a, lon_a, lat_b, lon_b in zip(
        pairs["LATITUDE"], pairs["LONGITUDE"], pairs["lat"], pairs["lon"]
    )
]

dfdist = (
    pairs
    # Reduce the number of columns to make the table more digestible.
    # "suburb" must be kept here: it is a sort key below and a grouping key
    # for dfnearest, so dropping it raises KeyError.
    .loc[:, ["postcode", "suburb", "address_street_full",
             "STOP_ID", "STOP_NAME", "km"]]
    # Sort so the shortest distance within each (postcode, suburb) is first.
    .sort_values(["postcode", "suburb", "km"])
    # Discard the pre-sort row labels.
    .reset_index(drop=True)
)

# The wide cross-join frame is no longer needed; release it to lower peak
# memory before the aggregation step.
del pairs

# Finally pick out the station nearest to each suburb: because dfdist is
# sorted by km within each group, "first" selects the minimum-distance row.
# This can be joined back to the source data frames, as postcode and STOP_ID
# have been maintained.
dfnearest = (
    dfdist.groupby(["postcode", "suburb"])
    .agg({"STOP_ID": "first", "STOP_NAME": "first", "km": "first"})
    .reset_index()
)
# print(dfnearest.to_string(index=False))
dfnearest.to_csv("distances_station")
print(dfnearest)
这是我目前使用的代码。由于数据有 11,000 行,我的计算机很难完成这项分析。我想分批处理这些数据,但不能 100% 确定哪种方法最好。有没有人做过类似的事情,知道最佳的做法?谢谢
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。