volume: detect and drop volumes with disk IO error

From Jethro in Slack: Is it possible to make the assign request a bit smarter? Currently I’m in a state where a disk has failed but all assign requests are still being sent to this volume. It would be cool if the master saw this and stopped using this volume. e=HTTP(http://x:8089/913,045a782b63176edf) not 200 but 500 Internal Server Error Body={"size":740167,"error":"failed to write to local disk: write /mnt/v9/913.dat: input/output error","eTag":"ee4381e202212ff3aee647704c036689"} e=HTTP(http://x:8089/913,045a782c90240077) not 200 but 500 Internal Server Error Body={"size":792779,"error":"failed to write to local disk: write /mnt/v9/913.dat: input/output error","eTag":"c43463ccc11eb6eb2fc306f407a6a953"} e=HTTP(http://x:8089/913,045a782e6b7901ea) not 200 but 500 Internal Server Error Body={"size":3962392,"error":"failed to write to local disk: write /mnt/v9/913.dat: input/output error","eTag":"04c91198e9b276c81f11dbf189af5d28"}
2025-10-21 16:27:25 +08:00 · 2020-11-28 00:09:29 -08:00
parent 9ac4935f22
commit 2c913dde04
3 changed files with 31 additions and 5 deletions
--- a/weed/storage/store.go
+++ b/weed/storage/store.go
@@ -221,7 +221,12 @@ func (s *Store) CollectHeartbeat() *master_pb.Heartbeat {
 				if v.expiredLongEnough(MAX_TTL_VOLUME_REMOVAL_DELAY) {
 					deleteVids = append(deleteVids, v.Id)
 				} else {
-					glog.V(0).Infoln("volume", v.Id, "is expired.")
+					glog.V(0).Infoln("volume %d is expired", v.Id)
+				}
+				if v.lastIoError != nil {
+					deleteVids = append(deleteVids, v.Id)
+				} else {
+					glog.Warningf("volume %d has IO error", v.Id)
 				}
 			}
 			collectionVolumeSize[v.Collection] += volumeMessage.Size