volume: detect and drop volumes with disk IO error

from Jethro in slack:
Is it possible to make the assign request a bit smarter? Currently I’m in a state where a disk has failed but all assign requests are still being sent to this volume. It would be cool if the master noticed this and stopped using this volume.
e=HTTP(http://x:8089/913,045a782b63176edf) not 200 but 500 Internal Server Error
Body={"size":740167,"error":"failed to write to local disk: write /mnt/v9/913.dat: input/output error","eTag":"ee4381e202212ff3aee647704c036689"}
e=HTTP(http://x:8089/913,045a782c90240077) not 200 but 500 Internal Server Error
Body={"size":792779,"error":"failed to write to local disk: write /mnt/v9/913.dat: input/output error","eTag":"c43463ccc11eb6eb2fc306f407a6a953"}
e=HTTP(http://x:8089/913,045a782e6b7901ea) not 200 but 500 Internal Server Error
Body={"size":3962392,"error":"failed to write to local disk: write /mnt/v9/913.dat: input/output error","eTag":"04c91198e9b276c81f11dbf189af5d28"}
This commit is contained in:
Chris Lu
2020-11-28 00:09:29 -08:00
parent 9ac4935f22
commit 2c913dde04
3 changed files with 31 additions and 5 deletions

View File

@@ -221,7 +221,12 @@ func (s *Store) CollectHeartbeat() *master_pb.Heartbeat {
if v.expiredLongEnough(MAX_TTL_VOLUME_REMOVAL_DELAY) {
deleteVids = append(deleteVids, v.Id)
} else {
glog.V(0).Infoln("volume", v.Id, "is expired.")
glog.V(0).Infoln("volume %d is expired", v.Id)
}
if v.lastIoError != nil {
deleteVids = append(deleteVids, v.Id)
} else {
glog.Warningf("volume %d has IO error", v.Id)
}
}
collectionVolumeSize[v.Collection] += volumeMessage.Size