Hi Julien,
On 17:27 Sun 24 May , Julien Cristau wrote:
> Assuming this is tested in a jessie environment, please go ahead and
> upload.
>
Thank you for taking the time to review the changes.
Unfortunately, 2.12.3 still suffers from a serious bug present throughout the
2.12.x series that I would like to fix in jessie if possible. It's
#784620, which made our production cluster very difficult to work with
unless we disabled the watcher and ran gnt-cluster verify only sparingly.
After a maintenance round with upstream, this was fixed in 2.12.4,
together with:
- Ganeti Issue #1082: RAPI is unresponsive after master-failover
(https://code.google.com/p/ganeti/issues/detail?id=1082)
- Ganeti Issue #1083: Cluster verify reports existing instance disks on
non-default VGs as missing
(https://code.google.com/p/ganeti/issues/detail?id=1083)
- Fixed a possible file descriptor leak when forking jobs
- Fixed missing private parameters in the environment for OS scripts
Thus, if it's not too much to ask, I would like to upload
2.12.4-1~deb8u1 to jessie-p-u instead. 2.12.4-1 is already in stretch and
jessie-backports (but I'm planning to update sid to 2.13 soon-ish).
Full source interdiff since 2.12.3-1~deb8u1 attached. To ease review,
I'm also attaching a diff with the actual code changes only, excluding
the following:
doc/*
- Version bump and an update on a DRBD + Xen issue.
Makefile.am | 54 +++++++++++++++---
Makefile.in | 109 ++++++++++++++++++++++++-------------
configure | 41 +++++++++----
configure.ac | 6 +-
- GHC 7.8 compatibility. This was a patch in 2.12.3-1.
NEWS | 26 ++++++++
vcs-version | 2
- Version bump
test/py/cmdlib/cluster_unittest.py | 14 +++-
test/py/ganeti.backend_unittest.py | 35 +++++++++++
- Additional tests for orphan volumes plus some fixes in the test suite. We
don't run the test suite currently.
What remains is:
lib/backend.py | 5 +
lib/cmdlib/cluster.py | 11 +++
lib/config.py | 7 --
src/Ganeti/JQueue.hs | 5 +
src/Ganeti/Luxi.hs | 11 +++
src/Ganeti/Metad/ConfigServer.hs | 16 ++++-
src/Ganeti/Query/Exec.hs | 8 ++
src/Ganeti/UDSServer.hs | 9 +--
src/Ganeti/Utils.hs | 20 ++++--
src/Ganeti/WConfd/Server.hs | 13 ++++
As for the testing part, I've been running 2.12.4-1 in production
(including on Jessie systems) without issues.
Thanks,
Apollon
--- ganeti-2.12.3/debian/changelog 2015-04-29 18:13:02.000000000 +0300
+++ ganeti-2.12.4/debian/changelog 2015-05-25 11:16:24.000000000 +0300
@@ -1,4 +1,4 @@
-ganeti (2.12.3-0+deb8u1) jessie; urgency=medium
+ganeti (2.12.4-1~deb8u1) jessie; urgency=medium
[ Apollon Oikonomopoulos ]
* New upstream bugfix release (see /usr/share/doc/ganeti/NEWS.gz):
@@ -42,6 +42,13 @@
(> 255) minor numbers (closes: #782073)
+ Fix Luxid failure when DNS returns an IPv6 address that does not
reverse resolve
+
+ Fixes in 2.12.4:
+ + Fix a performance regression in 2.12 during gnt-cluster verify and
+ gnt-cluster verify-disks (high CPU usage) (closes: #784620).
+ + Make the RAPI responsive after master-failover.
+ + Fix gnt-cluster verify reporting existing instance disks on
+ non-default VGs as missing.
* Drop fix-wconfd-metad patch, merged upstream.
* d/copyright: adjust copyright years
@@ -53,7 +60,7 @@
* Dutch (Frans Spiesschaert, closes: #765856)
* Swedish (Martin Bagge, closes: #769870)
- -- Apollon Oikonomopoulos <apoikos@debian.org> Wed, 29 Apr 2015 14:06:45 +0300
+ -- Apollon Oikonomopoulos <apoikos@debian.org> Sun, 24 May 2015 20:07:22 +0300
ganeti (2.12.0-3) unstable; urgency=medium
--- ganeti-2.12.3/lib/backend.py 2015-04-29 11:09:35.000000000 +0300
+++ ganeti-2.12.4/lib/backend.py 2015-05-12 12:10:35.000000000 +0300
@@ -1094,7 +1094,7 @@
if constants.NV_LVLIST in what and vm_capable:
try:
- val = GetVolumeList([what[constants.NV_LVLIST]])
+ val = GetVolumeList(utils.ListVolumeGroups().keys())
except RPCFail, err:
val = str(err)
result[constants.NV_LVLIST] = val
@@ -3234,7 +3234,8 @@
cannot be found
"""
- result = OSCoreEnv(instance.os, inst_os, instance.osparams, debug=debug)
+ result = OSCoreEnv(instance.os, inst_os, objects.FillDict(instance.osparams,
+ instance.osparams_private.Unprivate()), debug=debug)
for attr in ["name", "os", "uuid", "ctime", "mtime", "primary_node"]:
result["INSTANCE_%s" % attr.upper()] = str(getattr(instance, attr))
--- ganeti-2.12.3/lib/cmdlib/cluster.py 2015-04-29 11:09:35.000000000 +0300
+++ ganeti-2.12.4/lib/cmdlib/cluster.py 2015-05-12 12:10:35.000000000 +0300
@@ -2621,12 +2621,15 @@
"instance lives on non-vm_capable node %s",
self.cfg.GetNodeName(node_uuid))
- def _VerifyOrphanVolumes(self, node_vol_should, node_image, reserved):
+ def _VerifyOrphanVolumes(self, vg_name, node_vol_should, node_image,
+ reserved):
"""Verify if there are any unknown volumes in the cluster.
The .os, .swap and backup volumes are ignored. All other volumes are
reported as unknown.
+ @type vg_name: string
+ @param vg_name: the name of the Ganeti-administered volume group
@type reserved: L{ganeti.utils.FieldSet}
@param reserved: a FieldSet of reserved volume names
@@ -2637,6 +2640,10 @@
# skip non-healthy nodes
continue
for volume in n_img.volumes:
+ # skip volumes not belonging to the ganeti-administered volume group
+ if volume.split('/')[0] != vg_name:
+ continue
+
test = ((node_uuid not in node_vol_should or
volume not in node_vol_should[node_uuid]) and
not reserved.Matches(volume))
@@ -3757,7 +3764,7 @@
self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)
break
- self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
+ self._VerifyOrphanVolumes(vg_name, node_vol_should, node_image, reserved)
if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
feedback_fn("* Verifying N+1 Memory redundancy")
--- ganeti-2.12.3/lib/config.py 2015-04-29 11:09:35.000000000 +0300
+++ ganeti-2.12.4/lib/config.py 2015-05-12 12:10:35.000000000 +0300
@@ -2824,13 +2824,10 @@
try:
if dict_data is not None:
self._SetConfigData(objects.ConfigData.FromDict(dict_data))
+ self._UpgradeConfig()
except Exception, err:
raise errors.ConfigurationError(err)
- # Transitional fix until ConfigWriter is completely rewritten into
- # Haskell
- self._UpgradeConfig()
-
def _CloseConfig(self, save):
"""Release resources relating the config data.
@@ -2844,7 +2841,7 @@
logging.critical("Can't write the configuration: %s", str(err))
raise
finally:
- if not self._offline:
+ if not self._offline and not self._lock_current_shared:
try:
self._wconfd.UnlockConfig(self._GetWConfdContext())
except AttributeError:
--- ganeti-2.12.3/src/Ganeti/JQueue.hs 2015-04-29 11:09:35.000000000 +0300
+++ ganeti-2.12.4/src/Ganeti/JQueue.hs 2015-05-12 12:10:35.000000000 +0300
@@ -123,6 +123,7 @@
import Ganeti.Query.Exec as Exec
import Ganeti.Rpc (executeRpcCall, ERpcError, logRpcErrors,
RpcCallJobqueueUpdate(..), RpcCallJobqueueRename(..))
+import Ganeti.Runtime (GanetiDaemon(..), GanetiGroup(..), MiscGroup(..))
import Ganeti.Types
import Ganeti.Utils
import Ganeti.Utils.Atomic
@@ -643,8 +644,8 @@
-- | Permissions for the archive directories.
queueDirPermissions :: FilePermissions
-queueDirPermissions = FilePermissions { fpOwner = Just C.masterdUser
- , fpGroup = Just C.daemonsGroup
+queueDirPermissions = FilePermissions { fpOwner = Just GanetiMasterd
+ , fpGroup = Just $ ExtraGroup DaemonsGroup
, fpPermissions = 0o0750
}
--- ganeti-2.12.3/src/Ganeti/Luxi.hs 2015-04-29 11:09:35.000000000 +0300
+++ ganeti-2.12.4/src/Ganeti/Luxi.hs 2015-05-12 12:10:35.000000000 +0300
@@ -76,10 +76,11 @@
import Ganeti.OpParams (pTagsObject)
import Ganeti.OpCodes
import qualified Ganeti.Query.Language as Qlang
-import Ganeti.Runtime (GanetiDaemon(..))
+import Ganeti.Runtime (GanetiDaemon(..), GanetiGroup(..), MiscGroup(..))
import Ganeti.THH
import Ganeti.THH.Field
import Ganeti.Types
+import Ganeti.Utils
-- | Currently supported Luxi operations and JSON serialization.
@@ -181,7 +182,13 @@
luxiConnectConfig :: ServerConfig
-luxiConnectConfig = ServerConfig GanetiLuxid
+luxiConnectConfig = ServerConfig
+ -- The rapi daemon talks to the luxi one, and for this
+ -- purpose we need group rw permissions.
+ FilePermissions { fpOwner = Just GanetiLuxid
+ , fpGroup = Just $ ExtraGroup DaemonsGroup
+ , fpPermissions = 0o0660
+ }
ConnectConfig { recvTmo = luxiDefRwto
, sendTmo = luxiDefRwto
}
--- ganeti-2.12.3/src/Ganeti/Metad/ConfigServer.hs 2015-04-28 15:47:28.000000000 +0300
+++ ganeti-2.12.4/src/Ganeti/Metad/ConfigServer.hs 2015-05-11 13:15:36.000000000 +0300
@@ -43,9 +43,10 @@
import Ganeti.Path as Path
import Ganeti.Daemon (DaemonOptions, cleanupSocket, describeError)
import qualified Ganeti.Logging as Logging
-import Ganeti.Runtime (GanetiDaemon(..))
+import Ganeti.Runtime (GanetiDaemon(..), GanetiGroup(..), MiscGroup(..))
import Ganeti.UDSServer (Client, ConnectConfig(..), Server, ServerConfig(..))
import qualified Ganeti.UDSServer as UDSServer
+import Ganeti.Utils (FilePermissions(..))
import Ganeti.Metad.Config as Config
import Ganeti.Metad.Types (InstanceParams)
@@ -100,4 +101,15 @@
(acceptClients config server)
(UDSServer.closeServer server)
where
- metadConfig = ServerConfig GanetiMetad $ ConnectConfig 60 60
+ metadConfig =
+ ServerConfig
+ -- The permission 0600 is completely acceptable because only the node
+ -- daemon talks to the metadata daemon, and the node daemon runs as
+ -- root.
+ FilePermissions { fpOwner = Just GanetiMetad
+ , fpGroup = Just $ ExtraGroup DaemonsGroup
+ , fpPermissions = 0o0600
+ }
+ ConnectConfig { recvTmo = 60
+ , sendTmo = 60
+ }
--- ganeti-2.12.3/src/Ganeti/Query/Exec.hs 2015-04-29 11:09:35.000000000 +0300
+++ ganeti-2.12.4/src/Ganeti/Query/Exec.hs 2015-05-12 12:10:35.000000000 +0300
@@ -62,6 +62,7 @@
import Control.Concurrent (rtsSupportsBoundThreads)
import Control.Concurrent.Lifted (threadDelay)
+import Control.Exception (finally)
import Control.Monad
import Control.Monad.Error
import Data.Functor
@@ -194,8 +195,9 @@
forkWithPipe :: ConnectConfig -> (Client -> IO ()) -> IO (ProcessID, Client)
forkWithPipe conf childAction = do
(master, child) <- pipeClient conf
- pid <- forkProcess (closeClient master >> childAction child)
- closeClient child
+ pid <- finally
+ (forkProcess (closeClient master >> childAction child))
+ $ closeClient child
return (pid, master)
-- | Forks the job process and starts processing of the given job.
@@ -273,4 +275,6 @@
_ <- recv "Waiting for the job to ask for the lock file name"
send "Writing the lock file name to the client" lockfile
+ liftIO $ closeClient master
+
return (lockfile, pid)
--- ganeti-2.12.3/src/Ganeti/UDSServer.hs 2015-04-22 20:10:50.000000000 +0300
+++ ganeti-2.12.4/src/Ganeti/UDSServer.hs 2015-05-11 13:15:36.000000000 +0300
@@ -98,7 +98,6 @@
import Ganeti.Errors (GanetiException(..), ErrorResult)
import Ganeti.JSON
import Ganeti.Logging
-import Ganeti.Runtime (GanetiDaemon(..), MiscGroup(..), GanetiGroup(..))
import Ganeti.THH
import Ganeti.Utils
import Ganeti.Constants (privateParametersBlacklist)
@@ -143,7 +142,7 @@
-- Information required for creating a server connection.
data ServerConfig = ServerConfig
- { connDaemon :: GanetiDaemon
+ { connPermissions :: FilePermissions
, connConfig :: ConnectConfig
}
@@ -226,8 +225,10 @@
connectServer :: ServerConfig -> Bool -> FilePath -> IO Server
connectServer sconf setOwner path = do
s <- openServerSocket path
- when setOwner . setOwnerAndGroupFromNames path (connDaemon sconf) $
- ExtraGroup DaemonsGroup
+ when setOwner $ do
+ res <- ensurePermissions path (connPermissions sconf)
+ exitIfBad "Error - could not set socket properties" res
+
S.listen s 5 -- 5 is the max backlog
return Server { sSocket = s, sPath = path, serverConfig = connConfig sconf }
--- ganeti-2.12.3/src/Ganeti/Utils.hs 2015-04-29 11:09:35.000000000 +0300
+++ ganeti-2.12.4/src/Ganeti/Utils.hs 2015-05-12 12:10:35.000000000 +0300
@@ -122,7 +122,6 @@
import System.Exit
import System.Posix.Files
import System.Posix.IO
-import System.Posix.User
import System.Time
-- * Debug functions
@@ -694,8 +693,8 @@
-- directories and files. All parameters are optional, with nothing
-- meaning that the default value should be left untouched.
-data FilePermissions = FilePermissions { fpOwner :: Maybe String
- , fpGroup :: Maybe String
+data FilePermissions = FilePermissions { fpOwner :: Maybe GanetiDaemon
+ , fpGroup :: Maybe GanetiGroup
, fpPermissions :: FileMode
}
@@ -703,22 +702,29 @@
-- possibly ownerships, as required.
ensurePermissions :: FilePath -> FilePermissions -> IO (Result ())
ensurePermissions fpath perms = do
+ -- Fetch the list of entities
+ runtimeEnts <- runResultT getEnts
+ ents <- exitIfBad "Can't determine user/group ids" runtimeEnts
+
+ -- Get the existing file properties
eitherFileStatus <- try $ getFileStatus fpath
:: IO (Either IOError FileStatus)
+
+ -- And see if any modifications are needed
(flip $ either (return . Bad . show)) eitherFileStatus $ \fstat -> do
ownertry <- case fpOwner perms of
Nothing -> return $ Right ()
Just owner -> try $ do
- ownerid <- userID `liftM` getUserEntryForName owner
+ let ownerid = reUserToUid ents M.! owner
unless (ownerid == fileOwner fstat) $ do
- logDebug $ "Changing owner of " ++ fpath ++ " to " ++ owner
+ logDebug $ "Changing owner of " ++ fpath ++ " to " ++ show owner
setOwnerAndGroup fpath ownerid (-1)
grouptry <- case fpGroup perms of
Nothing -> return $ Right ()
Just grp -> try $ do
- groupid <- groupID `liftM` getGroupEntryForName grp
+ let groupid = reGroupToGid ents M.! grp
unless (groupid == fileGroup fstat) $ do
- logDebug $ "Changing group of " ++ fpath ++ " to " ++ grp
+ logDebug $ "Changing group of " ++ fpath ++ " to " ++ show grp
setOwnerAndGroup fpath (-1) groupid
let fp = fpPermissions perms
permtry <- if fileMode fstat == fp
--- ganeti-2.12.3/src/Ganeti/WConfd/Server.hs 2015-04-29 11:09:35.000000000 +0300
+++ ganeti-2.12.4/src/Ganeti/WConfd/Server.hs 2015-05-12 12:10:35.000000000 +0300
@@ -55,6 +55,7 @@
import Ganeti.UDSServer
import Ganeti.Errors (formatError)
import Ganeti.Runtime
+import Ganeti.Utils
import Ganeti.Utils.Livelock (mkLivelockFile)
import Ganeti.WConfd.ConfigState
import Ganeti.WConfd.ConfigVerify
@@ -109,7 +110,17 @@
return (s, dh)
serverConfig :: ServerConfig
-serverConfig = ServerConfig GanetiWConfd $ ConnectConfig 60 60
+serverConfig = ServerConfig
+ -- All the daemons that need to talk to WConfd should be
+ -- running as the same user - the former master daemon user.
+ FilePermissions { fpOwner = Just GanetiWConfd
+ , fpGroup = Just $ ExtraGroup DaemonsGroup
+ , fpPermissions = 0o0600
+ }
+ ConnectConfig { recvTmo = 60
+ , sendTmo = 60
+ }
+
-- | Main function.
main :: MainFn () PrepResult
Attachment:
ganeti_2.12.3-1~deb8u1_2.12.4-1~deb8u1.debdiff.gz
Description: application/gzip
Attachment:
signature.asc
Description: Digital signature